diff --git a/.github/workflows/PR-5.x.yaml b/.github/workflows/PR-5.x.yaml index c2a3d150553..dcd3de86347 100644 --- a/.github/workflows/PR-5.x.yaml +++ b/.github/workflows/PR-5.x.yaml @@ -26,3 +26,6 @@ jobs: macOS-X64: uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-macOS-x86_64.yaml@main + + Linux-RISC-V-Clang: + uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-RISCV.yaml@main diff --git a/modules/cudaarithm/test/test_event.cpp b/modules/cudaarithm/test/test_event.cpp index 375c51d0d35..ffe0f7b681e 100644 --- a/modules/cudaarithm/test/test_event.cpp +++ b/modules/cudaarithm/test/test_event.cpp @@ -91,7 +91,7 @@ CUDA_TEST_P(AsyncEvent, Timing) const double elTimeMs = Event::elapsedTime(startEvent, stopEvent); ASSERT_GT(elTimeMs, 0); } - catch (cv::Exception ex) { + catch (const cv::Exception& ex) { failed = true; } ASSERT_EQ(failed, shouldFail.at(i)); diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index 42325c64613..d6421c2b8a4 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp +++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -184,18 +184,18 @@ struct CV_EXPORTS_W_SIMPLE EncoderParams public: CV_WRAP EncoderParams() : nvPreset(ENC_PRESET_P3), tuningInfo(ENC_TUNING_INFO_HIGH_QUALITY), encodingProfile(ENC_CODEC_PROFILE_AUTOSELECT), rateControlMode(ENC_PARAMS_RC_VBR), multiPassEncoding(ENC_MULTI_PASS_DISABLED), constQp({ 0,0,0 }), averageBitRate(0), maxBitRate(0), - targetQuality(30), gopLength(0) {}; - + targetQuality(30), gopLength(250), idrPeriod(250) {}; CV_PROP_RW EncodePreset nvPreset; CV_PROP_RW EncodeTuningInfo tuningInfo; CV_PROP_RW EncodeProfile encodingProfile; CV_PROP_RW EncodeParamsRcMode rateControlMode; CV_PROP_RW EncodeMultiPass multiPassEncoding; - CV_PROP_RW EncodeQp constQp; //!< QP's for ENC_PARAMS_RC_CONSTQP. - CV_PROP_RW int averageBitRate; //!< target bitrate for ENC_PARAMS_RC_VBR and ENC_PARAMS_RC_CBR. 
- CV_PROP_RW int maxBitRate; //!< upper bound on bitrate for ENC_PARAMS_RC_VBR and ENC_PARAMS_RC_CONSTQP. - CV_PROP_RW uint8_t targetQuality; //!< value 0 - 51 where video quality decreases as targetQuality increases, used with ENC_PARAMS_RC_VBR. - CV_PROP_RW int gopLength; + CV_PROP_RW EncodeQp constQp; //!< QP's for \ref ENC_PARAMS_RC_CONSTQP. + CV_PROP_RW int averageBitRate; //!< target bitrate for \ref ENC_PARAMS_RC_VBR and \ref ENC_PARAMS_RC_CBR. + CV_PROP_RW int maxBitRate; //!< upper bound on bitrate for \ref ENC_PARAMS_RC_VBR and \ref ENC_PARAMS_RC_CONSTQP. + CV_PROP_RW uint8_t targetQuality; //!< value 0 - 51 where video quality decreases as targetQuality increases, used with \ref ENC_PARAMS_RC_VBR. + CV_PROP_RW int gopLength; //!< the number of pictures in one GOP, ensuring \ref idrPeriod >= \ref gopLength. + CV_PROP_RW int idrPeriod; //!< IDR interval, ensuring \ref idrPeriod >= \ref gopLength. }; CV_EXPORTS bool operator==(const EncoderParams& lhs, const EncoderParams& rhs); @@ -209,7 +209,7 @@ class CV_EXPORTS_W EncoderCallback { @param vPacket The raw bitstream for one or more frames. */ - virtual void onEncoded(std::vector<std::vector<uint8_t>> vPacket) = 0; + virtual void onEncoded(const std::vector<std::vector<uint8_t>>& vPacket) = 0; /** @brief Callback function to that the encoding has finished. * */ @@ -218,14 +218,14 @@ class CV_EXPORTS_W EncoderCallback { virtual ~EncoderCallback() {} }; -/** @brief Video writer interface. +/** @brief Video writer interface, see createVideoWriter(). -Available when built with WITH_NVCUVENC=ON while Nvidia's Video Codec SDK is installed. +Available if Nvidia's Video Codec SDK is installed. -Encoding support is dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. +Only Codec::H264 and Codec::HEVC are supported with encoding support dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. 
@note - - An example on how to use the videoWriter class can be found at + - An example on how to use the VideoWriter class can be found at opencv_source_code/samples/gpu/video_writer.cpp */ class CV_EXPORTS_W VideoWriter @@ -253,9 +253,9 @@ class CV_EXPORTS_W VideoWriter /** @brief Creates video writer. -@param fileName Name of the output video file. Only raw h264 or hevc files are supported. +@param fileName Name of the output video file. @param frameSize Size of the input video frames. -@param codec Codec. +@param codec Supports Codec::H264 and Codec::HEVC. @param fps Framerate of the created video stream. @param colorFormat OpenCv color format of the frames to be encoded. @param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallback. Required for working with the encoded video stream. @@ -266,9 +266,9 @@ CV_EXPORTS_W Ptr createVideoWriter(const String& fileNam /** @brief Creates video writer. -@param fileName Name of the output video file. Only raw h264 or hevc files are supported. +@param fileName Name of the output video file. @param frameSize Size of the input video frames. -@param codec Codec. +@param codec Supports Codec::H264 and Codec::HEVC. @param fps Framerate of the created video stream. @param colorFormat OpenCv color format of the frames to be encoded. @param params Additional encoding parameters. @@ -361,14 +361,14 @@ enum class VideoReaderProps { #endif }; -/** @brief Video reader interface. +/** @brief Video reader interface, see createVideoReader(). -Available when built with WITH_NVCUVID=ON while Nvidia's Video Codec SDK is installed. +Available if Nvidia's Video Codec SDK is installed. Decoding support is dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. 
@note - - An example on how to use the videoReader class can be found at + - An example on how to use the VideoReader interface can be found at opencv_source_code/samples/gpu/video_reader.cpp */ class CV_EXPORTS_W VideoReader diff --git a/modules/cudacodec/src/NvEncoder.cpp b/modules/cudacodec/src/NvEncoder.cpp index 249f6f1c61e..aa9d2a67c17 100644 --- a/modules/cudacodec/src/NvEncoder.cpp +++ b/modules/cudacodec/src/NvEncoder.cpp @@ -7,16 +7,6 @@ #include "NvEncoder.h" namespace cv { namespace cudacodec { -#ifndef _WIN32 -#include <cstring> -static inline bool operator==(const GUID& guid1, const GUID& guid2) { - return !memcmp(&guid1, &guid2, sizeof(GUID)); -} - -static inline bool operator!=(const GUID& guid1, const GUID& guid2) { - return !(guid1 == guid2); -} -#endif NvEncoder::NvEncoder(NV_ENC_DEVICE_TYPE eDeviceType, void* pDevice, uint32_t nWidth, uint32_t nHeight, NV_ENC_BUFFER_FORMAT eBufferFormat, uint32_t nExtraOutputDelay) : diff --git a/modules/cudacodec/src/NvEncoder.h b/modules/cudacodec/src/NvEncoder.h index dd13d2c1501..c5a53712e14 100644 --- a/modules/cudacodec/src/NvEncoder.h +++ b/modules/cudacodec/src/NvEncoder.h @@ -15,6 +15,17 @@ namespace cv { namespace cudacodec { +#ifndef _WIN32 +#include <cstring> + static inline bool operator==(const GUID& guid1, const GUID& guid2) { + return !memcmp(&guid1, &guid2, sizeof(GUID)); + } + + static inline bool operator!=(const GUID& guid1, const GUID& guid2) { + return !(guid1 == guid2); + } +#endif + #define NVENC_THROW_ERROR( errorStr, errorCode ) \ do \ { \ diff --git a/modules/cudacodec/src/video_writer.cpp b/modules/cudacodec/src/video_writer.cpp index db3e2e36306..8b5c703f759 100644 --- a/modules/cudacodec/src/video_writer.cpp +++ b/modules/cudacodec/src/video_writer.cpp @@ -59,7 +59,6 @@ GUID CodecGuid(const Codec codec); void FrameRate(const double fps, uint32_t& frameRateNum, uint32_t& frameRateDen); GUID EncodingProfileGuid(const EncodeProfile encodingProfile); GUID EncodingPresetGuid(const EncodePreset nvPreset); 
-bool Equal(const GUID& g1, const GUID& g2); bool operator==(const EncoderParams& lhs, const EncoderParams& rhs) { @@ -68,12 +67,48 @@ bool operator==(const EncoderParams& lhs, const EncoderParams& rhs) rhs.averageBitRate, rhs.maxBitRate, rhs.targetQuality, rhs.gopLength); }; +class FFmpegVideoWriter : public EncoderCallback +{ +public: + FFmpegVideoWriter(const String& fileName, const Codec codec, const int fps, const Size sz, const int idrPeriod); + ~FFmpegVideoWriter(); + void onEncoded(const std::vector<std::vector<uint8_t>>& vPacket); + void onEncodingFinished(); +private: + cv::VideoWriter writer; +}; + +FFmpegVideoWriter::FFmpegVideoWriter(const String& fileName, const Codec codec, const int fps, const Size sz, const int idrPeriod) { + if (!videoio_registry::hasBackend(CAP_FFMPEG)) + CV_Error(Error::StsNotImplemented, "FFmpeg backend not found"); + const int fourcc = codec == Codec::H264 ? cv::VideoWriter::fourcc('a', 'v', 'c', '1') : cv::VideoWriter::fourcc('h', 'e', 'v', '1'); + writer.open(fileName, fourcc, fps, sz, { VideoWriterProperties::VIDEOWRITER_PROP_RAW_VIDEO, 1, VideoWriterProperties::VIDEOWRITER_PROP_KEY_INTERVAL, idrPeriod }); + if (!writer.isOpened()) + CV_Error(Error::StsUnsupportedFormat, "Unsupported video sink"); +} + +void FFmpegVideoWriter::onEncodingFinished() { + writer.release(); +} + +FFmpegVideoWriter::~FFmpegVideoWriter() { + onEncodingFinished(); +} + +void FFmpegVideoWriter::onEncoded(const std::vector<std::vector<uint8_t>>& vPacket) { + for (auto& packet : vPacket) { + Mat wrappedPacket(1, packet.size(), CV_8UC1, (void*)packet.data()); + writer.write(wrappedPacket); + } +} + + class RawVideoWriter : public EncoderCallback { public: - RawVideoWriter(const String fileName); + RawVideoWriter(const String fileName); - void onEncoded(std::vector<std::vector<uint8_t>> vPacket); + void onEncoded(const std::vector<std::vector<uint8_t>>& vPacket); void onEncodingFinished(); private: std::ofstream fpOut; @@ -93,9 +128,9 @@ RawVideoWriter::~RawVideoWriter() { onEncodingFinished(); } -void 
RawVideoWriter::onEncoded(std::vector<std::vector<uint8_t>> vPacket) { +void RawVideoWriter::onEncoded(const std::vector<std::vector<uint8_t>>& vPacket) { for (auto& packet : vPacket) - fpOut.write(reinterpret_cast<char*>(packet.data()), packet.size()); + fpOut.write(reinterpret_cast<const char*>(packet.data()), packet.size()); } class VideoWriterImpl : public VideoWriter @@ -172,12 +207,6 @@ VideoWriterImpl::VideoWriterImpl(const Ptr<EncoderCallback>& encoderCallBack_, c Init(codec, fps, frameSz); } -VideoWriterImpl::VideoWriterImpl(const Ptr<EncoderCallback>& encoderCallback, const Size frameSz, const Codec codec, const double fps, - const ColorFormat colorFormat, const Stream& stream) : - VideoWriterImpl(encoderCallback, frameSz, codec, fps, colorFormat, EncoderParams(), stream) -{ -} - void VideoWriterImpl::release() { pEnc->EndEncode(vPacket); encoderCallback->onEncoded(vPacket); @@ -271,12 +300,6 @@ GUID EncodingPresetGuid(const EncodePreset nvPreset) { CV_Error(Error::StsUnsupportedFormat, msg); } -bool Equal(const GUID& g1, const GUID& g2) { - if (std::tie(g1.Data1, g1.Data2, g1.Data3, g1.Data4) == std::tie(g2.Data1, g2.Data2, g2.Data3, g2.Data4)) - return true; - return false; -} - void VideoWriterImpl::InitializeEncoder(const GUID codec, const double fps) { NV_ENC_INITIALIZE_PARAMS initializeParams = {}; @@ -293,10 +316,10 @@ void VideoWriterImpl::InitializeEncoder(const GUID codec, const double fps) initializeParams.encodeConfig->rcParams.maxBitRate = encoderParams.maxBitRate; initializeParams.encodeConfig->rcParams.targetQuality = encoderParams.targetQuality; initializeParams.encodeConfig->gopLength = encoderParams.gopLength; - if (Equal(codec, NV_ENC_CODEC_H264_GUID)) - initializeParams.encodeConfig->encodeCodecConfig.h264Config.idrPeriod = encoderParams.gopLength; - else if (Equal(codec, NV_ENC_CODEC_HEVC_GUID)) - initializeParams.encodeConfig->encodeCodecConfig.hevcConfig.idrPeriod = encoderParams.gopLength; + if (codec == NV_ENC_CODEC_H264_GUID) + initializeParams.encodeConfig->encodeCodecConfig.h264Config.idrPeriod = encoderParams.idrPeriod; 
+ else if (codec == NV_ENC_CODEC_HEVC_GUID) + initializeParams.encodeConfig->encodeCodecConfig.hevcConfig.idrPeriod = encoderParams.idrPeriod; pEnc->CreateEncoder(&initializeParams); } @@ -371,14 +394,25 @@ EncoderParams VideoWriterImpl::getEncoderParams() const { Ptr<cudacodec::VideoWriter> createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps, const ColorFormat colorFormat, Ptr<EncoderCallback> encoderCallback, const Stream& stream) { - encoderCallback = encoderCallback ? encoderCallback : new RawVideoWriter(fileName); - return makePtr<VideoWriterImpl>(encoderCallback, frameSize, codec, fps, colorFormat, stream); + return createVideoWriter(fileName, frameSize, codec, fps, colorFormat, EncoderParams(), encoderCallback, stream); } Ptr<cudacodec::VideoWriter> createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps, const ColorFormat colorFormat, const EncoderParams& params, Ptr<EncoderCallback> encoderCallback, const Stream& stream) { - encoderCallback = encoderCallback ? encoderCallback : new RawVideoWriter(fileName); + CV_Assert(params.idrPeriod >= params.gopLength); + if (!encoderCallback) { + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifndef WIN32 // remove #define and keep code once merged + try { + encoderCallback = new FFmpegVideoWriter(fileName, codec, fps, frameSize, params.idrPeriod); + } + catch (...) 
+#endif + { + encoderCallback = new RawVideoWriter(fileName); + } + } return makePtr<VideoWriterImpl>(encoderCallback, frameSize, codec, fps, colorFormat, params, stream); } diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index ead5fa944ca..45365dab230 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -639,7 +639,13 @@ CUDA_TEST_P(TransCode, H264ToH265) constexpr cv::cudacodec::ColorFormat colorFormat = cv::cudacodec::ColorFormat::NV_NV12; constexpr double fps = 25; const cudacodec::Codec codec = cudacodec::Codec::HEVC; - const std::string ext = ".h265"; + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 + const std::string ext = ".hevc"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); constexpr int nFrames = 5; Size frameSz; @@ -716,7 +722,13 @@ CUDA_TEST_P(Write, Writer) const cudacodec::Codec codec = GET_PARAM(2); const double fps = GET_PARAM(3); const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(4); + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 const std::string ext = codec == cudacodec::Codec::H264 ? 
".h264" : ".hevc"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); constexpr int nFrames = 5; Size frameSz; @@ -750,7 +762,7 @@ CUDA_TEST_P(Write, Writer) const int width = static_cast(cap.get(CAP_PROP_FRAME_WIDTH)); const int height = static_cast(cap.get(CAP_PROP_FRAME_HEIGHT)); ASSERT_EQ(frameSz, Size(width, height)); - ASSERT_TRUE(abs(fps - cap.get(CAP_PROP_FPS)) < 0.5); + ASSERT_EQ(fps, cap.get(CAP_PROP_FPS)); Mat frame; for (int i = 0; i < nFrames; ++i) { cap >> frame; @@ -761,24 +773,22 @@ CUDA_TEST_P(Write, Writer) } #define DEVICE_SRC true, false -#define FPS 10, 29.7 +#define FPS 10, 29 #define CODEC cv::cudacodec::Codec::H264, cv::cudacodec::Codec::HEVC #define COLOR_FORMAT cv::cudacodec::ColorFormat::BGR, cv::cudacodec::ColorFormat::RGB, cv::cudacodec::ColorFormat::BGRA, \ cv::cudacodec::ColorFormat::RGBA, cv::cudacodec::ColorFormat::GRAY INSTANTIATE_TEST_CASE_P(CUDA_Codec, Write, testing::Combine(ALL_DEVICES, testing::Values(DEVICE_SRC), testing::Values(CODEC), testing::Values(FPS), testing::Values(COLOR_FORMAT))); - -struct EncoderParams : testing::TestWithParam +PARAM_TEST_CASE(EncoderParams, cv::cuda::DeviceInfo, int) { cv::cuda::DeviceInfo devInfo; cv::cudacodec::EncoderParams params; virtual void SetUp() { - devInfo = GetParam(); + devInfo = GET_PARAM(0); cv::cuda::setDevice(devInfo.deviceID()); // Fixed params for CBR test - params.nvPreset = cv::cudacodec::EncodePreset::ENC_PRESET_P7; params.tuningInfo = cv::cudacodec::EncodeTuningInfo::ENC_TUNING_INFO_HIGH_QUALITY; params.encodingProfile = cv::cudacodec::EncodeProfile::ENC_H264_PROFILE_MAIN; params.rateControlMode = cv::cudacodec::EncodeParamsRcMode::ENC_PARAMS_RC_CBR; @@ -787,19 +797,25 @@ struct EncoderParams : testing::TestWithParam params.maxBitRate = 0; params.targetQuality = 0; params.gopLength = 5; + params.idrPeriod = GET_PARAM(1); } }; - CUDA_TEST_P(EncoderParams, Writer) { const std::string inputFile 
= std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; constexpr double fps = 25.0; constexpr cudacodec::Codec codec = cudacodec::Codec::H264; + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 const std::string ext = ".h264"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); Size frameSz; - constexpr int nFrames = 5; + const int nFrames = max(params.gopLength, params.idrPeriod) + 1; { cv::VideoCapture reader(inputFile); ASSERT_TRUE(reader.isOpened()); @@ -829,20 +845,36 @@ CUDA_TEST_P(EncoderParams, Writer) const int height = static_cast(cap.get(CAP_PROP_FRAME_HEIGHT)); ASSERT_EQ(frameSz, Size(width, height)); ASSERT_EQ(fps, cap.get(CAP_PROP_FPS)); - const bool checkGop = videoio_registry::hasBackend(CAP_FFMPEG); - Mat frame; + const bool checkFrameType = videoio_registry::hasBackend(CAP_FFMPEG); + VideoCapture capRaw; + int idrPeriod = 0; + if (checkFrameType) { + capRaw.open(outputFile, CAP_FFMPEG, { CAP_PROP_FORMAT, -1 }); + ASSERT_TRUE(capRaw.isOpened()); + idrPeriod = params.idrPeriod == 0 ? 
params.gopLength : params.idrPeriod; + } + const double frameTypeIAsciiCode = 73.0; // see CAP_PROP_FRAME_TYPE + Mat frame, frameRaw; for (int i = 0; i < nFrames; ++i) { cap >> frame; ASSERT_FALSE(frame.empty()); - if (checkGop && (cap.get(CAP_PROP_FRAME_TYPE) == 73)) { - ASSERT_TRUE(i % params.gopLength == 0); + if (checkFrameType) { + capRaw >> frameRaw; + ASSERT_FALSE(frameRaw.empty()); + const bool intraFrameReference = cap.get(CAP_PROP_FRAME_TYPE) == frameTypeIAsciiCode; + const bool intraFrameActual = i % params.gopLength == 0; + ASSERT_EQ(intraFrameActual, intraFrameReference); + const bool keyFrameActual = capRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.0; + const bool keyFrameReference = i % idrPeriod == 0; + ASSERT_EQ(keyFrameActual, keyFrameReference); } } } ASSERT_EQ(0, remove(outputFile.c_str())); } -INSTANTIATE_TEST_CASE_P(CUDA_Codec, EncoderParams, ALL_DEVICES); +#define IDR_PERIOD testing::Values(5,10) +INSTANTIATE_TEST_CASE_P(CUDA_Codec, EncoderParams, testing::Combine(ALL_DEVICES, IDR_PERIOD)); #endif // HAVE_NVCUVENC diff --git a/modules/datasets/src/tinyxml2/tinyxml2.h b/modules/datasets/src/tinyxml2/tinyxml2.h index 95ae3bcc057..89a16c65b75 100644 --- a/modules/datasets/src/tinyxml2/tinyxml2.h +++ b/modules/datasets/src/tinyxml2/tinyxml2.h @@ -212,7 +212,7 @@ template class DynArray { public: - DynArray< T, INIT >() { + DynArray() { _mem = _pool; _allocated = INIT; _size = 0; diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 8fde6e457c1..e51f8091cbe 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -296,7 +296,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * 
vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -304,35 +304,35 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_int16x8 diff[4] = { - ((v11 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v00 << 5) - vI) & vmask + v_and(v_sub(v_shl<5>(v11), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v00), vI), vmask) }; - diff0 = diff0 - vI; - diff0 = diff0 & vmask; + diff0 = v_sub(diff0, vI); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? 
- vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -342,7 +342,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -350,16 +350,16 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } if (j == 0) { @@ -387,8 +387,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -402,8 +402,8 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -544,7 +544,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; } @@ -960,7 +960,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -968,38 +968,38 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS 
- 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; - diff0 = diff0 + diff_value; - diff0 = diff0 & vmask; + diff0 = v_add(diff0, diff_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -1009,7 +1009,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -1017,22 +1017,22 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -1060,29 +1060,29 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + 
vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -1092,29 +1092,29 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } @@ -1304,7 +1304,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); @@ -1655,14 +1655,14 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = 
v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int16x8 diff[4] = { - ((v00 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v11 << 5) - vI) & vmask, + v_and(v_sub(v_shl<5>(v00), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v11), vI), vmask), }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... @@ -1672,8 +1672,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(diff[mmi], diff[mmi], diff1, diff0); v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff1, diff0, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } } #else @@ -1704,7 +1704,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b1[mmi] = bbuf[0] + bbuf[2]; _b2[mmi] = bbuf[1] + bbuf[3]; } @@ -2071,7 +2071,7 @@ namespace radial { v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -2079,21 +2079,21 @@ namespace radial { v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + 
v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -2109,14 +2109,14 @@ namespace radial { v_zip(diff[mmi], diff[mmi], diff2, diff1); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -2133,17 +2133,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -2158,17 +2158,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } @@ -2299,7 +2299,7 @@ namespace radial { float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + 
vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 5ea85de889e..71cf50c8205 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -229,7 +229,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -237,17 +237,17 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); v_int32x4 diff0_0; @@ -259,16 +259,16 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += 
v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -285,17 +285,17 @@ class TrackerInvoker : public cv::ParallelLoopBody vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -309,17 +309,17 @@ class TrackerInvoker : public cv::ParallelLoopBody vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } #else @@ -388,7 +388,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); @@ -696,19 +696,19 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = 
v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
@@ -717,8 +717,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) @@ -737,7 +737,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; #endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 9bee35fc6a3..5597d882491 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -246,35 +246,35 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + 
(vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -282,7 +282,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift); + diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift)); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... 
@@ -290,8 +290,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); if (j == 0) { v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1; @@ -316,8 +316,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -330,8 +330,8 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -431,7 +431,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif @@ -769,7 +769,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -777,33 +777,33 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + 
v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? 
sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -811,8 +811,8 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... @@ -821,16 +821,16 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -858,29 +858,29 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 
vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -890,29 +890,29 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } #else @@ -1017,7 +1017,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index c6f77f6d62c..2db4234ecd8 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -71,15 +71,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + 
x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border_0; + vmask0 = v_and(vmask0, vmask_border_0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border_1; + vmask1 = v_and(vmask1, vmask_border_1); } v_int32x4 t0, t1; @@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5) & vmask0; - t1 = t1 >> (W_BITS - 5) & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_and(v_shr(t0, W_BITS - 5), vmask0); + t1 = v_and(v_shr(t1, W_BITS - 5), vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2)); @@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); } #else @@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, #if CV_SIMD128 for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border0; + vmask0 = v_and(vmask0, vmask_border0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border1; + vmask1 = v_and(vmask1, vmask_border1); } v_int32x4 t0, t1; @@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - t0 = t0 & vmask0; - 
t1 = t1 & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + t0 = v_and(t0, vmask0); + t1 = v_and(t1, vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); @@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 6b88860e60e..756195891cf 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x * 2), t0, t1); } diff --git a/modules/rgbd/src/utils.hpp b/modules/rgbd/src/utils.hpp index 7b11a3d3f07..4e8572f5d1a 100644 --- a/modules/rgbd/src/utils.hpp +++ b/modules/rgbd/src/utils.hpp @@ -29,7 +29,7 @@ inline bool isNaN(cv::Point3f p) #if USE_INTRINSICS static inline bool isNaN(const cv::v_float32x4& p) { - return cv::v_check_any(p != p); + return cv::v_check_any(v_ne(p, p)); } #endif diff --git a/modules/sfm/src/libmv_light/libmv/numeric/numeric.h b/modules/sfm/src/libmv_light/libmv/numeric/numeric.h index 9e7927e0bbc..41a55634dcd 100644 --- a/modules/sfm/src/libmv_light/libmv/numeric/numeric.h +++ b/modules/sfm/src/libmv_light/libmv/numeric/numeric.h @@ -32,6 +32,7 @@ #include #include 
#include +#include #if !defined(__MINGW32__) # if defined(_WIN32) || defined(__APPLE__) || \ diff --git a/modules/ximgproc/src/anisodiff.cpp b/modules/ximgproc/src/anisodiff.cpp index 996b4ac5b77..2b230a71242 100644 --- a/modules/ximgproc/src/anisodiff.cpp +++ b/modules/ximgproc/src/anisodiff.cpp @@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1, v_expand_f32(c0, f0, f1); v_expand_f32(c1, f2, f3); - v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1)); - v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3)); + v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1))); + v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3))); return v_pack_u(d0, d1); } @@ -135,12 +135,12 @@ class ADBody : public ParallelLoopBody v_expand_s(p1, p10, p11); v_expand_s(p2, p20, p21); - v_int16x8 d00 = p00 - c00, d01 = p01 - c01; - v_int16x8 d10 = p10 - c10, d11 = p11 - c11; - v_int16x8 d20 = p20 - c20, d21 = p21 - c21; + v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01); + v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11); + v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21); - v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20); - v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21); + v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20)); + v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21)); ushort CV_DECL_ALIGNED(16) nbuf[16]; v_store(nbuf, n0); @@ -153,13 +153,13 @@ class ADBody : public ParallelLoopBody v_expand_f32(d00, fd0, fd1); v_expand_f32(d01, fd2, fd3); - s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3; + s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3)); v_expand_f32(d10, fd0, fd1); v_expand_f32(d11, fd2, fd3); - s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3; + s10 = 
v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3)); v_expand_f32(d20, fd0, fd1); v_expand_f32(d21, fd2, fd3); - s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += fd3*w3; + s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3)); } c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha); diff --git a/modules/ximgproc/src/fgs_filter.cpp b/modules/ximgproc/src/fgs_filter.cpp index 5e168da5dad..804e9f00a02 100644 --- a/modules/ximgproc/src/fgs_filter.cpp +++ b/modules/ximgproc/src/fgs_filter.cpp @@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i) v_float32x4 aux0,aux1,aux2,aux3; #define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\ - coef_cur_out = lambda_reg*Chor_in;\ - aux0 = interD_prev_in*coef_prev_in;\ - aux1 = coef_cur_out+coef_prev_in;\ - aux1 = one_reg-aux1;\ - aux0 = aux1-aux0;\ - interD_out = coef_cur_out/aux0;\ - aux1 = cur_prev_in*coef_prev_in;\ - aux1 = cur_in - aux1;\ - cur_out = aux1/aux0; + coef_cur_out = v_mul(lambda_reg, Chor_in);\ + aux0 = v_mul(interD_prev_in, coef_prev_in);\ + aux1 = v_add(coef_cur_out, coef_prev_in);\ + aux1 = v_sub(one_reg, aux1);\ + aux0 = v_sub(aux1, aux0);\ + interD_out = v_div(coef_cur_out, aux0);\ + aux1 = v_mul(cur_prev_in, coef_prev_in);\ + aux1 = v_sub(cur_in, aux1);\ + cur_out = v_div(aux1, aux0); for(;j v_mul_wrap(v_thresh, v_max1)); - v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2)); + v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, 
v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_iG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_iR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); @@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_ v_expand(v_max_val, v_max1, v_max2); // Calculate masks - v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1); - v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2); + v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_u64_1, v_u64_2); - v_SB += v_u64_1 + v_u64_2; + v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2)); v_expand(v_iG1, v_u64_1, v_u64_2); - v_SG += v_u64_1 + v_u64_2; + v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2)); v_expand(v_iR1, v_u64_1, v_u64_2); - v_SR += v_u64_1 + v_u64_2; + v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2)); } // Perform final reduction @@ -282,12 +282,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by gains - v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8; - v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8; - v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8; - v_sG2 = 
v_mul_wrap(v_sG2, v_gainG) >> 8; - v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8; - v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8; + v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8); + v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8); + v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8); + v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8); + v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8); + v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8); // Pack into vectors of v_uint8x16 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); @@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by scaling factors - v_sB1 = (v_sB1 * v_gainB) >> 16; - v_sB2 = (v_sB2 * v_gainB) >> 16; - v_sG1 = (v_sG1 * v_gainG) >> 16; - v_sG2 = (v_sG2 * v_gainG) >> 16; - v_sR1 = (v_sR1 * v_gainR) >> 16; - v_sR2 = (v_sR2 * v_gainR) >> 16; + v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16); + v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16); + v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16); + v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16); + v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16); + v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16); // Pack into vectors of v_uint16x8 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); diff --git a/modules/xphoto/src/learning_based_color_balance.cpp b/modules/xphoto/src/learning_based_color_balance.cpp index bd408e6cb49..de1958dcc60 100644 --- a/modules/xphoto/src/learning_based_color_balance.cpp +++ b/modules/xphoto/src/learning_based_color_balance.cpp @@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_store(mask_ptr + i, v_mask); } uchar global_max[16]; @@ -225,7 +225,7 @@ void 
LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_pack_store(mask_ptr + i, v_mask); } ushort global_max[8]; @@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint8x16 v_mask = v_load(mask_ptr + i); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2; v_expand(v_inB, v_sB1, v_sB2); @@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_sR1, v_sR2); // update the brightest (R,G,B) tuple (process left half): - v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1; - v_uint16x8 v_max_mask = (v_sum > v_max_sum); + v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1); + v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_sB2 + v_sG2 + v_sR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB2 & v_max_mask) + 
(v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_sB1 = v_sB1 + v_sB2; - v_sG1 = v_sG1 + v_sG2; - v_sR1 = v_sR1 + v_sR2; + v_sB1 = v_add(v_sB1, v_sB2); + v_sG1 = v_add(v_sG1, v_sG2); + v_sR1 = v_add(v_sR1, v_sR2); v_uint32x4 v_uint1, v_uint2; v_expand(v_sB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_sG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_sR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); sumG = v_reduce_sum(v_SG); @@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_uint16x8 v_inB, v_inG, v_inR; v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint16x8 v_mask = v_load_expand(mask_ptr + i); - v_mask = v_mask | ((v_mask & v_mask_lower) << 8); + v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower))); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2; v_expand(v_inB, v_iB1, v_iB2); @@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_iR1, v_iR2); // update the brightest (R,G,B) tuple (process left half): - v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1; - v_uint32x4 v_max_mask = (v_sum > v_max_sum); + v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1); + v_uint32x4 
v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_iB2 + v_iG2 + v_iR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_iB1 = v_iB1 + v_iB2; - v_iG1 = v_iG1 + v_iG2; - v_iR1 = v_iR1 + v_iR2; + v_iB1 = v_add(v_iB1, v_iB2); + v_iG1 = v_add(v_iG1, v_iG2); + v_iR1 = v_add(v_iR1, v_iR2); v_uint64x2 v_uint64_1, v_uint64_2; v_expand(v_iB1, v_uint64_1, v_uint64_2); - v_SB += v_uint64_1 + v_uint64_2; + v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iG1, v_uint64_1, v_uint64_2); - v_SG += v_uint64_1 + v_uint64_2; + v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iR1, v_uint64_1, v_uint64_2); - v_SR += v_uint64_1 + v_uint64_2; + v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2)); } uint64 sum_arr[2]; v_store(sum_arr, v_SB); diff --git a/modules/xphoto/src/oilpainting.cpp 
b/modules/xphoto/src/oilpainting.cpp index 21e62414c32..daeffd386a7 100644 --- a/modules/xphoto/src/oilpainting.cpp +++ b/modules/xphoto/src/oilpainting.cpp @@ -58,7 +58,7 @@ class ParallelOilPainting : public ParallelLoopBody int dynRatio; public: - ParallelOilPainting(Mat& img, Mat &d, Mat &iLuminance, int r,int k) : + ParallelOilPainting(Mat& img, Mat &d, Mat &iLuminance, int r,int k) : imgSrc(img), dst(d), imgLuminance(iLuminance),