diff --git a/modules/cannarithm/include/opencv2/acl_stream_accessor.hpp b/modules/cannarithm/include/opencv2/acl_stream_accessor.hpp
deleted file mode 100644
index 27118d807e3..00000000000
--- a/modules/cannarithm/include/opencv2/acl_stream_accessor.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANN_STREAM_ACCESSOR_HPP
-#define OPENCV_CANN_STREAM_ACCESSOR_HPP
-
-#include <acl/acl.h>
-#include "opencv2/cann.hpp"
-
-namespace cv
-{
-namespace cann
-{
-
-//! @addtogroup cann_struct
-//! @{
-
-/** @brief Class that enables getting aclrtAclStream from cann::AclStream
- */
-struct AclStreamAccessor
-{
-    CV_EXPORTS static aclrtStream getStream(const AclStream& stream);
-    CV_EXPORTS static AclStream wrapStream(aclrtStream stream);
-};
-
-/** @brief Class that enables getting aclrtAclEvent from cann::AclEvent
- */
-struct AclEventAccessor
-{
-    CV_EXPORTS static aclrtEvent getEvent(const AclEvent& event);
-    CV_EXPORTS static AclEvent wrapEvent(aclrtEvent event);
-};
-
-//! @} cann_struct
-
-} // namespace cann
-} // namespace cv
-
-#endif // OPENCV_CANN_STREAM_ACCESSOR_HPP
diff --git a/modules/cannarithm/include/opencv2/cann.hpp b/modules/cannarithm/include/opencv2/cann.hpp
deleted file mode 100644
index 6b79f045c0e..00000000000
--- a/modules/cannarithm/include/opencv2/cann.hpp
+++ /dev/null
@@ -1,335 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANN_HPP
-#define OPENCV_CANN_HPP
-
-#include "opencv2/core.hpp"
-
-/**
-  @defgroup cann Ascend-accelerated Computer Vision
-  @{
-    @defgroup canncore Core part
-    @{
-      @defgroup cann_struct Data Structures
-      @defgroup cann_init Initializeation and Information
-    @}
-  @}
- */
-
-namespace cv
-{
-namespace cann
-{
-class AclStream;
-
-//! @addtogroup cann_struct
-//! @{
-
-//===================================================================================
-// AclMat
-//===================================================================================
-
-/** @brief Base storage class for NPU memory with reference counting.
- * AclMat class has a similar interface with Mat and AclMat, and work on [Ascend
- * NPU](https://www.hiascend.com/) backend.
- * @sa Mat cuda::GpuMat
- */
-
-class CV_EXPORTS_W AclMat
-{
-public:
-    class CV_EXPORTS_W Allocator
-    {
-    public:
-        virtual ~Allocator() {}
-
-        // allocator must fill data, step and refcount fields
-        virtual bool allocate(AclMat* mat, int rows, int cols, size_t elemSize) = 0;
-        virtual void free(AclMat* mat) = 0;
-    };
-
-    /**
-     * @brief Create default allocator for AclMat. This allocator alloc memory from device for
-     * specific size.
-     */
-    CV_WRAP static AclMat::Allocator* defaultAllocator();
-
-    /**
-     * @brief Set allocator for AclMat.
-     * @param allocator
-     */
-    CV_WRAP static void setDefaultAllocator(AclMat::Allocator* allocator);
-
-    //! default constructor
-    CV_WRAP explicit AclMat(AclMat::Allocator* allocator_ = AclMat::defaultAllocator());
-
-    //! constructs AclMat of the specified size and type
-    CV_WRAP AclMat(int rows, int cols, int type,
-                   AclMat::Allocator* allocator = AclMat::defaultAllocator());
-    //! constructs AclMat of the specified size and type
-    CV_WRAP AclMat(Size size, int type, AclMat::Allocator* allocator = AclMat::defaultAllocator());
-
-    //! constructs AclMat and fills it with the specified value s
-    CV_WRAP AclMat(int rows, int cols, int type, Scalar& s,
-                   AclMat::Allocator* allocator = AclMat::defaultAllocator());
-    //! constructs AclMat and fills it with the specified value s
-    CV_WRAP AclMat(Size size, int type, Scalar& s,
-                   AclMat::Allocator* allocator = AclMat::defaultAllocator());
-
-    //! copy constructor
-    CV_WRAP AclMat(const AclMat& m);
-
-    //! constructor for AclMat headers pointing to user-allocated data
-    AclMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
-    //! constructor for AclMat headers pointing to user-allocated data
-    AclMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
-
-    //! builds AclMat from host memory (Blocking call)
-    CV_WRAP explicit AclMat(InputArray arr,
-                            AclMat::Allocator* allocator = AclMat::defaultAllocator());
-
-    //! assignment operators
-    AclMat& operator=(const AclMat& m);
-
-    //! destructor - calls release()
-    ~AclMat();
-
-    //! sets some of the AclMat elements to s (Blocking call)
-    CV_WRAP AclMat& setTo(Scalar s);
-    //! sets some of the AclMat elements to s (Non-Blocking call)
-    CV_WRAP AclMat& setTo(Scalar s, AclStream& stream);
-
-    //! swaps with other smart pointer
-    CV_WRAP void swap(AclMat& mat);
-
-    //! allocates new AclMat data unless the AclMat already has specified size and type
-    CV_WRAP void create(int rows, int cols, int type);
-
-    //! upload host memory data to AclMat (Blocking call)
-    CV_WRAP void upload(InputArray arr);
-    //! upload host memory data to AclMat (Non-Blocking call)
-    CV_WRAP void upload(InputArray arr, AclStream& stream);
-
-    //! download data from AclMat to host (Blocking call)
-    CV_WRAP void download(OutputArray dst) const;
-    //! download data from AclMat to host (Non-Blocking call)
-    CV_WRAP void download(OutputArray dst, AclStream& stream) const;
-
-    //! converts AclMat to another datatype (Blocking call)
-    CV_WRAP void convertTo(CV_OUT AclMat& dst, int rtype) const;
-
-    //! converts AclMat to another datatype (Non-Blocking call)
-    CV_WRAP void convertTo(CV_OUT AclMat& dst, int rtype, AclStream& stream) const;
-
-    //! decreases reference counter, deallocate the data when reference counter reaches 0
-    CV_WRAP void release();
-
-    //! returns element size in bytes
-    CV_WRAP size_t elemSize() const;
-
-    //! returns the size of element channel in bytes
-    CV_WRAP size_t elemSize1() const;
-
-    //! returns element type
-    CV_WRAP int type() const;
-
-    //! returns element type
-    CV_WRAP int depth() const;
-
-    //! returns number of channels
-    CV_WRAP int channels() const;
-
-    //! returns step/elemSize1()
-    CV_WRAP size_t step1() const;
-
-    //! returns AclMat size : width == number of columns, height == number of rows
-    CV_WRAP Size size() const;
-
-    //! returns true if AclMat data is NULL
-    CV_WRAP bool empty() const;
-
-    //! internal use method: updates the continuity flag
-    CV_WRAP void updateContinuityFlag();
-
-    //! expand one channel mat to multi-channels (Blocking call)
-    //! @note, source mat must only have one channel, copy value to all channels.
-    CV_WRAP void expandTo(CV_OUT AclMat& dst, int channels) const;
-
-    //! expand one channel mat to multi-channels (Non-Blocking call)
-    //! @note, source mat must only have one channel, copy value to all channels.
-    CV_WRAP void expandTo(CV_OUT AclMat& dst, int channels, AclStream& stream) const;
-
-    /*! includes several bit-fields:
-     - the magic signature
-     - continuity flag
-     - depth
-     - number of channels
-     */
-    int flags;
-
-    //! the number of rows and columns
-    int rows, cols;
-
-    //! a distance between successive rows in bytes; includes the gap if any
-    CV_PROP size_t step;
-
-    //! pointer to the data
-    uchar* data;
-
-    //! pointer to the reference counter;
-    //! when AclMat points to user-allocated data, the pointer is NULL
-    int* refcount;
-
-    //! helper fields used in locateROI and adjustROI
-    uchar* datastart;
-    const uchar* dataend;
-
-    //! allocator
-    Allocator* allocator;
-};
-
-class AclStream;
-class AclStreamAccessor;
-class AclEvent;
-class AclEventAccessor;
-class DefaultDeviceInitializer;
-
-//===================================================================================
-// AclStream
-//===================================================================================
-
-/** @brief In AscendCL Stream(AclStream) is a task queue. Stream is used to manage the parallelism
- * of tasks. The tasks inside a Stream are executed sequentially, that is, the Stream executes
- * sequentially according to the sent tasks; the tasks in different Streams are executed in
- * parallel.
- *
- * All Non-blocking functions should pass parameter stream, These function returns immediately after
- * the task is submitted. Caller should wait stream until completion.
- *
- * Blocking functions implicityly use the default stream, and synchronize stream before function
- * return.
- * @sa cuda::Stream
- */
-
-// TODO: Stream is defined in namespace cuda, and pybind code does not use a namespace of stream,
-// change stream name to AclStream to avoid confilct.
-class CV_EXPORTS_W AclStream
-{
-public:
-    CV_WRAP AclStream();
-
-    //! blocks the current CPU thread until all operations in the stream are complete.
-    CV_WRAP void waitForCompletion();
-
-    //! blocks the current CPU thread until event trigger.
-    CV_WRAP void waitAclEvent(const cv::cann::AclEvent& event);
-
-    /**
-     * @brief return default AclStream object for default Acl stream.
-     */
-    CV_WRAP static AclStream& Null();
-
-    // acl symbols CANNOT used in any hpp files. Use a inner class to avoid acl symbols defined in
-    // hpp.
-    class Impl;
-
-    // add temporary mat for async release.
-    void addToAsyncRelease(const AclMat& mat);
-
-private:
-    Ptr<Impl> impl_;
-    AclStream(const Ptr<Impl>& impl);
-
-    friend class AclStreamAccessor;
-    friend class DefaultDeviceInitializer;
-};
-
-/**
- * @brief AclEvent to synchronize between different streams.
- */
-class CV_EXPORTS_W AclEvent
-{
-public:
-    CV_WRAP AclEvent();
-
-    //! records an event
-    CV_WRAP void record(AclStream& stream = AclStream::Null());
-
-    //! waits for an event to complete
-    CV_WRAP void waitForComplete() const;
-
-    class Impl;
-
-private:
-    Ptr<Impl> impl_;
-    AclEvent(const Ptr<Impl>& impl);
-
-    friend class AclEventAccessor;
-};
-
-/** @brief Bindings overload to create a Stream object from the address stored in an existing CANN
- * Runtime API stream pointer (aclrtStream).
- * @param aclStreamAddress Memory address stored in a CANN Runtime API stream pointer
- * (aclrtStream). The created Stream object does not perform any allocation or deallocation and simply
- * wraps existing raw CANN Runtime API stream pointer.
- * @note Overload for generation of bindings only, not exported or intended for use internally fro C++.
- */
-CV_EXPORTS_W AclStream wrapStream(size_t aclStreamAddress);
-
-//! @} cann_struct
-
-//===================================================================================
-// Initialization & Info
-//===================================================================================
-
-//! @addtogroup cann_init
-//! @{
-
-//! Get Ascend matrix object from Input array, upload matrix memory if need. (Blocking call)
-AclMat getInputMat(InputArray src);
-//! Get Ascend matrix object from Input array, upload matrix memory if need. (Non-Blocking call)
-AclMat getInputMat(InputArray src, AclStream& stream);
-
-//! Get Ascend matrix object from Output array, upload matrix memory if need.
-AclMat getOutputMat(OutputArray dst, int rows, int cols, int type);
-
-//! Sync output matrix to Output array, download matrix memory if need.
-void syncOutput(const AclMat& dst, OutputArray _dst);
-
-/**
- * @brief Choose Ascend npu device.
- */
-CV_EXPORTS_W void setDevice(int device);
-
-/**
- * @brief Clear all context created in current Ascend device.
- */
-CV_EXPORTS_W void resetDevice();
-
-/**
- * @brief Get current Ascend device.
- */
-CV_EXPORTS_W int32_t getDevice();
-
-/**
- * @brief init AscendCL.
- */
-CV_EXPORTS_W void initAcl();
-
-/**
- * @brief finalize AscendCL.
- * @note finalizeAcl only can be called once for a process. Call this function after all AscendCL
- * options finished.
- */
-CV_EXPORTS_W void finalizeAcl();
-
-//! @} cann_init
-
-} // namespace cann
-} // namespace cv
-
-#include "opencv2/cann.inl.hpp"
-
-#endif /* OPENCV_CANN_HPP */
diff --git a/modules/cannarithm/include/opencv2/cann.inl.hpp b/modules/cannarithm/include/opencv2/cann.inl.hpp
deleted file mode 100644
index 0c85e8dcc7a..00000000000
--- a/modules/cannarithm/include/opencv2/cann.inl.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANNINL_HPP
-#define OPENCV_CANNINL_HPP
-
-#include "opencv2/cann.hpp"
-
-namespace cv
-{
-namespace cann
-{
-inline AclMat::AclMat(AclMat::Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0),
-      allocator(allocator_)
-{
-}
-
-inline AclMat::AclMat(int rows_, int cols_, int type_, AclMat::Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0),
-      allocator(allocator_)
-{
-    if (rows_ > 0 && cols_ > 0)
-        create(rows_, cols_, type_);
-}
-
-inline AclMat::AclMat(Size size_, int type_, AclMat::Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0),
-      allocator(allocator_)
-{
-    if (size_.height > 0 && size_.width > 0)
-        create(size_.height, size_.width, type_);
-}
-
-inline AclMat::AclMat(InputArray arr, AclMat::Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0),
-      allocator(allocator_)
-{
-    upload(arr);
-}
-
-inline AclMat::AclMat(const AclMat& m)
-    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount),
-      datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
-{
-    if (refcount)
-        CV_XADD(refcount, 1);
-}
-
-inline AclMat::~AclMat() { release(); }
-
-inline AclMat& AclMat::operator=(const AclMat& m)
-{
-    if (this != &m)
-    {
-        AclMat temp(m);
-        swap(temp);
-    }
-
-    return *this;
-}
-
-inline void AclMat::swap(AclMat& b)
-{
-    std::swap(flags, b.flags);
-    std::swap(rows, b.rows);
-    std::swap(cols, b.cols);
-    std::swap(step, b.step);
-    std::swap(data, b.data);
-    std::swap(datastart, b.datastart);
-    std::swap(dataend, b.dataend);
-    std::swap(refcount, b.refcount);
-    std::swap(allocator, b.allocator);
-}
-
-inline void AclMat::release()
-{
-    CV_DbgAssert(allocator != 0);
-
-    if (refcount && CV_XADD(refcount, -1) == 1)
-        allocator->free(this);
-
-    dataend = data = datastart = 0;
-    step = rows = cols = 0;
-    refcount = 0;
-}
-
-inline size_t AclMat::elemSize() const { return CV_ELEM_SIZE(flags); }
-
-inline size_t AclMat::elemSize1() const { return CV_ELEM_SIZE1(flags); }
-
-inline int AclMat::type() const { return CV_MAT_TYPE(flags); }
-
-inline int AclMat::depth() const { return CV_MAT_DEPTH(flags); }
-
-inline int AclMat::channels() const { return CV_MAT_CN(flags); }
-
-inline size_t AclMat::step1() const { return step / elemSize1(); }
-
-inline Size AclMat::size() const { return Size(cols, rows); }
-
-inline bool AclMat::empty() const { return data == 0; }
-
-inline AclStream::AclStream(const Ptr<AclStream::Impl>& impl) : impl_(impl) {}
-
-inline AclEvent::AclEvent(const Ptr<AclEvent::Impl>& impl) : impl_(impl) {}
-} // namespace cann
-} // namespace cv
-
-#endif // OPENCV_CANNINL_HPP
diff --git a/modules/cannarithm/include/opencv2/cann_arithm.hpp b/modules/cannarithm/include/opencv2/cann_arithm.hpp
deleted file mode 100644
index 9a0f3f1655f..00000000000
--- a/modules/cannarithm/include/opencv2/cann_arithm.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANNARITHM_HPP
-#define OPENCV_CANNARITHM_HPP
-
-#include "opencv2/cann.hpp"
-
-namespace cv
-{
-namespace cann
-{
-
-/**
-  @addtogroup cann
-  @{
-    @defgroup cannarithm Operations on Matrices
-    @{
-        @defgroup cannarithm_elem Per-element Operations
-    @}
-  @}
- */
-
-//! @addtogroup cannarithm_elem
-//! @{
-
-/** @brief Computes a matrix-matrix or matrix-scalar sum.
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
- * destination array to be changed. The mask can be used only with single channel images.
- * @param dtype Optional depth of the output array.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::add cuda::add
- */
-CV_EXPORTS_W void add(InputArray src1, InputArray src2, OutputArray dst,
-                      InputArray mask = noArray(), int dtype = -1,
-                      AclStream& stream = AclStream::Null());
-// This code should not be compiled nor analyzed by doxygen. This interface only for python binding
-// code generation. add(InputArray, InputArray ...) can accept Scalar as its parametr.(Scalar -> Mat
-// -> InputArray)
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void add(InputArray src1, Scalar src2, OutputArray dst, InputArray mask = noArray(),
-                      int dtype = -1, AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void add(Scalar src1, InputArray src2, OutputArray dst, InputArray mask = noArray(),
-                      int dtype = -1, AclStream& stream = AclStream::Null());
-#endif
-
-/** @brief Computes a matrix-matrix or matrix-scalar difference.
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
- * destination array to be changed. The mask can be used only with single channel images.
- * @param dtype Optional depth of the output array.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::subtract cuda::subtract
- */
-CV_EXPORTS_W void subtract(InputArray src1, InputArray src2, OutputArray dst,
-                           InputArray mask = noArray(), int dtype = -1,
-                           AclStream& stream = AclStream::Null());
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void subtract(InputArray src1, Scalar src2, OutputArray dst,
-                           InputArray mask = noArray(), int dtype = -1,
-                           AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void subtract(Scalar src1, InputArray src2, OutputArray dst,
-                           InputArray mask = noArray(), int dtype = -1,
-                           AclStream& stream = AclStream::Null());
-#endif
-
-/** @brief Computes a matrix-matrix or matrix-scalar per-element product.
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param scale Optional scale factor.
- * @param dtype Optional depth of the output array.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::multiply cuda::multiply
- */
-CV_EXPORTS_W void multiply(InputArray src1, InputArray src2, OutputArray dst, float scale,
-                           int dtype = -1, AclStream& stream = AclStream::Null());
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void multiply(InputArray src1, Scalar src2, OutputArray dst, float scale,
-                           int dtype = -1, AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void multiply(Scalar src1, InputArray src2, OutputArray dst, float scale,
-                           int dtype = -1, AclStream& stream = AclStream::Null());
-#endif
-
-/** @brief Computes a matrix-matrix or matrix-scalar division.
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1 .
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param scale Optional scale factor.
- * @param dtype Optional depth of the output array.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::divide cuda::divide
- */
-CV_EXPORTS_W void divide(InputArray src1, InputArray src2, OutputArray dst, float scale,
-                         int dtype = -1, AclStream& stream = AclStream::Null());
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void divide(InputArray src1, Scalar src2, OutputArray dst, float scale, int dtype = -1,
-                         AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void divide(Scalar src1, InputArray src2, OutputArray dst, float scale, int dtype = -1,
-                         AclStream& stream = AclStream::Null());
-#endif
-
-/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar.
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
- * destination array to be changed. The mask can be used only with single channel images.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::bitwise_and cuda::bitwise_and
- */
-CV_EXPORTS_W void bitwise_and(InputArray src1, InputArray src2, OutputArray dst,
-                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void bitwise_and(InputArray src1, Scalar src2, OutputArray dst,
-                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void bitwise_and(Scalar src1, InputArray src2, OutputArray dst,
-                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-#endif
-
-/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar.
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
- * destination array to be changed. The mask can be used only with single channel images.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::bitwise_or cuda::bitwise_or
- */
-CV_EXPORTS_W void bitwise_or(InputArray src1, InputArray src2, OutputArray dst,
-                             InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void bitwise_or(InputArray src1, Scalar src2, OutputArray dst,
-                             InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void bitwise_or(Scalar src1, InputArray src2, OutputArray dst,
-                             InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-#endif
-
-/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and
- * scalar).
- * @param src1 First source matrix or scalar.
- * @param src2 Second source matrix or scalar.
- * @param dst Destination matrix that has the same size and number of channels as the input
- * array(s). The depth is defined by dtype or src1 depth.
- * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
- * destination array to be changed. The mask can be used only with single channel images.
- * @param stream AclStream for the asynchronous version.
- * @sa cv::bitwise_xor cuda::bitwise_xor
- */
-CV_EXPORTS_W void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst,
-                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-#ifdef NEVER_DEFINED
-CV_EXPORTS_W void bitwise_xor(InputArray src1, Scalar src2, OutputArray dst,
-                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-CV_EXPORTS_W void bitwise_xor(Scalar src1, InputArray src2, OutputArray dst,
-                              InputArray mask = noArray(), AclStream& stream = AclStream::Null());
-#endif
-
-//! @} cannarithm_elem
-
-} // namespace cann
-} // namespace cv
-
-#endif /* OPENCV_CANNARITHM_HPP */
diff --git a/modules/cannarithm/include/opencv2/cann_call.hpp b/modules/cannarithm/include/opencv2/cann_call.hpp
deleted file mode 100644
index 6afdd266a21..00000000000
--- a/modules/cannarithm/include/opencv2/cann_call.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANNCALL_HPP
-#define OPENCV_CANNCALL_HPP
-
-#include <vector>
-#include <acl/acl.h>
-#include "opencv2/cann.hpp"
-
-namespace cv
-{
-namespace cann
-{
-struct AclAttribute
-{
-    virtual ~AclAttribute() = default;
-    virtual void addAttr(aclopAttr* opAttr) = 0;
-};
-
-#define DEFINE_ATTR(FUNC, TYPE)                                                              \
-    class Acl##FUNC##Attribute : public AclAttribute                                         \
-    {                                                                                        \
-        const char* name;                                                                    \
-        TYPE value;                                                                          \
-                                                                                             \
-    public:                                                                                  \
-        Acl##FUNC##Attribute(const char* _name, TYPE _value) : name(_name), value(_value){}; \
-        void addAttr(aclopAttr* opAttr) override                                             \
-        {                                                                                    \
-            CV_ACL_SAFE_CALL(aclopSetAttr##FUNC(opAttr, name, value));                       \
-        }                                                                                    \
-    }
-
-DEFINE_ATTR(Float, float);
-DEFINE_ATTR(String, const char*);
-
-static std::vector<AclAttribute*> emptyattr;
-void aclOneInput(const AclMat& src, AclMat& dst, const char* op,
-                 AclStream& stream = AclStream::Null(),
-                 std::vector<AclAttribute*>& attrs = emptyattr);
-
-void aclTwoInputs(const AclMat& src1, const AclMat& src2, AclMat& dst, const char* op,
-                  AclStream& stream = AclStream::Null());
-
-void transNCHWToNHWC(const AclMat& src, AclMat& dst, AclStream& stream = AclStream::Null());
-
-} // namespace cann
-} // namespace cv
-
-#endif // OPENCV_CANNCALL_HPP
diff --git a/modules/cannarithm/include/opencv2/cann_common.hpp b/modules/cannarithm/include/opencv2/cann_common.hpp
deleted file mode 100644
index ecff9f07589..00000000000
--- a/modules/cannarithm/include/opencv2/cann_common.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANN_COMMON_HPP
-#define OPENCV_CANN_COMMON_HPP
-
-#include <acl/acl.h>
-
-namespace cv
-{
-namespace cann
-{
-static inline void checkAclError(aclError err, const char* file, const int line, const char* func)
-{
-    if (ACL_SUCCESS != err)
-    {
-        const char* errMsg = aclGetRecentErrMsg();
-        cv::error(cv::Error::AscendApiCallError, errMsg == nullptr ? "" : errMsg, func, file, line);
-    }
-}
-
-static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
-{
-    if (nullptr == ptr)
-    {
-        const char* errMsg = aclGetRecentErrMsg();
-        cv::error(cv::Error::AscendApiCallError, errMsg == nullptr ? "" : errMsg, func, file, line);
-    }
-}
-
-} // namespace cann
-} // namespace cv
-
-#define CV_ACL_SAFE_CALL(expr) cv::cann::checkAclError((expr), __FILE__, __LINE__, CV_Func)
-#define CV_ACL_SAFE_CALL_PTR(expr)                               \
-    ({                                                           \
-        auto ptr = (expr);                                       \
-        cv::cann::checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
-        ptr;                                                     \
-    })
-
-#endif // OPENCV_CANN_COMMON_HPP
diff --git a/modules/cannarithm/include/opencv2/cann_prepare.hpp b/modules/cannarithm/include/opencv2/cann_prepare.hpp
deleted file mode 100644
index cc1aba25618..00000000000
--- a/modules/cannarithm/include/opencv2/cann_prepare.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifndef OPENCV_CANNPREPARE_HPP
-#define OPENCV_CANNPREPARE_HPP
-
-#include <vector>
-#include <acl/acl.h>
-#include "opencv2/core.hpp"
-#include "opencv2/cann_common.hpp"
-
-namespace cv
-{
-namespace cann
-{
-struct CannPreparation
-{
-    CannPreparation() { opAttr_ = CV_ACL_SAFE_CALL_PTR(aclopCreateAttr()); }
-
-    virtual ~CannPreparation()
-    {
-        for (auto desc : inputDesc_)
-        {
-            aclDestroyTensorDesc(desc);
-        }
-
-        for (auto desc : outputDesc_)
-        {
-            aclDestroyTensorDesc(desc);
-        }
-
-        for (auto buf : inputBuffers_)
-        {
-            aclDestroyDataBuffer(buf);
-        }
-
-        for (auto buf : outputBuffers_)
-        {
-            aclDestroyDataBuffer(buf);
-        }
-
-        aclopDestroyAttr(opAttr_);
-    }
-
-    std::vector<aclDataBuffer*> inputBuffers_;
-    std::vector<aclDataBuffer*> outputBuffers_;
-    std::vector<aclTensorDesc*> inputDesc_;
-    std::vector<aclTensorDesc*> outputDesc_;
-    aclopAttr* opAttr_;
-};
-
-#define CANN_PREPARE_ADD_ATTR(var, type, ...)                           \
-    do                                                                  \
-    {                                                                   \
-        CV_ACL_SAFE_CALL(aclopSetAttr##type(var.opAttr_, __VA_ARGS__)); \
-    } while (0)
-
-#define CANN_PREPARE_INPUTDESC(var, ...)                                     \
-    do                                                                       \
-    {                                                                        \
-        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateTensorDesc(__VA_ARGS__)); \
-        if (_rPtr != nullptr)                                                \
-            var.inputDesc_.push_back(_rPtr);                                 \
-    } while (0)
-
-#define CANN_PREPARE_OUTPUTDESC(var, ...)                                    \
-    do                                                                       \
-    {                                                                        \
-        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateTensorDesc(__VA_ARGS__)); \
-        if (_rPtr != nullptr)                                                \
-            var.outputDesc_.push_back(_rPtr);                                \
-    } while (0)
-
-#define CANN_PREPARE_INPUTBUFFER(var, ...)                                   \
-    do                                                                       \
-    {                                                                        \
-        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(__VA_ARGS__)); \
-        if (_rPtr != nullptr)                                                \
-            var.inputBuffers_.push_back(_rPtr);                              \
-    } while (0)
-
-#define CANN_PREPARE_OUTPUTBUFFER(var, ...)                                  \
-    do                                                                       \
-    {                                                                        \
-        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(__VA_ARGS__)); \
-        if (_rPtr != nullptr)                                                \
-            var.outputBuffers_.push_back(_rPtr);                             \
-    } while (0)
-
-aclDataType getACLType(int opencvdepth);
-
-} // namespace cann
-} // namespace cv
-
-#endif // OPENCV_CANNPREPARE_HPP
diff --git a/modules/cannarithm/misc/python/pyopencv_cann.hpp b/modules/cannarithm/misc/python/pyopencv_cann.hpp
deleted file mode 100644
index 61dc824c886..00000000000
--- a/modules/cannarithm/misc/python/pyopencv_cann.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#ifdef HAVE_OPENCV_CORE
-
-#include "opencv2/cann.hpp"
-
-typedef std::vector<cann::AclMat> vector_AclMat;
-typedef cann::AclMat::Allocator AclMat_Allocator;
-
-CV_PY_TO_CLASS(cann::AclMat);
-CV_PY_TO_CLASS(cann::AclStream);
-
-CV_PY_TO_CLASS_PTR(cann::AclMat);
-CV_PY_TO_CLASS_PTR(cann::AclMat::Allocator);
-
-CV_PY_FROM_CLASS(cann::AclMat);
-CV_PY_FROM_CLASS(cann::AclStream);
-
-CV_PY_FROM_CLASS_PTR(cann::AclMat::Allocator);
-
-#endif
diff --git a/modules/cannarithm/perf/perf_element_operations.cpp b/modules/cannarithm/perf/perf_element_operations.cpp
deleted file mode 100644
index 5299f4b3c78..00000000000
--- a/modules/cannarithm/perf/perf_element_operations.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "perf_precomp.hpp"
-#include "opencv2/cann_arithm.hpp"
-
-namespace opencv_test
-{
-namespace
-{
-
-#define ARITHM_MAT_DEPTH Values(CV_32S, CV_32SC3)
-#define TYPICAL_ACL_MAT_SIZES ::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p
-#define DEVICE_ID 0
-#define DEF_PARAM_TEST(name, ...) \
-    typedef ::perf::TestBaseWithParam<testing::tuple<__VA_ARGS__>> name
-
-// NPU Perf Test
-DEF_PARAM_TEST(NPU, cv::Size, perf::MatDepth);
-#define TEST_NPU_OP_MAT(idx, op, ...)                                                       \
-    PERF_TEST_P(NPU, MAT_##op##_MAT_##idx,                                                  \
-                testing::Combine(testing::Values(TYPICAL_ACL_MAT_SIZES), ARITHM_MAT_DEPTH)) \
-    {                                                                                       \
-        Size size = GET_PARAM(0);                                                           \
-        int depth = GET_PARAM(1);                                                           \
-                                                                                            \
-        Mat src1(size, depth), src2(size, depth);                                           \
-        declare.in(src1, WARMUP_RNG);                                                       \
-        declare.in(src2, WARMUP_RNG);                                                       \
-        cv::cann::setDevice(DEVICE_ID);                                                     \
-                                                                                            \
-        AclMat npu_src1, npu_src2, dst;                                                     \
-        npu_src1.upload(src1);                                                              \
-        npu_src2.upload(src2);                                                              \
-        AclStream stream;                                                                   \
-        TEST_CYCLE() { cv::cann::op(npu_src1, npu_src2, dst, __VA_ARGS__); }                \
-        SANITY_CHECK_NOTHING();                                                             \
-        cv::cann::resetDevice();                                                            \
-    }
-
-// CPU Perf Test
-DEF_PARAM_TEST(CPU, cv::Size, perf::MatDepth);
-#define TEST_CPU_OP_MAT(idx, op, ...)                                                       \
-    PERF_TEST_P(CPU, MAT_##op##_MAT_##idx,                                                  \
-                testing::Combine(testing::Values(TYPICAL_ACL_MAT_SIZES), ARITHM_MAT_DEPTH)) \
-    {                                                                                       \
-        Size size = GET_PARAM(0);                                                           \
-        int depth = GET_PARAM(1);                                                           \
-                                                                                            \
-        Mat src1(size, depth), src2(size, depth), dst(size, depth);                         \
-        declare.in(src1, WARMUP_RNG);                                                       \
-        declare.in(src2, WARMUP_RNG);                                                       \
-                                                                                            \
-        TEST_CYCLE() cv::op(src1, src2, dst, __VA_ARGS__);                                  \
-        SANITY_CHECK_NOTHING();                                                             \
-    }
-
-TEST_NPU_OP_MAT(1, add, noArray(), -1);
-TEST_CPU_OP_MAT(1, add, noArray(), -1);
-
-TEST_NPU_OP_MAT(1, subtract, noArray(), -1);
-TEST_CPU_OP_MAT(1, subtract, noArray(), -1);
-
-TEST_NPU_OP_MAT(1, multiply, 1, -1);
-TEST_CPU_OP_MAT(1, multiply, 1, -1);
-
-TEST_NPU_OP_MAT(1, divide, 1, -1);
-TEST_CPU_OP_MAT(1, divide, 1, -1);
-
-TEST_NPU_OP_MAT(1, bitwise_and, noArray());
-TEST_CPU_OP_MAT(1, bitwise_and, noArray());
-
-TEST_NPU_OP_MAT(1, bitwise_or, noArray());
-TEST_CPU_OP_MAT(1, bitwise_or, noArray());
-
-TEST_NPU_OP_MAT(1, bitwise_xor, noArray());
-TEST_CPU_OP_MAT(1, bitwise_xor, noArray());
-
-} // namespace
-} // namespace opencv_test
diff --git a/modules/cannarithm/src/aclmat.cpp b/modules/cannarithm/src/aclmat.cpp
deleted file mode 100644
index a7d0dced4d0..00000000000
--- a/modules/cannarithm/src/aclmat.cpp
+++ /dev/null
@@ -1,605 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "precomp.hpp"
-
-namespace
-{
-/********************************************AclMat********************************************/
-class DefaultAllocator : public cv::cann::AclMat::Allocator
-{
-public:
-    bool allocate(cv::cann::AclMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
-    void free(cv::cann::AclMat* mat) CV_OVERRIDE;
-};
-
-bool DefaultAllocator::allocate(cv::cann::AclMat* mat, int rows, int cols, size_t elemSize)
-{
-    CV_ACL_SAFE_CALL(
-        aclrtMalloc((void**)(&mat->data), elemSize * cols * rows, ACL_MEM_MALLOC_HUGE_FIRST));
-
-    mat->step = cols * elemSize;
-    mat->refcount = (int*)cv::fastMalloc(sizeof(int));
-
-    return true;
-}
-
-void DefaultAllocator::free(cv::cann::AclMat* mat)
-{
-    aclrtFree(mat->datastart);
-    cv::fastFree(mat->refcount);
-}
-
-DefaultAllocator cannDefaultAllocator;
-cv::cann::AclMat::Allocator* g_defaultAllocator = &cannDefaultAllocator;
-} // namespace
-
-namespace cv
-{
-namespace cann
-{
-AclMat::Allocator* AclMat::defaultAllocator() { return g_defaultAllocator; }
-
-void AclMat::setDefaultAllocator(AclMat::Allocator* allocator)
-{
-    CV_Assert(allocator != 0);
-    g_defaultAllocator = allocator;
-}
-
-// TODO: this function is copied from matrix.cpp, which is a local symbol there and can be
-// refreneced.
-static int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step)
-{
-    int i, j;
-    for (i = 0; i < dims; i++)
-    {
-        if (size[i] > 1)
-            break;
-    }
-
-    uint64 t = (uint64)size[std::min(i, dims - 1)] * CV_MAT_CN(flags);
-    for (j = dims - 1; j > i; j--)
-    {
-        t *= size[j];
-        if (step[j] * size[j] < step[j - 1])
-            break;
-    }
-
-    if (j <= i && t == (uint64)(int)t)
-        return flags | Mat::CONTINUOUS_FLAG;
-    return flags & ~Mat::CONTINUOUS_FLAG;
-}
-
-void AclMat::updateContinuityFlag()
-{
-    int sz[] = {rows, cols};
-    size_t steps[] = {step, elemSize()};
-    flags = cv::cann::updateContinuityFlag(flags, 2, sz, steps);
-}
-
-AclMat::AclMat(int rows_, int cols_, int type_, void* data_, size_t step_)
-    : flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_), step(step_),
-      data((uchar*)data_), refcount(0), datastart((uchar*)data_), dataend((const uchar*)data_),
-      allocator(defaultAllocator())
-{
-    size_t minstep = cols * elemSize();
-
-    if (step == Mat::AUTO_STEP)
-    {
-        step = minstep;
-    }
-    else
-    {
-        if (rows == 1)
-            step = minstep;
-
-        CV_DbgAssert(step >= minstep);
-    }
-
-    dataend += step * (rows - 1) + minstep;
-    updateContinuityFlag();
-}
-
-AclMat::AclMat(Size size_, int type_, void* data_, size_t step_)
-    : flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(size_.height), cols(size_.width),
-      step(step_), data((uchar*)data_), refcount(0), datastart((uchar*)data_),
-      dataend((const uchar*)data_), allocator(defaultAllocator())
-{
-    size_t minstep = cols * elemSize();
-
-    if (step == Mat::AUTO_STEP)
-    {
-        step = minstep;
-    }
-    else
-    {
-        if (rows == 1)
-            step = minstep;
-
-        CV_DbgAssert(step >= minstep);
-    }
-
-    dataend += step * (rows - 1) + minstep;
-    updateContinuityFlag();
-}
-
-void AclMat::create(int _rows, int _cols, int _type)
-{
-    CV_DbgAssert(_rows >= 0 && _cols >= 0);
-
-    _type &= Mat::TYPE_MASK;
-
-    if (rows == _rows && cols == _cols && type() == _type && data)
-        return;
-
-    if (data)
-        release();
-
-    if (_rows > 0 && _cols > 0)
-    {
-        flags = Mat::MAGIC_VAL + _type;
-        rows = _rows;
-        cols = _cols;
-
-        const size_t esz = elemSize();
-
-        bool allocSuccess = allocator->allocate(this, rows, cols, esz);
-
-        if (!allocSuccess)
-        {
-            // custom allocator fails, try default allocator
-            allocator = defaultAllocator();
-            allocSuccess = allocator->allocate(this, rows, cols, esz);
-            CV_Assert(allocSuccess);
-        }
-
-        if (esz * cols == step)
-            flags |= Mat::CONTINUOUS_FLAG;
-
-        datastart = data;
-        dataend = data + step * (rows - 1) + cols * esz;
-
-        if (refcount)
-            *refcount = 1;
-    }
-}
-
-void AclMat::upload(InputArray arr)
-{
-    Mat mat = arr.getMat();
-    CV_DbgAssert(!mat.empty());
-    create(mat.rows, mat.cols, mat.type());
-    CV_ACL_SAFE_CALL(aclrtMemcpy2d(data, step, mat.data, mat.step[0], cols * elemSize(), rows,
-                                   ACL_MEMCPY_HOST_TO_DEVICE));
-}
-
-void AclMat::upload(InputArray arr, AclStream& _stream)
-{
-    Mat mat = arr.getMat();
-    CV_DbgAssert(!mat.empty());
-    create(mat.rows, mat.cols, mat.type());
-    aclrtStream stream = AclStreamAccessor::getStream(_stream);
-    CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(data, step, mat.data, mat.step[0], cols * elemSize(), rows,
-                                        ACL_MEMCPY_HOST_TO_DEVICE, stream));
-}
-
-void AclMat::download(OutputArray _dst) const
-{
-    CV_DbgAssert(!empty());
-
-    _dst.create(size(), type());
-    Mat dst = _dst.getMat();
-    CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst.data, dst.step[0], data, step, cols * elemSize(), rows,
-                                   ACL_MEMCPY_DEVICE_TO_HOST));
-}
-
-void AclMat::download(OutputArray _dst, AclStream& _stream) const
-{
-    CV_DbgAssert(!empty());
-
-    _dst.create(size(), type());
-    Mat dst = _dst.getMat();
-    aclrtStream stream = AclStreamAccessor::getStream(_stream);
-    CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst.data, dst.step[0], data, step, cols * elemSize(), rows,
-                                        ACL_MEMCPY_DEVICE_TO_HOST, stream));
-}
-
-AclMat::AclMat(int rows_, int cols_, int type_, Scalar& s_, AclMat::Allocator* allocator_)
-    : flags(0), rows(rows_), cols(cols_), step(0), data(0), refcount(0), datastart(0), dataend(0),
-      allocator(allocator_)
-{
-    create(rows_, cols_, type_);
-    setTo(s_);
-}
-
-AclMat::AclMat(Size size_, int type_, Scalar& s_, AclMat::Allocator* allocator_)
-    : flags(0), rows(size_.height), cols(size_.width), step(0), data(0), refcount(0), datastart(0),
-      dataend(0), allocator(allocator_)
-{
-    create(size_.height, size_.width, type_);
-    setTo(s_);
-}
-
-AclMat& AclMat::setTo(Scalar s_) { return setTo(s_, AclStream::Null()); }
-
-AclMat& AclMat::setTo(Scalar s_, AclStream& stream_)
-{
-    size_t totalBytes = (size_t)rows * cols * elemSize();
-    if (totalBytes == 0)
-        return *this;
-
-    CV_ACL_SAFE_CALL(aclrtMemset(data, totalBytes, 0, totalBytes));
-
-    Mat scMat(1, 1, type(), s_);
-    AclMat scAclMat;
-    scAclMat.upload(scMat);
-
-    AclMat dst(rows, cols, type());
-    // TODO use AssignAdd to avoid memcpy, or use broadcase.
-    aclTwoInputs(*this, scAclMat, dst, "Add", stream_);
-    swap(dst);
-
-    return *this;
-}
-
-void AclMat::convertTo(AclMat& dst, int rtype) const { convertTo(dst, rtype, AclStream::Null()); }
-
-void AclMat::convertTo(AclMat& dst, int _rtype, AclStream& _stream) const
-{
-    int cn = channels();
-    dst.create(rows, cols, CV_MAKE_TYPE(_rtype, cn));
-    aclOneInput(*this, dst, "Cast", _stream);
-}
-
-void AclMat::expandTo(CV_OUT AclMat& dst, int chs) const { expandTo(dst, chs, AclStream::Null()); }
-
-void AclMat::expandTo(CV_OUT AclMat& dst, int chs, AclStream& stream) const
-{
-    CV_Assert(channels() == 1);
-
-    // TODO use inplace expand.
-    AclMat NCHW_mat;
-    NCHW_mat.create(rows, cols, CV_MAKE_TYPE(depth(), chs));
-
-    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
-    size_t expandsize = rows * step * chs;
-    uchar* dataptr = (uchar*)NCHW_mat.data;
-    for (int ch = 0; ch < chs; ch++)
-    {
-        if (rawStream == nullptr)
-        {
-            CV_ACL_SAFE_CALL(
-                aclrtMemcpy(dataptr, expandsize, data, rows * step, ACL_MEMCPY_DEVICE_TO_DEVICE));
-        }
-        else
-        {
-            CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dataptr, expandsize, data, rows * step,
-                                              ACL_MEMCPY_DEVICE_TO_DEVICE, rawStream));
-        }
-
-        dataptr += (step * rows);
-    }
-
-    dst.create(rows, cols, CV_MAKE_TYPE(depth(), chs));
-
-    transNCHWToNHWC(NCHW_mat, dst, stream);
-}
-
-AclStream wrapStream(size_t aclStreamAddress)
-{
-    return AclStreamAccessor::wrapStream(reinterpret_cast<aclrtStream>(aclStreamAddress));
-}
-
-static AclMat getAclMat(InputArray arr)
-{
-    _InputArray::KindFlag k = arr.kind();
-    if (k == _InputArray::ACL_MAT)
-    {
-        const cann::AclMat* a_mat = (const cann::AclMat*)arr.getObj();
-        return *a_mat;
-    }
-
-    if (k == _InputArray::NONE)
-        return cann::AclMat();
-
-    CV_Error(cv::Error::StsNotImplemented, "getAclMat is available only for cann::AclMat");
-}
-
-AclMat getInputMat(InputArray _src)
-{
-    AclMat src;
-    if (_src.kind() == _InputArray::ACL_MAT)
-    {
-        src = getAclMat(_src);
-    }
-    else if (!_src.empty())
-    {
-        src.upload(_src);
-    }
-    return src;
-}
-
-AclMat getInputMat(InputArray _src, AclStream& stream)
-{
-    AclMat src;
-    if (_src.kind() == _InputArray::ACL_MAT)
-    {
-        src = getAclMat(_src);
-    }
-    else if (!_src.empty())
-    {
-        aclrtStream rawStream = AclStreamAccessor::getStream(stream);
-        if (rawStream == nullptr)
-        {
-            src.upload(_src);
-        }
-        else
-        {
-            src.upload(_src, stream);
-        }
-    }
-    return src;
-}
-
-AclMat getOutputMat(OutputArray _dst, int rows, int cols, int type)
-{
-    AclMat dst;
-    if (_dst.kind() == _InputArray::ACL_MAT)
-    {
-        ((cann::AclMat*)(_dst.getObj()))->create(rows, cols, type);
-        dst = getAclMat(_dst);
-    }
-    else
-    {
-        dst.create(rows, cols, type);
-    }
-    return dst;
-}
-
-void syncOutput(const AclMat& dst, OutputArray _dst)
-{
-    if (_dst.kind() != _InputArray::ACL_MAT)
-    {
-        dst.download(_dst);
-    }
-}
-
-/********************************************Device********************************************/
-
-void setDevice(int device_id)
-{
-    aclrtContext context;
-    CV_ACL_SAFE_CALL(aclrtSetDevice(device_id));
-    CV_ACL_SAFE_CALL(aclrtCreateContext(&context, device_id));
-}
-
-void resetDevice() { CV_ACL_SAFE_CALL(aclrtResetDevice(getDevice())); }
-
-int32_t getDevice()
-{
-    int32_t deviceId;
-    CV_ACL_SAFE_CALL(aclrtGetDevice(&deviceId));
-    return deviceId;
-}
-
-void initAcl() { CV_ACL_SAFE_CALL(aclInit(nullptr)); }
-
-void finalizeAcl() { CV_ACL_SAFE_CALL(aclFinalize()); }
-
-class DefaultDeviceInitializer
-{
-public:
-    DefaultDeviceInitializer();
-    ~DefaultDeviceInitializer();
-
-    AclStream& getNullAclStream(int deviceId);
-
-private:
-    std::vector<Ptr<AclStream>> streams_;
-    Mutex streams_mtx_;
-};
-
-DefaultDeviceInitializer::DefaultDeviceInitializer() {}
-
-DefaultDeviceInitializer::~DefaultDeviceInitializer() { streams_.clear(); }
-
-AclStream& DefaultDeviceInitializer::getNullAclStream(int deviceId)
-{
-    AutoLock lock(streams_mtx_);
-
-    if (streams_.empty())
-    {
-        uint32_t deviceCount;
-        CV_ACL_SAFE_CALL(aclrtGetDeviceCount(&deviceCount));
-
-        if (deviceCount > 0)
-            streams_.resize(deviceCount);
-    }
-
-    CV_DbgAssert(deviceId >= 0 && deviceId < static_cast<int>(streams_.size()));
-
-    if (streams_[deviceId].empty())
-    {
-        aclrtStream stream = nullptr;
-        Ptr<AclStream::Impl> impl = makePtr<AclStream::Impl>(stream);
-        streams_[deviceId] = Ptr<AclStream>(new AclStream(impl));
-    }
-
-    return *streams_[deviceId];
-}
-
-DefaultDeviceInitializer initializer;
-
-/********************************************AclEvent********************************************/
-class AclEvent::Impl
-{
-public:
-    aclrtEvent event;
-    bool ownEvent;
-
-    Impl();
-    explicit Impl(aclrtEvent event);
-
-    ~Impl();
-};
-
-AclEvent::Impl::Impl() : event(nullptr), ownEvent(true)
-{
-    CV_ACL_SAFE_CALL(aclrtCreateEvent(&event));
-}
-
-AclEvent::Impl::Impl(aclrtEvent e) : event(e), ownEvent(false) {}
-
-AclEvent::Impl::~Impl()
-{
-    if (event && ownEvent)
-    {
-        CV_ACL_SAFE_CALL(aclrtDestroyEvent(event));
-    }
-}
-
-aclrtEvent AclEventAccessor::getEvent(const AclEvent& event) { return event.impl_->event; }
-
-AclEvent AclEventAccessor::wrapEvent(aclrtEvent event)
-{
-    return AclEvent(makePtr<AclEvent::Impl>(event));
-}
-
-AclEvent::AclEvent() { impl_ = makePtr<Impl>(); }
-
-void AclEvent::record(AclStream& stream)
-{
-    CV_ACL_SAFE_CALL(aclrtRecordEvent(impl_->event, AclStreamAccessor::getStream(stream)));
-}
-
-void AclEvent::waitForComplete() const { CV_ACL_SAFE_CALL(aclrtSynchronizeEvent(impl_->event)); }
-
-/******************************************AclStream********************************************/
-struct AsyncThdArgs
-{
-    bool isExit;
-    void* context;
-    pthread_mutex_t mutex;
-    AsyncThdArgs() : isExit(false), context(nullptr), mutex(PTHREAD_MUTEX_INITIALIZER) {}
-};
-
-class AclStream::Impl
-{
-public:
-    aclrtStream stream;
-    bool ownStream;
-    AsyncThdArgs asyncThdArgs;
-    pthread_t asyncThdId;
-
-    void bindThread();
-    void addToAsyncRelease(const AclMat& mat);
-
-    Impl();
-    explicit Impl(aclrtStream stream);
-
-    ~Impl();
-};
-
-AclStream::Impl::Impl() : stream(nullptr), ownStream(true), asyncThdId(0)
-{
-    CV_ACL_SAFE_CALL(aclrtCreateStream(&stream));
-}
-
-AclStream::Impl::Impl(aclrtStream s) : stream(s), ownStream(false), asyncThdId(0) {}
-
-AclStream::Impl::~Impl()
-{
-    if (stream && ownStream)
-    {
-        aclrtSynchronizeStream(stream);
-        if (asyncThdId != 0)
-        {
-            asyncThdArgs.isExit = true;
-            CV_ACL_SAFE_CALL(aclrtUnSubscribeReport(asyncThdId, stream));
-            (void)pthread_join(asyncThdId, nullptr);
-        }
-        CV_ACL_SAFE_CALL(aclrtDestroyStream(stream));
-    }
-}
-
-static void* processReportLoop(void* args_)
-{
-    AsyncThdArgs* args = (AsyncThdArgs*)args_;
-    CV_ACL_SAFE_CALL(aclrtSetCurrentContext(args->context));
-
-    // Wait for subscribe.
-    pthread_mutex_lock(&args->mutex);
-    pthread_mutex_unlock(&args->mutex);
-
-    while (!args->isExit)
-    {
-        aclError ret = aclrtProcessReport(-1);
-        // Skip error check if exiting. aclrtProcessReport will report an timeout error when
-        // unsubscribing.
-        if (!args->isExit)
-            CV_ACL_SAFE_CALL(ret);
-    }
-
-    return (nullptr);
-}
-
-void AclStream::Impl::bindThread()
-{
-    // Only one thread will created. Lock for parallelling.
-    pthread_mutex_lock(&asyncThdArgs.mutex);
-    if (asyncThdId == 0)
-    {
-        CV_ACL_SAFE_CALL(aclrtGetCurrentContext(&asyncThdArgs.context));
-        (void)pthread_create(&asyncThdId, nullptr, processReportLoop, &asyncThdArgs);
-        CV_ACL_SAFE_CALL(aclrtSubscribeReport(asyncThdId, stream));
-    }
-    pthread_mutex_unlock(&asyncThdArgs.mutex);
-}
-
-static void releaseAclMatCB(void* releaseHandle)
-{
-    if (releaseHandle == nullptr)
-        return;
-    AclMat* mat = (AclMat*)releaseHandle;
-    delete mat;
-}
-
-void AclStream::Impl::addToAsyncRelease(const AclMat& mat)
-{
-    if (stream != nullptr)
-    {
-        if (asyncThdId == 0)
-            bindThread();
-        AclMat* releaseHandle = new AclMat(mat);
-        CV_ACL_SAFE_CALL(
-            aclrtLaunchCallback(releaseAclMatCB, releaseHandle, ACL_CALLBACK_BLOCK, stream));
-    }
-}
-
-aclrtStream AclStreamAccessor::getStream(const AclStream& stream) { return stream.impl_->stream; }
-
-AclStream AclStreamAccessor::wrapStream(aclrtStream stream)
-{
-    return AclStream(makePtr<AclStream::Impl>(stream));
-}
-
-AclStream::AclStream() { impl_ = makePtr<Impl>(); }
-
-void AclStream::waitForCompletion() { CV_ACL_SAFE_CALL(aclrtSynchronizeStream(impl_->stream)); }
-
-void AclStream::waitAclEvent(const AclEvent& event)
-{
-    CV_ACL_SAFE_CALL(aclrtStreamWaitEvent(impl_->stream, AclEventAccessor::getEvent(event)));
-}
-
-AclStream& AclStream::Null()
-{
-    const uint32_t deviceId = getDevice();
-    return initializer.getNullAclStream(deviceId);
-}
-
-void AclStream::addToAsyncRelease(const AclMat& mat) { impl_->addToAsyncRelease(mat); }
-
-} // namespace cann
-} // namespace cv
diff --git a/modules/cannarithm/src/cann_call.cpp b/modules/cannarithm/src/cann_call.cpp
deleted file mode 100644
index 0e9ad8036bb..00000000000
--- a/modules/cannarithm/src/cann_call.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "precomp.hpp"
-
-namespace cv
-{
-namespace cann
-{
-void aclOneInput(const AclMat& src, AclMat& dst, const char* op, AclStream& stream,
-                 std::vector<AclAttribute*>& attrs)
-{
-    CannPreparation prepare;
-    for (auto& attrIterator : attrs)
-    {
-        attrIterator->addAttr(prepare.opAttr_);
-    }
-
-    int64_t dimSrc[] = {1, src.rows, src.cols, src.channels()};
-    int64_t dimDst[] = {1, dst.rows, dst.cols, dst.channels()};
-    CANN_PREPARE_INPUTDESC(prepare, getACLType(src.depth()), sizeof(dimSrc) / sizeof(dimSrc[0]),
-                           dimSrc, ACL_FORMAT_NHWC);
-    CANN_PREPARE_OUTPUTDESC(prepare, getACLType(dst.depth()), sizeof(dimDst) / sizeof(dimDst[0]),
-                            dimDst, ACL_FORMAT_NHWC);
-
-    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src.data), src.rows * src.step);
-    CANN_PREPARE_OUTPUTBUFFER(prepare, const_cast<uchar*>(dst.data), dst.rows * dst.step);
-
-    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
-
-    CV_ACL_SAFE_CALL(aclopCompileAndExecute(
-        op, prepare.inputDesc_.size(), prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
-        prepare.outputDesc_.size(), prepare.outputDesc_.data(), prepare.outputBuffers_.data(),
-        prepare.opAttr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
-    if (rawStream == nullptr)
-        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
-    else
-    {
-        stream.addToAsyncRelease(src);
-        stream.addToAsyncRelease(dst);
-    }
-}
-
-void aclTwoInputs(const AclMat& src1, const AclMat& src2, AclMat& dst, const char* op,
-                  AclStream& stream)
-{
-    CannPreparation prepare;
-    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
-
-    int64_t dimSrc1[] = {1, src1.rows, src1.cols, src1.channels()};
-    int64_t dimSrc2[] = {1, src2.rows, src2.cols, src2.channels()};
-
-    int64_t dimDst[] = {1, dst.rows, dst.cols, dst.channels()};
-
-    CANN_PREPARE_INPUTDESC(prepare, getACLType(src1.depth()), sizeof(dimSrc1) / sizeof(dimSrc1[0]),
-                           dimSrc1, ACL_FORMAT_NHWC);
-
-    CANN_PREPARE_INPUTDESC(prepare, getACLType(src2.depth()), sizeof(dimSrc2) / sizeof(dimSrc2[0]),
-                           dimSrc2, ACL_FORMAT_NHWC);
-
-    CANN_PREPARE_OUTPUTDESC(prepare, getACLType(dst.depth()), sizeof(dimDst) / sizeof(dimDst[0]),
-                            dimDst, ACL_FORMAT_NHWC);
-
-    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src1.data), src1.rows * src1.step);
-    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src2.data), src2.rows * src2.step);
-    CANN_PREPARE_OUTPUTBUFFER(prepare, const_cast<uchar*>(dst.data), dst.rows * dst.step);
-
-    CV_ACL_SAFE_CALL(aclopCompileAndExecute(
-        op, prepare.inputDesc_.size(), prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
-        prepare.outputDesc_.size(), prepare.outputDesc_.data(), prepare.outputBuffers_.data(),
-        prepare.opAttr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
-    if (rawStream == nullptr)
-        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
-    else
-    {
-        stream.addToAsyncRelease(src1);
-        stream.addToAsyncRelease(src2);
-        stream.addToAsyncRelease(dst);
-    }
-}
-
-void transNCHWToNHWC(const AclMat& src, AclMat& dst, AclStream& stream)
-{
-    CannPreparation prepare;
-    CANN_PREPARE_ADD_ATTR(prepare, String, "src_format", "NCHW");
-    CANN_PREPARE_ADD_ATTR(prepare, String, "dst_format", "NHWC");
-
-    int64_t dimSrc[] = {1, src.channels(), src.rows, src.cols};
-    int64_t dimDst[] = {1, dst.rows, dst.cols, dst.channels()};
-
-    CANN_PREPARE_INPUTDESC(prepare, getACLType(src.depth()), sizeof(dimSrc) / sizeof(dimSrc[0]),
-                           dimSrc, ACL_FORMAT_NCHW);
-    CANN_PREPARE_OUTPUTDESC(prepare, getACLType(dst.depth()), sizeof(dimDst) / sizeof(dimDst[0]),
-                            dimDst, ACL_FORMAT_NHWC);
-
-    CANN_PREPARE_INPUTBUFFER(prepare, const_cast<uchar*>(src.data), src.rows * src.step);
-    CANN_PREPARE_OUTPUTBUFFER(prepare, const_cast<uchar*>(dst.data), dst.rows * dst.step);
-
-    aclrtStream rawStream = AclStreamAccessor::getStream(stream);
-
-    CV_ACL_SAFE_CALL(aclopCompileAndExecute("TransData", prepare.inputDesc_.size(),
-                                            prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
-                                            prepare.outputDesc_.size(), prepare.outputDesc_.data(),
-                                            prepare.outputBuffers_.data(), prepare.opAttr_,
-                                            ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
-    if (rawStream == nullptr)
-        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
-    else
-    {
-        stream.addToAsyncRelease(src);
-        stream.addToAsyncRelease(dst);
-    }
-}
-
-aclDataType getACLType(int opencvdepth)
-{
-    switch (opencvdepth)
-    {
-        case CV_8S:
-            return ACL_INT8;
-        case CV_16S:
-            return ACL_INT16;
-        case CV_8U:
-            return ACL_UINT8;
-        case CV_16U:
-            return ACL_UINT16;
-        case CV_32S:
-            return ACL_INT32;
-        case CV_64F:
-            return ACL_DOUBLE;
-        case CV_16F:
-            return ACL_FLOAT16;
-        default:
-            return ACL_DT_UNDEFINED;
-    }
-}
-
-} // namespace cann
-} // namespace cv
diff --git a/modules/cannarithm/src/element_operations.cpp b/modules/cannarithm/src/element_operations.cpp
deleted file mode 100644
index f23323bf384..00000000000
--- a/modules/cannarithm/src/element_operations.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "precomp.hpp"
-#include <map>
-
-namespace cv
-{
-namespace cann
-{
-void opMatMat(AclMat&, AclMat&, AclMat&, const char*, AclStream& stream = AclStream::Null());
-void opMatMat(AclMat& src1, AclMat& src2, AclMat& dst, const char* op, AclStream& stream)
-{
-    aclTwoInputs(src1, src2, dst, op, stream);
-}
-
-void opMatScalar(AclMat&, AclMat&, bool, Scalar, const char*,
-                 AclStream& stream = AclStream::Null());
-void opMatScalar(AclMat& src, AclMat& dst, bool inv, Scalar s, const char* op, AclStream& stream)
-{
-    Mat scMat(1, 1, src.type(), s);
-    AclMat scAclMat;
-    scAclMat.upload(scMat);
-    if (inv)
-        aclTwoInputs(scAclMat, src, dst, op, stream);
-    else
-        aclTwoInputs(src, scAclMat, dst, op, stream);
-}
-
-void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, float scale, int dtype,
-               const char* op, AclStream& stream = AclStream::Null());
-void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, float scale,  int dtype,
-               const char* op, AclStream& stream)
-{
-    const int kind1 = _src1.kind();
-    const int kind2 = _src2.kind();
-
-    const bool isScalar1 = (kind1 == _InputArray::MATX);
-    const bool isScalar2 = (kind2 == _InputArray::MATX);
-
-    AclMat src1, src2;
-
-    if (!isScalar1)
-        src1 = getInputMat(_src1, stream);
-
-    if (!isScalar2)
-        src2 = getInputMat(_src2, stream);
-
-    Mat scalar;
-    if (isScalar1)
-        scalar = _src1.getMat();
-    else if (isScalar2)
-        scalar = _src2.getMat();
-
-    Scalar val;
-    if (!scalar.empty())
-    {
-        CV_Assert(scalar.total() <= 4);
-        scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
-    }
-
-    const int sdepth = src1.empty() ? src2.depth() : src1.depth();
-    const int cn = src1.empty() ? src2.channels() : src1.channels();
-    const Size size = src1.empty() ? src2.size() : src1.size();
-
-    if (dtype < 0)
-        dtype = sdepth;
-
-    const int ddepth = CV_MAT_DEPTH(dtype);
-
-    CV_Assert(sdepth <= CV_64F && ddepth <= CV_64F);
-    CV_Assert(!scalar.empty() || (src2.depth() == src1.depth() && src2.size() == src1.size()));
-
-    AclMat dst = getOutputMat(_dst, size.height, size.width, CV_MAKE_TYPE(ddepth, cn));
-
-    if (isScalar1)
-        opMatScalar(src2, dst, true, val, op, stream);
-    else if (isScalar2)
-        opMatScalar(src1, dst, false, val, op, stream);
-    else
-        opMatMat(src1, src2, dst, op, stream);
-
-    // TODO implement emtpy for AclMat in InputArray
-    AclMat mask = getInputMat(_mask, stream);
-    if (!mask.empty())
-    {
-        int mtype = mask.type();
-
-        CV_Assert((mtype == CV_8UC1 || mtype == CV_8SC1) && mask.size() == size);
-        // TODO use MaskSelect?
-        AclMat formatedMask;
-        if (mask.depth() != dst.depth())
-            mask.convertTo(formatedMask, dst.depth());
-        else
-            formatedMask = mask;
-
-        AclMat expandedMask;
-        if (dst.channels() != 1)
-            formatedMask.expandTo(expandedMask, dst.channels());
-        else
-            expandedMask = formatedMask;
-
-        // TODO call DIV before expand?
-        AclMat divRet;
-        arithm_op(expandedMask, expandedMask, divRet, noArray(), 1, -1, "Div", stream);
-        AclMat dstCopy = dst;
-        // TODO dst memory and dskCopy mempry point to a same memory area, seems no harm yet.
-        arithm_op(dstCopy, divRet, dst, noArray(), 1,  -1, "Mul", stream);
-    }
-
-    if(scale != 1)
-    {
-        AclMat dstCpy = dst;
-        AclFloatAttribute scaleOP("value", scale);
-        std::vector<AclAttribute*> attrs{&scaleOP};
-        aclOneInput(dstCpy, dst, "Muls", stream, attrs);
-    }
-
-    syncOutput(dst, _dst);
-}
-
-void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype,
-         AclStream& stream)
-{
-    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
-}
-
-void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype,
-              AclStream& stream)
-{
-    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
-}
-
-void multiply(InputArray src1, InputArray src2, OutputArray dst, float scale, int dtype, AclStream& stream)
-{
-    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Mul", stream);
-}
-
-void divide(InputArray src1, InputArray src2, OutputArray dst, float scale, int dtype, AclStream& stream)
-{
-    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Div", stream);
-}
-
-void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
-                 AclStream& stream)
-{
-    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
-}
-
-void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
-                AclStream& stream)
-{
-    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
-}
-
-void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
-                 AclStream& stream)
-{
-    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
-}
-
-
-} // namespace cann
-} // namespace cv
diff --git a/modules/cannarithm/test/test_cann.cpp b/modules/cannarithm/test/test_cann.cpp
deleted file mode 100644
index 6c2e65beefe..00000000000
--- a/modules/cannarithm/test/test_cann.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "test_precomp.hpp"
-#include <opencv2/ts/cuda_test.hpp>
-
-namespace opencv_test
-{
-namespace
-{
-
-class DummyAllocator : public AclMat::Allocator
-{
-public:
-    bool allocate(cv::cann::AclMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE
-    {
-        CV_UNUSED(rows);
-        CV_UNUSED(cols);
-        CV_UNUSED(elemSize);
-        mat->data = (uchar*)0x12345;
-        mat->refcount = (int*)cv::fastMalloc(sizeof(int));
-        return true;
-    }
-    void free(cv::cann::AclMat* mat) CV_OVERRIDE
-    {
-        mat->data = (uchar*)0x54321;
-        cv::fastFree(mat->refcount);
-    }
-};
-
-TEST(AclMat, Construct)
-{
-    cv::cann::setDevice(0);
-    // 1 Default constructor.
-    AclMat defaultAclMat;
-    AclMat::Allocator* defaultAllocator = AclMat::defaultAllocator();
-    ASSERT_EQ(defaultAclMat.allocator, defaultAllocator);
-
-    // 2 get & set allocator.
-    DummyAllocator dummyAllocator;
-    AclMat::setDefaultAllocator(&dummyAllocator);
-    ASSERT_EQ(defaultAclMat.defaultAllocator(), &dummyAllocator);
-    AclMat::setDefaultAllocator(defaultAllocator);
-
-    // 3 constructs AclMat of the specified size and type
-    AclMat specifiedSizeAclMat1(5, 6, CV_8UC3);
-    AclMat specifiedSizeAclMat2(Size(300, 200), CV_64F);
-
-    ASSERT_EQ(specifiedSizeAclMat1.rows, 5);
-    ASSERT_EQ(specifiedSizeAclMat1.cols, 6);
-    ASSERT_EQ(specifiedSizeAclMat1.depth(), CV_8U);
-    ASSERT_EQ(specifiedSizeAclMat1.channels(), 3);
-
-    ASSERT_EQ(specifiedSizeAclMat2.cols, 300);
-    ASSERT_EQ(specifiedSizeAclMat2.rows, 200);
-    ASSERT_EQ(specifiedSizeAclMat2.depth(), CV_64F);
-    ASSERT_EQ(specifiedSizeAclMat2.channels(), 1);
-
-    // 4 constructs AclMat and fills it with the specified value s
-    srand((unsigned int)(time(NULL)));
-    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
-
-    Mat scalarToMat(7, 8, CV_8UC3, sc);
-    AclMat scalarToAclMat1(7, 8, CV_8UC3, sc);
-    Mat scalarToMatChecker;
-    scalarToAclMat1.download(scalarToMatChecker);
-
-    EXPECT_MAT_NEAR(scalarToMat, scalarToMatChecker, 0.0);
-
-    AclMat scalarToAclMat2(Size(123, 345), CV_32S);
-
-    ASSERT_EQ(scalarToAclMat1.rows, 7);
-    ASSERT_EQ(scalarToAclMat1.cols, 8);
-    ASSERT_EQ(scalarToAclMat1.depth(), CV_8U);
-    ASSERT_EQ(scalarToAclMat1.channels(), 3);
-
-    ASSERT_EQ(scalarToAclMat2.cols, 123);
-    ASSERT_EQ(scalarToAclMat2.rows, 345);
-    ASSERT_EQ(scalarToAclMat2.depth(), CV_32S);
-    ASSERT_EQ(scalarToAclMat2.channels(), 1);
-
-    // 5 constructor for AclMat headers pointing to user-allocated data
-    void* userAllocatedData = malloc(1);
-    AclMat userAllocatedAclMat1(9, 10, CV_16SC2, userAllocatedData);
-    AclMat userAllocatedAclMat2(Size(1024, 2048), CV_16F, userAllocatedData);
-
-    ASSERT_EQ(userAllocatedAclMat1.rows, 9);
-    ASSERT_EQ(userAllocatedAclMat1.cols, 10);
-    ASSERT_EQ(userAllocatedAclMat1.depth(), CV_16S);
-    ASSERT_EQ(userAllocatedAclMat1.channels(), 2);
-    ASSERT_EQ(userAllocatedAclMat1.data, userAllocatedData);
-
-    ASSERT_EQ(userAllocatedAclMat2.cols, 1024);
-    ASSERT_EQ(userAllocatedAclMat2.rows, 2048);
-    ASSERT_EQ(userAllocatedAclMat2.depth(), CV_16F);
-    ASSERT_EQ(userAllocatedAclMat2.channels(), 1);
-    ASSERT_EQ(userAllocatedAclMat1.data, userAllocatedData);
-
-    // 6 builds AclMat from host memory
-    Scalar sc2(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
-    Mat randomMat(7, 8, CV_8UC3, sc2);
-    InputArray arr = randomMat;
-
-    AclMat fromInputArray(arr);
-    Mat randomMatChecker;
-    fromInputArray.download(randomMatChecker);
-    EXPECT_MAT_NEAR(randomMat, randomMatChecker, 0.0);
-
-    cv::cann::resetDevice();
-}
-
-TEST(AclMat, RefCount)
-{
-    DummyAllocator dummyAllocator;
-    AclMat* mat = new AclMat(1, 1, CV_8U, &dummyAllocator);
-    ASSERT_EQ(*(mat->refcount), 1);
-    ASSERT_EQ(mat->data, (uchar*)0x12345);
-
-    AclMat* copy1 = new AclMat(*mat);
-    ASSERT_EQ(mat->refcount, copy1->refcount);
-    ASSERT_EQ(*(copy1->refcount), 2);
-
-    AclMat* copy2 = new AclMat(*copy1);
-    ASSERT_EQ(mat->refcount, copy2->refcount);
-    ASSERT_EQ(*(copy2->refcount), 3);
-
-    delete copy1;
-    ASSERT_EQ(mat->data, (uchar*)0x12345);
-    ASSERT_EQ(*(mat->refcount), 2);
-
-    delete copy2;
-    ASSERT_EQ(mat->data, (uchar*)0x12345);
-    ASSERT_EQ(*(mat->refcount), 1);
-
-    delete mat;
-}
-
-TEST(AclMat, Assignment)
-{
-    DummyAllocator dummyAllocator;
-    AclMat mat1;
-    AclMat mat2(3, 4, CV_8SC1, &dummyAllocator);
-    mat1 = mat2;
-
-    ASSERT_EQ(mat1.rows, 3);
-    ASSERT_EQ(mat1.cols, 4);
-    ASSERT_EQ(mat1.depth(), CV_8S);
-    ASSERT_EQ(mat1.channels(), 1);
-    ASSERT_EQ(mat1.data, (uchar*)0x12345);
-}
-
-TEST(AclMat, SetTo)
-{
-    cv::cann::setDevice(0);
-
-    srand((unsigned int)(time(NULL)));
-    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
-
-    AclMat aclMat(2, 2, CV_8UC4);
-    aclMat.setTo(sc);
-    Mat mat(2, 2, CV_8UC4, sc);
-    Mat checker;
-    aclMat.download(checker);
-
-    EXPECT_MAT_NEAR(mat, checker, 0.0);
-
-    cv::cann::resetDevice();
-}
-
-TEST(AclMat, ConvertTo)
-{
-    cv::cann::setDevice(0);
-
-    srand((unsigned int)(time(NULL)));
-    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
-
-    AclMat aclMat(2, 2, CV_8UC4, sc);
-    AclMat convertedAclMat;
-    aclMat.convertTo(convertedAclMat, CV_16S);
-    Mat mat(2, 2, CV_16SC4, sc);
-    Mat checker;
-    convertedAclMat.download(checker);
-
-    EXPECT_MAT_NEAR(mat, checker, 0.0);
-
-    cv::cann::resetDevice();
-}
-
-TEST(AclMat, ExpandTo)
-{
-    cv::cann::setDevice(0);
-
-    Scalar sc1(1);
-    Scalar sc2(1, 1, 1);
-    AclMat aclMat(10, 10, CV_8UC1, sc1);
-    Mat mat(10, 10, CV_8UC3, sc2);
-    AclMat expandedAclMat;
-    aclMat.expandTo(expandedAclMat, 3);
-    Mat checker;
-    expandedAclMat.download(checker);
-
-    EXPECT_MAT_NEAR(mat, checker, 0.0);
-
-    cv::cann::resetDevice();
-}
-
-TEST(AclStream, AsyncProcess)
-{
-    cv::cann::setDevice(0);
-
-    DummyAllocator dummyAllocator;
-    AclMat* mat = new AclMat(&dummyAllocator);
-    AclStream stream;
-
-    stream.addToAsyncRelease(*mat);
-    stream.waitForCompletion();
-
-    // TODO: need sync point to check:
-    // 1. mat->data is not freed after it add to async release list even mat is deleted.
-    // 2. mat->data is freed after callback is called.
-
-    cv::cann::resetDevice();
-}
-
-} // namespace
-} // namespace opencv_test
diff --git a/modules/cannarithm/test/test_element_operation.cpp b/modules/cannarithm/test/test_element_operation.cpp
deleted file mode 100644
index db20321d43f..00000000000
--- a/modules/cannarithm/test/test_element_operation.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-#include "test_precomp.hpp"
-#include "opencv2/ts/cuda_test.hpp"
-#include "opencv2/cann_arithm.hpp"
-
-namespace opencv_test
-{
-namespace
-{
-// Random Generator
-Mat randomMat(int w, int h, int dtype)
-{
-    Mat rnMat(w, h, dtype);
-    RNG rng;
-    rng.fill(rnMat, RNG::UNIFORM, 0.f, 1.f);
-    return rnMat;
-}
-cv::Scalar randomScalar()
-{
-    RNG rng;
-    return Scalar(rng, rng.next(), rng.next(), rng.next());
-}
-float randomNum()
-{
-    RNG rng;
-    float rdnNum = float(rng.uniform(0.3, 3.0));
-    return rdnNum;
-}
-Mat genMask()
-{
-    Mat mask = Mat::zeros(Size(10, 10), CV_8UC1);
-    rectangle(mask, cv::Rect(5, 5, 3, 3), Scalar(255), -1);
-    return mask;
-}
-
-#define DEVICE_ID 0
-
-/****************TEST CASE***************/
-// MAT & Mat
-#define TEST_MAT_OP_MAT(idx, op, ...)                        \
-    TEST(ELEMENTWISE_OP, MAT_##op##_MAT_##idx)               \
-    {                                                        \
-        cv::cann::setDevice(DEVICE_ID);                      \
-                                                             \
-        Mat cpuMat1 = randomMat(10, 10, CV_32SC3);           \
-        Mat cpuMat2 = randomMat(10, 10, CV_32SC3);           \
-        Mat cpuDst;                                          \
-        cv::op(cpuMat1, cpuMat2, cpuDst, __VA_ARGS__);       \
-                                                             \
-        AclMat mat1, mat2;                                   \
-        mat1.upload(cpuMat1);                                \
-        mat2.upload(cpuMat2);                                \
-        AclMat dst, dstS;                                    \
-        cv::cann::op(mat1, mat2, dst, __VA_ARGS__);          \
-        Mat npuDst, npuDstS;                                 \
-        dst.download(npuDst);                                \
-        AclStream stream;                                    \
-        cv::cann::op(mat1, mat2, dstS, __VA_ARGS__, stream); \
-        stream.waitForCompletion();                          \
-        dstS.download(npuDstS);                              \
-                                                             \
-        EXPECT_MAT_NEAR(npuDst, cpuDst, 0.0);                \
-        EXPECT_MAT_NEAR(npuDst, npuDstS, 0.0);               \
-        cv::cann::resetDevice();                             \
-    }
-
-TEST_MAT_OP_MAT(1, add, noArray(), -1);
-TEST_MAT_OP_MAT(1, subtract, noArray(), -1);
-TEST_MAT_OP_MAT(1, multiply, 1, -1);
-TEST_MAT_OP_MAT(1, divide, 1, -1);
-TEST_MAT_OP_MAT(1, bitwise_and, noArray());
-TEST_MAT_OP_MAT(1, bitwise_or, noArray());
-TEST_MAT_OP_MAT(1, bitwise_xor, noArray());
-
-TEST_MAT_OP_MAT(2, add, genMask(), CV_32SC3);
-TEST_MAT_OP_MAT(2, subtract, genMask(), CV_32SC3);
-TEST_MAT_OP_MAT(2, multiply, randomNum(), -1);
-TEST_MAT_OP_MAT(2, divide, randomNum(), -1);
-TEST_MAT_OP_MAT(2, bitwise_and, genMask());
-TEST_MAT_OP_MAT(2, bitwise_or, genMask());
-TEST_MAT_OP_MAT(2, bitwise_xor, genMask());
-
-// SCALAR & MAT
-#define TEST_MAT_OP_SCALAR(idx, op, ...)                           \
-    TEST(ELEMENTWISE_OP, MAT_##op##_SCALAR_##idx)                  \
-    {                                                              \
-        Scalar cpuS1 = randomScalar();                             \
-        Scalar cpuS2 = randomScalar();                             \
-        Mat cpuMatS1(10, 10, CV_32SC3, cpuS1);                     \
-        Mat cpuMatS2(10, 10, CV_32SC3, cpuS2);                     \
-        Mat cpuDst, cpuDstC;                                       \
-        cv::op(cpuMatS1, cpuMatS2, cpuDst, __VA_ARGS__);           \
-        cv::op(cpuMatS2, cpuMatS1, cpuDstC, __VA_ARGS__);          \
-        cv::cann::setDevice(DEVICE_ID);                            \
-                                                                   \
-        AclMat mat;                                                \
-        mat.upload(cpuMatS2);                                      \
-        AclMat dst, dstS, dstC, dstCS;                             \
-        cv::cann::op(cpuS1, cpuMatS2, dst, __VA_ARGS__);           \
-        cv::cann::op(cpuMatS2, cpuS1, dstC, __VA_ARGS__);          \
-        Mat npuDst, npuDstS, npuDstC, npuDstCS;                    \
-        dst.download(npuDst);                                      \
-        dstC.download(npuDstC);                                    \
-        AclStream stream;                                          \
-        cv::cann::op(cpuS1, cpuMatS2, dstS, __VA_ARGS__, stream);  \
-        cv::cann::op(cpuMatS2, cpuS1, dstCS, __VA_ARGS__, stream); \
-        stream.waitForCompletion();                                \
-        dstS.download(npuDstS);                                    \
-        dstCS.download(npuDstCS);                                  \
-                                                                   \
-        EXPECT_MAT_NEAR(npuDst, npuDstS, 0.0);                     \
-        EXPECT_MAT_NEAR(npuDst, cpuDst, 0.0);                      \
-        EXPECT_MAT_NEAR(npuDstC, npuDstCS, 0.0);                   \
-        EXPECT_MAT_NEAR(npuDstC, cpuDstC, 0.0);                    \
-                                                                   \
-        cv::cann::resetDevice();                                   \
-    }
-TEST_MAT_OP_SCALAR(1, add, noArray(), -1);
-TEST_MAT_OP_SCALAR(1, subtract, noArray(), -1);
-TEST_MAT_OP_SCALAR(1, multiply, 1, -1);
-TEST_MAT_OP_SCALAR(1, divide, 1, -1);
-TEST_MAT_OP_SCALAR(1, bitwise_and, noArray());
-TEST_MAT_OP_SCALAR(1, bitwise_or, noArray());
-TEST_MAT_OP_SCALAR(1, bitwise_xor, noArray());
-
-TEST_MAT_OP_SCALAR(2, add, genMask(), CV_32SC3);
-TEST_MAT_OP_SCALAR(2, subtract, genMask(), CV_32SC3);
-TEST_MAT_OP_SCALAR(2, bitwise_and, genMask());
-TEST_MAT_OP_SCALAR(2, bitwise_or, genMask());
-TEST_MAT_OP_SCALAR(2, bitwise_xor, genMask());
-TEST_MAT_OP_SCALAR(2, multiply, randomNum(), -1);
-TEST_MAT_OP_SCALAR(2, divide, randomNum(), -1);
-} // namespace
-} // namespace opencv_test
diff --git a/modules/cannarithm/CMakeLists.txt b/modules/cannops/CMakeLists.txt
similarity index 66%
rename from modules/cannarithm/CMakeLists.txt
rename to modules/cannops/CMakeLists.txt
index 55bcc028510..016168d9359 100644
--- a/modules/cannarithm/CMakeLists.txt
+++ b/modules/cannops/CMakeLists.txt
@@ -1,10 +1,10 @@
  if(IOS OR WINRT OR ANDROID OR APPLE OR WIN32 OR (NOT HAVE_CANN))
-   ocv_module_disable(cannarithm)
+   ocv_module_disable(cannops)
  endif()
 
 set(the_description "Ascend-accelerated Operations on Matrices")
 
-ocv_add_module(cannarithm opencv_core WRAP python)
+ocv_add_module(cannops opencv_core WRAP python)
 ocv_module_include_directories(${CANN_INCLUDE_DIRS})
 ocv_glob_module_sources()
 ocv_install_used_external_targets(${CANN_LIBRARIES})
@@ -12,5 +12,5 @@ ocv_create_module(${CANN_LIBRARIES})
 
 ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include)
 
-ocv_add_accuracy_tests(DEPENDS_ON opencv_cannarithm)
-ocv_add_perf_tests(DEPENDS_ON opencv_cannarithm)
+ocv_add_accuracy_tests(DEPENDS_ON opencv_cannops)
+ocv_add_perf_tests(DEPENDS_ON opencv_cannops)
diff --git a/modules/cannops/include/opencv2/cann.hpp b/modules/cannops/include/opencv2/cann.hpp
new file mode 100644
index 00000000000..3290e768d6e
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann.hpp
@@ -0,0 +1,324 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_HPP
+#define OPENCV_CANNOPS_CANN_HPP
+
+#include "opencv2/core.hpp"
+
+/**
+  @defgroup cann Ascend-accelerated Computer Vision
+  @{
+    @defgroup canncore Core part
+    @{
+      @defgroup cann_struct Data Structures
+      @defgroup cann_init Initialization and Information
+    @}
+  @}
+ */
+
+namespace cv
+{
+namespace cann
+{
+class AscendStream;
+
+//! @addtogroup cann_struct
+//! @{
+
+//===================================================================================
+// NpuMat
+//===================================================================================
+
+/** @brief Base storage class for NPU memory with reference counting.
+ * NpuMat class has a similar interface to Mat and GpuMat, and works on the [Ascend
+ * NPU](https://www.hiascend.com/) backend.
+ * @sa Mat cuda::GpuMat
+ */
+class AscendStream;
+class CV_EXPORTS_W NpuMat
+{
+public:
+    class CV_EXPORTS_W Allocator
+    {
+    public:
+        virtual ~Allocator() {}
+        // basic allocator
+        virtual std::shared_ptr<uchar> allocate(size_t size) = 0;
+        // allocator must fill data, step and refcount fields
+        virtual bool allocate(NpuMat* mat, int rows, int cols, size_t elemSize) = 0;
+    };
+
+    /**
+     * @brief Create default allocator for NpuMat. This allocator allocates device memory of the
+     * requested size.
+     */
+    CV_WRAP static NpuMat::Allocator* defaultAllocator();
+
+    /**
+     * @brief Set allocator for NpuMat.
+     * @param allocator
+     */
+    CV_WRAP static void setDefaultAllocator(NpuMat::Allocator* allocator);
+
+    //! default constructor
+    CV_WRAP explicit NpuMat(NpuMat::Allocator* allocator_ = NpuMat::defaultAllocator());
+
+    //! constructs NpuMat of the specified size and type
+    CV_WRAP NpuMat(int rows, int cols, int type,
+                   NpuMat::Allocator* allocator = NpuMat::defaultAllocator());
+    //! constructs NpuMat of the specified size and type
+    CV_WRAP NpuMat(Size size, int type, NpuMat::Allocator* allocator = NpuMat::defaultAllocator());
+
+    //! constructs NpuMat and fills it with the specified value s
+    CV_WRAP NpuMat(int rows, int cols, int type, Scalar& s,
+                   NpuMat::Allocator* allocator = NpuMat::defaultAllocator());
+    //! constructs NpuMat and fills it with the specified value s
+    CV_WRAP NpuMat(Size size, int type, Scalar& s,
+                   NpuMat::Allocator* allocator = NpuMat::defaultAllocator());
+
+    //! copy constructor
+    CV_WRAP NpuMat(const NpuMat& m);
+
+    //! constructs NpuMat by cropping a certain area from another
+    CV_WRAP NpuMat(InputArray _m, const Rect& roi);
+    CV_WRAP NpuMat(InputArray _m, const Rect& roi, AscendStream& stream);
+
+    //! builds NpuMat from host memory (Blocking call)
+    CV_WRAP explicit NpuMat(InputArray arr, AscendStream& stream,
+                            NpuMat::Allocator* allocator = NpuMat::defaultAllocator());
+
+    //! assignment operators
+    NpuMat& operator=(const NpuMat& m);
+
+    //! sets some of the NpuMat elements to s (Blocking call)
+    CV_WRAP NpuMat& setTo(const Scalar& s);
+    //! sets some of the NpuMat elements to s (Non-Blocking call)
+    CV_WRAP NpuMat& setTo(const Scalar& s, AscendStream& stream);
+
+    //! sets all of the NpuMat elements to float (Blocking call)
+    CV_WRAP NpuMat& setTo(float sc);
+
+    //! sets all of the NpuMat elements to float (Non-Blocking call)
+    CV_WRAP NpuMat& setTo(float sc, AscendStream& stream);
+
+    //! swaps with other smart pointer
+    CV_WRAP void swap(NpuMat& mat);
+
+    //! allocates new NpuMat data unless the NpuMat already has specified size and type
+    CV_WRAP void create(int rows, int cols, int type);
+
+    //! upload host memory data to NpuMat (Blocking call)
+    CV_WRAP void upload(InputArray arr);
+    //! upload host memory data to NpuMat (Non-Blocking call)
+    CV_WRAP void upload(InputArray arr, AscendStream& stream);
+
+    //! download data from NpuMat to host (Blocking call)
+    CV_WRAP void download(OutputArray dst) const;
+    //! download data from NpuMat to host (Non-Blocking call)
+    CV_WRAP void download(OutputArray dst, AscendStream& stream) const;
+
+    //! converts NpuMat to another datatype (Blocking call)
+    CV_WRAP void convertTo(CV_OUT NpuMat& dst, int rtype) const;
+
+    //! converts NpuMat to another datatype (Non-Blocking call)
+    CV_WRAP void convertTo(CV_OUT NpuMat& dst, int rtype, AscendStream& stream) const;
+
+    //! returns true iff the NpuMat data is continuous
+    //! (i.e. when there are no gaps between successive rows)
+    CV_WRAP bool isContinuous() const;
+
+    //! returns element size in bytes
+    CV_WRAP size_t elemSize() const;
+
+    //! returns the size of element channel in bytes
+    CV_WRAP size_t elemSize1() const;
+
+    //! returns element type
+    CV_WRAP int type() const;
+
+    //! returns element depth
+    CV_WRAP int depth() const;
+
+    //! returns number of channels
+    CV_WRAP int channels() const;
+
+    //! returns step/elemSize1()
+    CV_WRAP size_t step1() const;
+
+    //! returns NpuMat size : width == number of columns, height == number of rows
+    CV_WRAP Size size() const;
+
+    //! returns true if NpuMat data is NULL
+    CV_WRAP bool empty() const;
+
+    //! internal use method: updates the continuity flag
+    CV_WRAP void updateContinuityFlag();
+
+    /*! includes several bit-fields:
+     - the magic signature
+     - continuity flag
+     - depth
+     - number of channels
+     */
+    int flags;
+
+    //! the number of rows and columns
+    int rows, cols;
+
+    //! a distance between successive rows in bytes; includes the gap if any
+    CV_PROP size_t step;
+
+    //! pointer to the data
+    std::shared_ptr<uchar> data;
+
+    //! helper fields used in locateROI and adjustROI
+    uchar* datastart;
+    const uchar* dataend;
+
+    //! allocator
+    Allocator* allocator;
+};
+
+class AscendStream;
+class AscendStreamAccessor;
+class AscendEvent;
+class AscendEventAccessor;
+class DefaultDeviceInitializer;
+
+//===================================================================================
+// AscendStream
+//===================================================================================
+
+/** @brief In AscendCL Stream(AscendStream) is a task queue. Stream is used to manage the
+ * parallelism of tasks. The tasks inside a Stream are executed sequentially, that is, the Stream
+ * executes sequentially according to the sent tasks; the tasks in different Streams are executed in
+ * parallel.
+ *
+ * All non-blocking functions take a stream parameter. These functions return immediately after
+ * the task is submitted; the caller should wait on the stream until completion.
+ *
+ * Blocking functions implicitly use the default stream, and synchronize the stream before the
+ * function returns.
+ * @sa cuda::Stream
+ */
+
+// TODO: Stream is defined in namespace cuda, and the pybind code does not use a namespace for
+// Stream, so the class is named AscendStream to avoid a conflict.
+class CV_EXPORTS_W AscendStream
+{
+public:
+    CV_WRAP AscendStream();
+
+    //! blocks the current CPU thread until all operations in the stream are complete.
+    CV_WRAP void waitForCompletion();
+
+    //! blocks the current CPU thread until the event is triggered.
+    CV_WRAP void waitAscendEvent(const cv::cann::AscendEvent& event);
+
+    /**
+     * @brief return default AscendStream object for default Acl stream.
+     */
+    CV_WRAP static AscendStream& Null();
+
+    // acl symbols CANNOT be used in any hpp files. Use an inner class to avoid acl symbols being
+    // defined in hpp.
+    class Impl;
+
+    void addTensorHolder(const std::shared_ptr<uchar>& holder);
+
+private:
+    Ptr<Impl> impl_;
+    AscendStream(const Ptr<Impl>& impl);
+
+    friend class AscendStreamAccessor;
+    friend class DefaultDeviceInitializer;
+};
+
+/**
+ * @brief AscendEvent to synchronize between different streams.
+ */
+class CV_EXPORTS_W AscendEvent
+{
+public:
+    CV_WRAP AscendEvent();
+
+    //! records an event
+    CV_WRAP void record(AscendStream& stream);
+
+    //! waits for an event to complete
+    CV_WRAP void waitForComplete() const;
+
+    class Impl;
+
+private:
+    Ptr<Impl> impl_;
+    AscendEvent(const Ptr<Impl>& impl);
+
+    friend class AscendEventAccessor;
+};
+
+/** @brief Bindings overload to create a Stream object from the address stored in an existing CANN
+ * Runtime API stream pointer (aclrtStream).
+ * @param AscendStreamAddress Memory address stored in a CANN Runtime API stream pointer
+ * (aclrtStream). The created Stream object does not perform any allocation or deallocation and
+ * simply wraps existing raw CANN Runtime API stream pointer.
+ * @note Overload for generation of bindings only, not exported or intended for use internally from
+ * C++.
+ */
+CV_EXPORTS_W AscendStream wrapStream(size_t AscendStreamAddress);
+
+//! @} cann_struct
+
+//===================================================================================
+// Initialization & Info
+//===================================================================================
+
+//! @addtogroup cann_init
+//! @{
+
+//! Get Ascend matrix object from Input array, upload matrix memory if needed. (Non-Blocking call)
+NpuMat getInputMat(InputArray src, AscendStream& stream);
+
+//! Get Ascend matrix object from Output array, upload matrix memory if needed.
+NpuMat getOutputMat(OutputArray dst, int rows, int cols, int type, AscendStream& stream);
+
+//! Sync output matrix to Output array, download matrix memory if needed.
+void syncOutput(const NpuMat& dst, OutputArray _dst, AscendStream& stream);
+
+/**
+ * @brief Choose Ascend npu device.
+ */
+CV_EXPORTS_W void setDevice(int device);
+
+/**
+ * @brief Clear all context created in current Ascend device.
+ */
+CV_EXPORTS_W void resetDevice();
+
+/**
+ * @brief Get current Ascend device.
+ */
+CV_EXPORTS_W int32_t getDevice();
+
+/**
+ * @brief init AscendCL.
+ */
+CV_EXPORTS_W void initAcl();
+
+/**
+ * @brief finalize AscendCL.
+ * @note finalizeAcl can only be called once per process. Call this function after all AscendCL
+ * operations have finished.
+ */
+CV_EXPORTS_W void finalizeAcl();
+
+//! @} cann_init
+
+} // namespace cann
+} // namespace cv
+
+#include "opencv2/cann.inl.hpp"
+
+#endif // OPENCV_CANNOPS_CANN_HPP
diff --git a/modules/cannops/include/opencv2/cann.inl.hpp b/modules/cannops/include/opencv2/cann.inl.hpp
new file mode 100644
index 00000000000..8529a51655b
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann.inl.hpp
@@ -0,0 +1,95 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_INL_HPP
+#define OPENCV_CANNOPS_CANN_INL_HPP
+
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+inline NpuMat::NpuMat(NpuMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0),
+      allocator(allocator_)
+{
+}
+
+inline NpuMat::NpuMat(int rows_, int cols_, int type_, NpuMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0),
+      allocator(allocator_)
+{
+    if (rows_ > 0 && cols_ > 0)
+        create(rows_, cols_, type_);
+}
+
+inline NpuMat::NpuMat(Size size_, int type_, NpuMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0),
+      allocator(allocator_)
+{
+    if (size_.height > 0 && size_.width > 0)
+        create(size_.height, size_.width, type_);
+}
+
+inline NpuMat::NpuMat(InputArray arr, AscendStream& stream, NpuMat::Allocator* allocator_)
+    : flags(0), rows(0), cols(0), step(0), datastart(0), dataend(0),
+      allocator(allocator_)
+{
+    upload(arr, stream);
+}
+
+inline NpuMat::NpuMat(const NpuMat& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data),
+      datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
+{}
+
+inline NpuMat& NpuMat::operator=(const NpuMat& m)
+{
+    if (this != &m)
+    {
+        NpuMat temp(m);
+        swap(temp);
+    }
+
+    return *this;
+}
+
+inline void NpuMat::swap(NpuMat& b)
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(allocator, b.allocator);
+}
+
+inline bool NpuMat::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }
+
+inline size_t NpuMat::elemSize() const { return CV_ELEM_SIZE(flags); }
+
+inline size_t NpuMat::elemSize1() const { return CV_ELEM_SIZE1(flags); }
+
+inline int NpuMat::type() const { return CV_MAT_TYPE(flags); }
+
+inline int NpuMat::depth() const { return CV_MAT_DEPTH(flags); }
+
+inline int NpuMat::channels() const { return CV_MAT_CN(flags); }
+
+inline size_t NpuMat::step1() const { return step / elemSize1(); }
+
+inline Size NpuMat::size() const { return Size(cols, rows); }
+
+inline bool NpuMat::empty() const { return data == 0; }
+
+inline AscendStream::AscendStream(const Ptr<AscendStream::Impl>& impl) : impl_(impl) {}
+
+inline AscendEvent::AscendEvent(const Ptr<AscendEvent::Impl>& impl) : impl_(impl) {}
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNOPS_CANN_INL_HPP
diff --git a/modules/cannops/include/opencv2/cann_call.hpp b/modules/cannops/include/opencv2/cann_call.hpp
new file mode 100644
index 00000000000..e13e4c5c72a
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann_call.hpp
@@ -0,0 +1,134 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_CALL_HPP
+#define OPENCV_CANNOPS_CANN_CALL_HPP
+
+#include <vector>
+#include <set>
+#include <string>
+#include <acl/acl_base.h>
+#include "opencv2/cann.hpp"
+
+class aclopAttr;
+
+namespace cv
+{
+namespace cann
+{
+struct AclAttribute
+{
+    virtual ~AclAttribute() = default;
+    virtual void addAttr(aclopAttr* opAttr) = 0;
+};
+
+#define DEFINE_ATTR_DECLEAR(FUNC, TYPE)                                                      \
+    class Acl##FUNC##Attribute : public AclAttribute                                         \
+    {                                                                                        \
+        const char* name;                                                                    \
+        TYPE value;                                                                          \
+                                                                                             \
+    public:                                                                                  \
+        Acl##FUNC##Attribute(const char* _name, TYPE _value) : name(_name), value(_value){}; \
+        void addAttr(aclopAttr* opAttr) CV_OVERRIDE;                                         \
+    }
+
+#define DEFINE_ATTR_LIST_DECLEAR(FUNC, TYPE)                               \
+    class AclList##FUNC##Attribute : public AclAttribute                   \
+    {                                                                      \
+        const char* name;                                                  \
+        TYPE value;                                                        \
+        int num;                                                           \
+                                                                           \
+    public:                                                                \
+        AclList##FUNC##Attribute(const char* _name, int _num, TYPE _value) \
+            : name(_name), value(_value), num(_num){};                     \
+        void addAttr(aclopAttr* opAttr) CV_OVERRIDE;                       \
+    }
+
+DEFINE_ATTR_DECLEAR(Float, float);
+DEFINE_ATTR_DECLEAR(String, const char*);
+DEFINE_ATTR_DECLEAR(Int, int);
+DEFINE_ATTR_DECLEAR(Bool, bool);
+DEFINE_ATTR_LIST_DECLEAR(Int, int64_t*);
+
+#undef DEFINE_ATTR_DECLEAR
+#undef DEFINE_ATTR_LIST_DECLEAR
+
+class AscendStream::Impl
+{
+public:
+    aclrtStream stream;
+    bool ownStream;
+    std::set<std::shared_ptr<uchar>> tensorHolders;
+    Impl();
+    explicit Impl(aclrtStream stream);
+    void AddTensorHolder(const std::shared_ptr<uchar>& tensorData);
+};
+class AscendEvent::Impl
+{
+public:
+    aclrtEvent event;
+    bool ownEvent;
+
+    Impl();
+    explicit Impl(aclrtEvent event);
+    ~Impl();
+};
+struct AscendTensor
+{
+    std::string name;
+    std::shared_ptr<uchar> data;
+    size_t dataSize;
+    std::vector<int64_t> dims;
+    aclDataType dtype;
+    aclFormat format;
+    AscendTensor(){};
+    AscendTensor(std::shared_ptr<uchar> _data, size_t _dataSize, int64_t* _dims, size_t _dimSize,
+                 aclDataType _dtype, std::string _name = "", aclFormat _format = ACL_FORMAT_ND);
+    AscendTensor(std::shared_ptr<uchar> _data, size_t _dataSize, std::vector<int64_t>& _dims,
+                 aclDataType _dtype, std::string _name = "", aclFormat _format = ACL_FORMAT_ND)
+        : name(_name), data(_data), dataSize(_dataSize), dims(_dims), dtype(_dtype),
+          format(_format){};
+    AscendTensor(const NpuMat& npuMat, std::string _name = "", aclFormat format = ACL_FORMAT_ND);
+};
+void aclrtMallocWarpper(void** data, size_t size);
+void aclrtFreeWarpper(void* data);
+
+void aclrtMemcpyWarpper(std::shared_ptr<uchar>& dst, size_t offset, const void* src, size_t size,
+                        AscendStream& stream);
+void aclrtMemcpyWarpper(void* dst, const std::shared_ptr<uchar>& src, size_t offset, size_t size,
+                        AscendStream& stream);
+void aclrtMemcpyWarpper(std::shared_ptr<uchar>& dst, size_t dstOffset,
+                        const std::shared_ptr<uchar>& src, size_t srcOffset, size_t size,
+                        AscendStream& stream);
+void aclrtMemcpy2dWarpper(std::shared_ptr<uchar>& dst, size_t offset, size_t dpitch,
+                          const void* src, size_t spitch, size_t width, size_t length,
+                          AscendStream& stream);
+void aclrtMemcpy2dWarpper(void* dst, size_t dpitch, const std::shared_ptr<uchar>& src,
+                          size_t offset, size_t spitch, size_t width, size_t length,
+                          AscendStream& stream);
+void aclrtMemsetWarpper(std::shared_ptr<uchar>& ptr, int32_t value, size_t count,
+                        AscendStream& stream);
+
+static std::vector<AclAttribute*> emptyattr;
+void callAscendOperator(const char* op, std::vector<AscendTensor>& srcs,
+                        std::vector<AscendTensor>& dsts, AscendStream& stream,
+                        std::vector<AclAttribute*>& attrs = emptyattr);
+void callAscendOperator(const NpuMat& src, NpuMat& dst, const char* op, AscendStream& stream,
+                        std::vector<AclAttribute*>& attrs = emptyattr);
+void callAscendOperator(const NpuMat& src1, const NpuMat& src2, NpuMat& dst, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs = emptyattr);
+void callAscendOperator(const NpuMat* srcs, size_t srcCount, NpuMat& dst, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs = emptyattr);
+void callAscendOperator(const NpuMat& src, const Scalar& sc, bool inv, NpuMat& dst, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs = emptyattr);
+void callAscendOperator(const NpuMat& src, NpuMat* dsts, const size_t dstCount, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs = emptyattr);
+std::shared_ptr<uchar> mallocAndUpload(void* data, size_t size, AscendStream& stream,
+                                       NpuMat::Allocator* allocator = NpuMat::defaultAllocator());
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNOPS_CANN_CALL_HPP
diff --git a/modules/cannops/include/opencv2/cann_interface.hpp b/modules/cannops/include/opencv2/cann_interface.hpp
new file mode 100644
index 00000000000..5d3d75096f1
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann_interface.hpp
@@ -0,0 +1,333 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_INTERFACE_HPP
+#define OPENCV_CANNOPS_CANN_INTERFACE_HPP
+
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+
+/**
+  @addtogroup cann
+  @{
+    @defgroup cannops Operations for Ascend Backend.
+    @{
+        @defgroup cannops_elem Per-element Operations
+        @defgroup cannops_core Core Operations on Matrices
+        @defgroup cannimgproc Image Processing
+    @}
+  @}
+ */
+
+//! @addtogroup cannops_elem
+//! @{
+
+/** @brief Computes a matrix-matrix or matrix-scalar sum.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param dtype Optional depth of the output array (default -1).
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::add cv::cuda::add
+ */
+CV_EXPORTS_W void add(InputArray src1, InputArray src2, OutputArray dst,
+                      InputArray mask = noArray(), int dtype = -1,
+                      AscendStream& stream = AscendStream::Null());
+// The overloads below must be neither compiled nor analyzed by doxygen; they exist only for
+// Python binding code generation. add(InputArray, InputArray, ...) already accepts a Scalar as
+// its parameter through the conversion Scalar -> Mat -> InputArray.
+#ifdef NEVER_DEFINED
+CV_EXPORTS_W void add(InputArray src1, Scalar src2, OutputArray dst, InputArray mask = noArray(),
+                      int dtype = -1, AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void add(Scalar src1, InputArray src2, OutputArray dst, InputArray mask = noArray(),
+                      int dtype = -1, AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Computes a matrix-matrix or matrix-scalar difference.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param dtype Optional depth of the output array (default -1).
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::subtract cv::cuda::subtract
+ */
+CV_EXPORTS_W void subtract(InputArray src1, InputArray src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1,
+                           AscendStream& stream = AscendStream::Null());
+#ifdef NEVER_DEFINED // Scalar overloads, guarded by a macro named NEVER_DEFINED
+CV_EXPORTS_W void subtract(InputArray src1, Scalar src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1,
+                           AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void subtract(Scalar src1, InputArray src2, OutputArray dst,
+                           InputArray mask = noArray(), int dtype = -1,
+                           AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Computes a matrix-matrix or matrix-scalar per-element product.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param scale Optional scale factor (default 1).
+ * @param dtype Optional depth of the output array (default -1).
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::multiply cv::cuda::multiply
+ */
+CV_EXPORTS_W void multiply(InputArray src1, InputArray src2, OutputArray dst, float scale = 1,
+                           int dtype = -1, AscendStream& stream = AscendStream::Null());
+#ifdef NEVER_DEFINED // Scalar overloads, guarded by a macro named NEVER_DEFINED
+CV_EXPORTS_W void multiply(InputArray src1, Scalar src2, OutputArray dst, float scale = 1,
+                           int dtype = -1, AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void multiply(Scalar src1, InputArray src2, OutputArray dst, float scale = 1,
+                           int dtype = -1, AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Computes a matrix-matrix or matrix-scalar division.
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar. Matrix should have the same size and type as src1.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by dtype or src1 depth.
+ * @param scale Optional scale factor (default 1).
+ * @param dtype Optional depth of the output array (default -1).
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::divide cv::cuda::divide
+ */
+CV_EXPORTS_W void divide(InputArray src1, InputArray src2, OutputArray dst, float scale = 1,
+                         int dtype = -1, AscendStream& stream = AscendStream::Null());
+#ifdef NEVER_DEFINED // Scalar overloads, guarded by a macro named NEVER_DEFINED
+CV_EXPORTS_W void divide(InputArray src1, Scalar src2, OutputArray dst, float scale = 1, int dtype = -1,
+                         AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void divide(Scalar src1, InputArray src2, OutputArray dst, float scale = 1, int dtype = -1,
+                         AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by src1 depth (this function has no dtype parameter).
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::bitwise_and cv::cuda::bitwise_and
+ */
+CV_EXPORTS_W void bitwise_and(InputArray src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+#ifdef NEVER_DEFINED // Scalar overloads, guarded by a macro named NEVER_DEFINED
+CV_EXPORTS_W void bitwise_and(InputArray src1, Scalar src2, OutputArray dst,
+                              InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void bitwise_and(Scalar src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by src1 depth (this function has no dtype parameter).
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::bitwise_or cv::cuda::bitwise_or
+ */
+CV_EXPORTS_W void bitwise_or(InputArray src1, InputArray src2, OutputArray dst,
+                             InputArray mask = noArray(),
+                             AscendStream& stream = AscendStream::Null());
+#ifdef NEVER_DEFINED // Scalar overloads, guarded by a macro named NEVER_DEFINED
+CV_EXPORTS_W void bitwise_or(InputArray src1, Scalar src2, OutputArray dst,
+                             InputArray mask = noArray(),
+                             AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void bitwise_or(Scalar src1, InputArray src2, OutputArray dst,
+                             InputArray mask = noArray(),
+                             AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise exclusive or operation of two matrices (or of matrix and
+ * scalar).
+ * @param src1 First source matrix or scalar.
+ * @param src2 Second source matrix or scalar.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array(s). The depth is defined by src1 depth (this function has no dtype parameter).
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::bitwise_xor cv::cuda::bitwise_xor
+ */
+CV_EXPORTS_W void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+#ifdef NEVER_DEFINED // Scalar overloads, guarded by a macro named NEVER_DEFINED
+CV_EXPORTS_W void bitwise_xor(InputArray src1, Scalar src2, OutputArray dst,
+                              InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+CV_EXPORTS_W void bitwise_xor(Scalar src1, InputArray src2, OutputArray dst,
+                              InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+#endif
+
+/** @brief Performs a per-element bitwise inversion.
+ * @param src Source matrix.
+ * @param dst Destination matrix that has the same size and number of channels as the input
+ * array. The depth is defined by src depth (this function has no dtype parameter).
+ * @param mask Optional operation mask, 8-bit single channel array, that specifies elements of the
+ * destination array to be changed. The mask can be used only with single channel images.
+ * @param stream AscendStream for the asynchronous version.
+ * @sa cv::bitwise_not cv::cuda::bitwise_not
+ */
+CV_EXPORTS_W void bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(),
+                              AscendStream& stream = AscendStream::Null());
+
+/** @brief Computes the weighted sum of two arrays.
+
+@param src1 First source array.
+@param alpha Weight for the first array elements.
+@param src2 Second source array of the same size and channel number as src1.
+@param beta Weight for the second array elements.
+@param dst Destination array that has the same size and number of channels as the input arrays.
+@param gamma Scalar added to each sum.
+@param dtype Optional depth of the destination array. When both input arrays have the same depth,
+dtype can be set to -1, which will be equivalent to src1.depth().
+@param stream AscendStream for the asynchronous version.
+
+The function addWeighted calculates the weighted sum of two arrays as follows:
+
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)*
+\texttt{beta} +  \texttt{gamma} )\f]
+
+where I is a multi-dimensional index of array elements. In case of multi-channel arrays, each
+channel is processed independently.
+
+@sa cv::addWeighted cv::cuda::addWeighted
+ */
+CV_EXPORTS_W void addWeighted(InputArray src1, double alpha, InputArray src2, double beta,
+                              double gamma, OutputArray dst, int dtype = -1,
+                              AscendStream& stream = AscendStream::Null());
+
+/** @brief Applies a fixed-level threshold to each array element.
+
+@param src Source array (single-channel).
+@param dst Destination array with the same size and type as src.
+@param thresh Threshold value.
+@param maxval Maximum value to use with THRESH_BINARY and THRESH_BINARY_INV threshold types.
+@param type Threshold type. For details, see cv::threshold. The THRESH_MASK, THRESH_OTSU and
+THRESH_TRIANGLE threshold types are not supported.
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::threshold cv::cuda::threshold
+*/
+CV_EXPORTS_W double threshold(InputArray src, OutputArray dst, double thresh, double maxval,
+                              int type, AscendStream& stream = AscendStream::Null());
+
+//! @} cannops_elem
+
+//! @addtogroup cannops_core
+//! @{
+
+/** @brief Makes a multi-channel matrix out of several single-channel matrices.
+
+@param src Array/vector of source matrices.
+@param n Number of source matrices, i.e. the number of elements in the src array.
+@param dst Destination matrix.
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::merge cv::cuda::merge
+ */
+CV_EXPORTS void merge(const NpuMat* src, size_t n, OutputArray dst,
+                      AscendStream& stream = AscendStream::Null());
+/** @overload */
+CV_EXPORTS_W void merge(const std::vector<NpuMat>& src, OutputArray dst,
+                        AscendStream& stream = AscendStream::Null());
+
+/** @brief Copies each plane of a multi-channel matrix into an array.
+
+@param src Source matrix.
+@param dst Destination array/vector of single-channel matrices, one per plane of src.
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::split cv::cuda::split
+ */
+CV_EXPORTS void split(InputArray src, NpuMat* dst, AscendStream& stream = AscendStream::Null());
+/** @overload */
+CV_EXPORTS_W void split(InputArray src, CV_OUT std::vector<NpuMat>& dst,
+                        AscendStream& stream = AscendStream::Null());
+
+/** @brief Transposes a matrix.
+
+@param src Source matrix.
+@param dst Destination matrix (the transposed src).
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::transpose cv::cuda::transpose
+ */
+CV_EXPORTS_W void transpose(InputArray src, OutputArray dst,
+                            AscendStream& stream = AscendStream::Null());
+
+/** @brief Flips a 2D matrix around vertical, horizontal, or both axes.
+
+@param src Source matrix.
+@param dst Destination matrix.
+@param flipCode Flip mode for the source:
+-   0 Flips around the x-axis.
+-   \> 0 Flips around the y-axis.
+-   \< 0 Flips around both axes.
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::flip cv::cuda::flip
+ */
+CV_EXPORTS_W void flip(InputArray src, OutputArray dst, int flipCode,
+                       AscendStream& stream = AscendStream::Null());
+
+/** @brief Rotates a 2D array in multiples of 90 degrees.
+The function cv::cann::rotate rotates the array in one of three different ways:
+*   Rotate by 90 degrees clockwise (rotateCode = ROTATE_90_CLOCKWISE).
+*   Rotate by 180 degrees clockwise (rotateCode = ROTATE_180).
+*   Rotate by 270 degrees clockwise (rotateCode = ROTATE_90_COUNTERCLOCKWISE).
+@param src input array.
+@param dst output array of the same type as src. The size is the same with ROTATE_180,
+and the rows and cols are switched for ROTATE_90_CLOCKWISE and ROTATE_90_COUNTERCLOCKWISE.
+@param rotateCode an enum to specify how to rotate the array; see the enum #RotateFlags
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::rotate
+*/
+CV_EXPORTS_W void rotate(InputArray src, OutputArray dst, int rotateCode,
+                         AscendStream& stream = AscendStream::Null());
+//! @} cannops_core
+
+//! @addtogroup cannimgproc
+//! @{
+
+/** @brief Converts an image from one color space to another.
+
+@param src Source image with CV_8U, CV_16U, or CV_32F depth and 1, 3, or 4 channels.
+@param dst Destination image.
+@param code Color space conversion code. For details, see cv::cvtColor.
+@param dstCn Number of channels in the destination image. If the parameter is 0, the number of the
+channels is derived automatically from src and the code.
+@param stream AscendStream for the asynchronous version.
+
+@sa cv::cvtColor cv::cuda::cvtColor
+ */
+CV_EXPORTS_W void cvtColor(InputArray src, OutputArray dst, int code, int dstCn = 0,
+                           AscendStream& stream = AscendStream::Null());
+
+//! @} cannimgproc
+
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNOPS_CANN_INTERFACE_HPP
diff --git a/modules/cannops/include/opencv2/cann_private.hpp b/modules/cannops/include/opencv2/cann_private.hpp
new file mode 100644
index 00000000000..9609b0d883d
--- /dev/null
+++ b/modules/cannops/include/opencv2/cann_private.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_CANN_PRIVATE_HPP
+#define OPENCV_CANNOPS_CANN_PRIVATE_HPP
+#include "opencv2/cann.hpp"
+// NpuMat-level declarations; none of them carries CV_EXPORTS, so they are not exported symbols.
+namespace cv
+{
+namespace cann
+{
+void adds(const NpuMat& arr, float scalar, NpuMat& dst, AscendStream& stream);
+void muls(const NpuMat& arr, float scalar, NpuMat& dst, AscendStream& stream);
+void transData(const NpuMat& src, NpuMat& dst, const char* from, const char* to,
+               AscendStream& stream);
+void transpose(const NpuMat& src, int64_t* perm, NpuMat& dst, AscendStream& stream);
+void flip(const NpuMat& src, std::vector<int32_t>& asixs, NpuMat& dst, AscendStream& stream);
+void merge(const NpuMat* src, size_t n, NpuMat& dst, AscendStream& stream);
+void split(const NpuMat& src, NpuMat* dst, AscendStream& stream);
+
+double threshold(NpuMat& src, NpuMat& dst, double thresh, double maxval, int type,
+                 AscendStream& stream);
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNOPS_CANN_PRIVATE_HPP
diff --git a/modules/cannops/include/opencv2/stream_accessor.hpp b/modules/cannops/include/opencv2/stream_accessor.hpp
new file mode 100644
index 00000000000..ff64d7dcbc0
--- /dev/null
+++ b/modules/cannops/include/opencv2/stream_accessor.hpp
@@ -0,0 +1,39 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_STREAM_ACCESSOR_HPP
+#define OPENCV_CANNOPS_STREAM_ACCESSOR_HPP
+
+#include <acl/acl_base.h>
+#include "opencv2/cann.hpp"
+
+namespace cv
+{
+namespace cann
+{
+//! @addtogroup cann_struct
+//! @{
+
+/** @brief Class that converts between cann::AscendStream and the raw aclrtStream handle.
+ */
+struct AscendStreamAccessor
+{
+    CV_EXPORTS static aclrtStream getStream(const AscendStream& stream);
+    CV_EXPORTS static AscendStream wrapStream(aclrtStream stream);
+};
+
+/** @brief Class that converts between cann::AscendEvent and the raw aclrtEvent handle.
+ */
+struct AscendEventAccessor
+{
+    CV_EXPORTS static aclrtEvent getEvent(const AscendEvent& event);
+    CV_EXPORTS static AscendEvent wrapEvent(aclrtEvent event);
+};
+
+//! @} cann_struct
+
+} // namespace cann
+} // namespace cv
+
+#endif // OPENCV_CANNOPS_STREAM_ACCESSOR_HPP
diff --git a/modules/cannops/misc/python/pyopencv_cann.hpp b/modules/cannops/misc/python/pyopencv_cann.hpp
new file mode 100644
index 00000000000..ed9a5cd08fc
--- /dev/null
+++ b/modules/cannops/misc/python/pyopencv_cann.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CANNOPS_PYOPENCV_CANN_HPP
+#define OPENCV_CANNOPS_PYOPENCV_CANN_HPP
+
+#ifdef HAVE_OPENCV_CORE
+
+#include "opencv2/cann.hpp"
+
+typedef std::vector<cann::NpuMat> vector_NpuMat;
+typedef cann::NpuMat::Allocator NpuMat_Allocator;
+// Binding helper macros for the CANN types (presumably Python <-> C++ converters; confirm).
+CV_PY_TO_CLASS(cann::NpuMat);
+CV_PY_TO_CLASS(cann::AscendStream);
+
+CV_PY_TO_CLASS_PTR(cann::NpuMat);
+CV_PY_TO_CLASS_PTR(cann::NpuMat::Allocator);
+
+CV_PY_FROM_CLASS(cann::NpuMat);
+CV_PY_FROM_CLASS(cann::AscendStream);
+
+CV_PY_FROM_CLASS_PTR(cann::NpuMat::Allocator);
+
+#endif // HAVE_OPENCV_CORE
+
+#endif // OPENCV_CANNOPS_PYOPENCV_CANN_HPP
diff --git a/modules/cannops/misc/python/test/test_cannops.py b/modules/cannops/misc/python/test/test_cannops.py
new file mode 100644
index 00000000000..fc69ba3e166
--- /dev/null
+++ b/modules/cannops/misc/python/test/test_cannops.py
@@ -0,0 +1,154 @@
+# This file is part of OpenCV project.
+# It is subject to the license terms in the LICENSE file found in the top-level directory
+# of this distribution and at http://opencv.org/license.html.
+
+from tests_common import NewOpenCVTests
+import cv2 as cv
+import numpy as np
+
+
+def genMask(mask, listx, listy):
+    """Set to 1 (in place) every cell whose row and column are both in listx, or both in
+    listy, then return the mask converted to uint8."""
+    for row, col in np.ndindex(mask.shape[0], mask.shape[1]):
+        if any(row in idx and col in idx for idx in (listx, listy)):
+            mask[row][col] = 1
+    return mask.astype(np.uint8)
+
+
+# Shared 5x5 uint8 mask passed as mask=mask by the tests below.
+listx = [0, 1]
+listy = [1, 2]
+mask = genMask(np.zeros((5, 5)), listx, listy)
+
+
+class cannop_test(NewOpenCVTests):
+    """Tests for the cv.cann Python bindings; most compare results with the cv.* functions."""
+    def test_ascend(self):
+        """Smoke-test the ACL init, device and stream management bindings."""
+        cv.cann.initAcl()
+        cv.cann.getDevice()
+        cv.cann.setDevice(0)
+        stream = cv.cann.AscendStream_Null()
+        # NOTE(review): id(stream) is the Python object's identity, not an ACL stream handle;
+        # this only shows that the binding is callable -- confirm that is the intent.
+        cv.cann.wrapStream(id(stream))
+        cv.cann.resetDevice()
+
+    def test_arithmetic(self):
+        """Compare add/subtract/multiply/divide/addWeighted with the cv.* results."""
+        # NOTE(review): random() yields values in [0, 1), so astype(int) truncates every
+        # element to 0: both operands are all-zero and divide() divides by zero. Kept as is
+        # because CANN/CPU rounding parity for non-zero integer division is unverified here.
+        npMat1 = np.random.random((5, 5, 3)).astype(int)
+        npMat2 = np.random.random((5, 5, 3)).astype(int)
+        cv.cann.setDevice(0)
+
+        self.assertTrue(np.allclose(cv.cann.add(
+            npMat1, npMat2), cv.add(npMat1, npMat2)))
+        self.assertTrue(np.allclose(cv.cann.subtract(
+            npMat1, npMat2), cv.subtract(npMat1, npMat2)))
+        self.assertTrue(np.allclose(cv.cann.multiply(
+            npMat1, npMat2, scale=2), cv.multiply(npMat1, npMat2, scale=2)))
+        self.assertTrue(np.allclose(cv.cann.divide(
+            npMat1, npMat2, scale=2), cv.divide(npMat1, npMat2, scale=2)))
+        self.assertTrue(np.allclose(cv.cann.addWeighted(npMat1, 2, npMat2, 4, 3),
+                                    cv.addWeighted(npMat1, 2, npMat2, 4, 3)))
+
+        # mask
+        self.assertTrue(np.allclose(cv.cann.add(
+            npMat1, npMat2, mask=mask), cv.add(npMat1, npMat2, mask=mask)))
+        self.assertTrue(np.allclose(cv.cann.subtract(
+            npMat1, npMat2, mask=mask), cv.subtract(npMat1, npMat2, mask=mask)))
+
+        # stream
+        stream = cv.cann.AscendStream()
+        matDst = cv.cann.add(npMat1, npMat2, stream=stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.allclose(matDst, cv.add(npMat1, npMat2)))
+        matDst = cv.cann.add(npMat1, npMat2, mask=mask, stream=stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.allclose(matDst, cv.add(npMat1, npMat2, mask=mask)))
+        matDst = cv.cann.subtract(npMat1, npMat2, mask=mask, stream=stream)
+        stream.waitForCompletion()
+        self.assertTrue(np.allclose(
+            matDst, cv.subtract(npMat1, npMat2, mask=mask)))
+
+        cv.cann.resetDevice()
+
+    def test_logical(self):
+        """Compare the bitwise operations, without and then with a mask, with cv.* results."""
+        # Scale before casting (as test_imgproc does): random() alone lies in [0, 1) and would
+        # truncate to all-zero uint16 matrices, making the and/or/xor comparisons vacuous.
+        npMat1 = (np.random.random((5, 5, 3)) * 65535).astype(np.uint16)
+        npMat2 = (np.random.random((5, 5, 3)) * 65535).astype(np.uint16)
+        cv.cann.setDevice(0)
+
+        self.assertTrue(np.allclose(cv.cann.bitwise_or(npMat1, npMat2),
+                                    cv.bitwise_or(npMat1, npMat2)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_and(npMat1, npMat2),
+                                    cv.bitwise_and(npMat1, npMat2)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_xor(npMat1, npMat2),
+                                    cv.bitwise_xor(npMat1, npMat2)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_not(npMat1),
+                                    cv.bitwise_not(npMat1)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_and(npMat1, npMat2, mask=mask),
+                                    cv.bitwise_and(npMat1, npMat2, mask=mask)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_or(npMat1, npMat2, mask=mask),
+                                    cv.bitwise_or(npMat1, npMat2, mask=mask)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_not(npMat1, mask=mask),
+                                    cv.bitwise_not(npMat1, mask=mask)))
+        self.assertTrue(np.allclose(cv.cann.bitwise_xor(npMat1, npMat2, mask=mask),
+                                    cv.bitwise_xor(npMat1, npMat2, mask=mask)))
+        cv.cann.resetDevice()
+
+    def test_imgproc(self):
+        """Compare merge/split, transpose, flip, rotate, cvtColor and threshold with cv.*."""
+        npMat = (np.random.random((128, 128, 3)) * 255).astype(np.uint8)
+        cv.cann.setDevice(0)
+
+        self.assertTrue(np.allclose(
+            cv.cann.merge(cv.cann.split(npMat)), npMat))
+
+        self.assertTrue(np.allclose(
+            cv.cann.transpose(npMat), cv.transpose(npMat)))
+
+        for fMode in [0, 1, -1]:
+            self.assertTrue(np.allclose(cv.cann.flip(
+                npMat, fMode), cv.flip(npMat, fMode)))
+
+        for rMode in [0, 1, 2]:
+            self.assertTrue(np.allclose(cv.cann.rotate(
+                npMat, rMode), cv.rotate(npMat, rMode)))
+
+        cvtModeC1 = [cv.COLOR_GRAY2BGR, cv.COLOR_GRAY2BGRA]
+        cvtModeC3 = [cv.COLOR_BGR2GRAY, cv.COLOR_BGRA2BGR, cv.COLOR_BGR2RGBA, cv.COLOR_RGBA2BGR,
+                     cv.COLOR_BGR2RGB, cv.COLOR_BGRA2RGBA, cv.COLOR_RGB2GRAY, cv.COLOR_BGRA2GRAY,
+                     cv.COLOR_RGBA2GRAY, cv.COLOR_BGR2BGRA, cv.COLOR_BGR2YUV, cv.COLOR_RGB2YUV,
+                     cv.COLOR_YUV2BGR, cv.COLOR_YUV2RGB, cv.COLOR_BGR2YCrCb, cv.COLOR_RGB2YCrCb,
+                     cv.COLOR_YCrCb2BGR, cv.COLOR_YCrCb2RGB, cv.COLOR_BGR2XYZ, cv.COLOR_RGB2XYZ,
+                     cv.COLOR_XYZ2BGR, cv.COLOR_XYZ2RGB,]
+        # NOTE(review): the third positional argument of np.allclose is rtol, so rtol=1 accepts
+        # up to 100% relative error; atol=1 may have been intended -- confirm before tightening.
+        for cvtM in cvtModeC3:
+            self.assertTrue(np.allclose(cv.cann.cvtColor(
+                npMat, cvtM), cv.cvtColor(npMat, cvtM), rtol=1))
+        npMatC1 = (np.random.random((128, 128, 1)) * 255).astype(np.uint8)
+        for cvtM in cvtModeC1:
+            self.assertTrue(np.allclose(cv.cann.cvtColor(
+                npMatC1, cvtM), cv.cvtColor(npMatC1, cvtM), rtol=1))
+
+        threshType = [cv.THRESH_BINARY, cv.THRESH_BINARY_INV,
+                      cv.THRESH_TRUNC, cv.THRESH_TOZERO, cv.THRESH_TOZERO_INV]
+        for tType in threshType:
+            cvRet, cvThresh = cv.threshold(
+                npMat.astype(np.uint8), 127, 255, tType)
+            cannRet, cannThresh = cv.cann.threshold(
+                npMat.astype(np.float32), 127, 255, tType)
+            self.assertTrue(np.allclose(cvThresh, cannThresh))
+            self.assertTrue(np.allclose(cvRet, cannRet))
+        cv.cann.resetDevice()
+
+
+if __name__ == '__main__':  # executed as a script: hand over to NewOpenCVTests.bootstrap()
+    NewOpenCVTests.bootstrap()
diff --git a/modules/cannops/perf/perf_core.cpp b/modules/cannops/perf/perf_core.cpp
new file mode 100644
index 00000000000..0280da36a22
--- /dev/null
+++ b/modules/cannops/perf/perf_core.cpp
@@ -0,0 +1,150 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include "opencv2/cann_interface.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+#define TYPICAL_NPU_MAT_SIZES \
+    Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p)
+#define DEF_PARAM_TEST(name, ...) \
+    typedef ::perf::TestBaseWithParam<testing::tuple<__VA_ARGS__>> name
+// Fixture typedefs for the NPU tests and the CPU tests; both take a single Size parameter.
+DEF_PARAM_TEST(NPU, Size);
+DEF_PARAM_TEST(CPU, Size);
+
+// Runs cv::cann::merge() over three NpuMat planes uploaded from the same 8UC1 host Mat.
+PERF_TEST_P(NPU, MERGE, TYPICAL_NPU_MAT_SIZES)
+{
+    const Size frameSize = GET_PARAM(0);
+    Mat plane(frameSize, CV_8UC1), merged;
+    declare.in(plane, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    NpuMat planes[3];
+    for (NpuMat& devPlane : planes)
+        devPlane.upload(plane);
+
+    TEST_CYCLE() { cv::cann::merge(planes, 3, merged); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+// CPU counterpart of the NPU MERGE test: cv::merge() over three copies of one 8UC1 Mat.
+PERF_TEST_P(CPU, MERGE, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat plane(GET_PARAM(0), CV_8UC1), merged;
+    declare.in(plane, WARMUP_RNG);
+    Mat planes[3] = {plane, plane, plane};
+    TEST_CYCLE() { cv::merge(planes, 3, merged); }
+    SANITY_CHECK_NOTHING();
+}
+
+// Runs cv::cann::split() from an 8UC3 host Mat into an array of three NpuMat.
+PERF_TEST_P(NPU, SPLIT, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3);
+    declare.in(image, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    NpuMat planes[3];
+    TEST_CYCLE() { cv::cann::split(image, planes); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, SPLIT, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3);
+    declare.in(image, WARMUP_RNG);
+    Mat planes[3] = {image, image, image}; // destinations start out as copies of the source Mat
+    TEST_CYCLE() { cv::split(image, planes); }
+    SANITY_CHECK_NOTHING();
+}
+
+// Runs cv::cann::transpose() on an 8UC3 host Mat.
+PERF_TEST_P(NPU, TRANSPOSE, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3), transposed;
+    declare.in(image, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::transpose(image, transposed); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+// CPU counterpart of the NPU TRANSPOSE test, using cv::transpose().
+PERF_TEST_P(CPU, TRANSPOSE, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3), transposed;
+    declare.in(image, WARMUP_RNG);
+    TEST_CYCLE() { cv::transpose(image, transposed); }
+    SANITY_CHECK_NOTHING();
+}
+
+// Runs cv::cann::flip() with flipCode -1 (both axes) on an 8UC3 host Mat.
+PERF_TEST_P(NPU, FLIP, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3), flipped;
+    declare.in(image, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::flip(image, flipped, -1); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+// CPU counterpart of the NPU FLIP test: cv::flip() with flipCode -1.
+PERF_TEST_P(CPU, FLIP, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3), flipped;
+    declare.in(image, WARMUP_RNG);
+    TEST_CYCLE() { cv::flip(image, flipped, -1); }
+    SANITY_CHECK_NOTHING();
+}
+
+// Runs cv::cann::rotate() with rotateCode 1 on an 8UC3 host Mat.
+PERF_TEST_P(NPU, ROTATE, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3), rotated;
+    declare.in(image, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::rotate(image, rotated, 1); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+// CPU counterpart of the NPU ROTATE test: cv::rotate() with rotateCode 1.
+PERF_TEST_P(CPU, ROTATE, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat image(GET_PARAM(0), CV_8UC3), rotated;
+    declare.in(image, WARMUP_RNG);
+    TEST_CYCLE() { cv::rotate(image, rotated, 1); }
+    SANITY_CHECK_NOTHING();
+}
+
+// Runs the NpuMat(mat, rect) ROI constructor in the test cycle; no destination Mat is needed.
+PERF_TEST_P(NPU, CROP, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat mat(GET_PARAM(0), CV_8UC3);
+    declare.in(mat, WARMUP_RNG);
+    Rect b(1, 2, 4, 4);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { NpuMat cropped_cann(mat, b); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+// Runs the Mat(mat, rect) ROI constructor in the test cycle; no destination Mat is needed.
+PERF_TEST_P(CPU, CROP, TYPICAL_NPU_MAT_SIZES)
+{
+    Mat mat(GET_PARAM(0), CV_8UC3);
+    declare.in(mat, WARMUP_RNG);
+    Rect b(1, 2, 4, 4);
+    TEST_CYCLE() { Mat cropped_cv(mat, b); }
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannops/perf/perf_cvtcolor.cpp b/modules/cannops/perf/perf_cvtcolor.cpp
new file mode 100644
index 00000000000..dc10d71aa66
--- /dev/null
+++ b/modules/cannops/perf/perf_cvtcolor.cpp
@@ -0,0 +1,69 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include "opencv2/cann_interface.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+#define CVT_COLORS_3                                                                         \
+    Values(COLOR_BGR2BGRA, COLOR_BGRA2BGR, COLOR_BGR2RGBA, COLOR_RGBA2BGR, COLOR_BGR2RGB,    \
+           COLOR_BGRA2RGBA, COLOR_BGR2GRAY, COLOR_BGRA2GRAY, COLOR_RGBA2GRAY, COLOR_BGR2XYZ, \
+           COLOR_RGB2XYZ, COLOR_XYZ2BGR, COLOR_XYZ2RGB, COLOR_BGR2YCrCb, COLOR_RGB2YCrCb,    \
+           COLOR_YCrCb2BGR, COLOR_YCrCb2RGB, COLOR_BGR2YUV, COLOR_RGB2YUV, COLOR_YUV2BGR,    \
+           COLOR_YUV2RGB)
+#define CVT_COLORS_1 Values(COLOR_GRAY2BGR, COLOR_GRAY2BGRA)
+#define TYPICAL_NPU_MAT_SIZES \
+    Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p)
+#define DEF_PARAM_TEST(name, ...) \
+    typedef ::perf::TestBaseWithParam<testing::tuple<__VA_ARGS__>> name
+
+DEF_PARAM_TEST(NPU, Size, ColorConversionCodes);
+DEF_PARAM_TEST(CPU, Size, ColorConversionCodes);
+
+PERF_TEST_P(NPU, CVT_COLOR_3, testing::Combine(TYPICAL_NPU_MAT_SIZES, CVT_COLORS_3))
+{
+    Mat mat(GET_PARAM(0), CV_32FC3);
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::cvtColor(mat, dst, GET_PARAM(1)); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, CVT_COLOR_3, testing::Combine(TYPICAL_NPU_MAT_SIZES, CVT_COLORS_3))
+{
+    Mat mat(GET_PARAM(0), CV_32FC3);
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    TEST_CYCLE() { cv::cvtColor(mat, dst, GET_PARAM(1)); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, CVT_COLOR_1, testing::Combine(TYPICAL_NPU_MAT_SIZES, CVT_COLORS_1))
+{
+    Mat mat(GET_PARAM(0), CV_32FC1);
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::cvtColor(mat, dst, GET_PARAM(1)); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, CVT_COLOR_1, testing::Combine(TYPICAL_NPU_MAT_SIZES, CVT_COLORS_1))
+{
+    Mat mat(GET_PARAM(0), CV_32FC1);
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    TEST_CYCLE() { cv::cvtColor(mat, dst, GET_PARAM(1)); }
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannops/perf/perf_element_operations.cpp b/modules/cannops/perf/perf_element_operations.cpp
new file mode 100644
index 00000000000..d7d5947d661
--- /dev/null
+++ b/modules/cannops/perf/perf_element_operations.cpp
@@ -0,0 +1,211 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+#include "opencv2/cann_interface.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+#define ARITHM_MAT_DEPTH Values(CV_32S, CV_32SC3)
+#define TYPICAL_NPU_MAT_SIZES \
+    Values(::perf::sz1080p, ::perf::sz2K, ::perf::sz2160p, ::perf::sz4320p)
+#define DEF_PARAM_TEST(name, ...) \
+    typedef ::perf::TestBaseWithParam<testing::tuple<__VA_ARGS__>> name
+
+DEF_PARAM_TEST(NPU, Size, int);
+DEF_PARAM_TEST(CPU, Size, int);
+
+PERF_TEST_P(NPU, MAT_ADD_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::add(mat1, mat2, dst, noArray(), -1); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_ADD_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::add(mat1, mat2, dst, noArray(), -1); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_SUB_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::subtract(mat1, mat2, dst, noArray(), -1); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_SUB_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::subtract(mat1, mat2, dst, noArray(), -1); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_MUL_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::multiply(mat1, mat2, dst, 1, -1); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_MUL_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::multiply(mat1, mat2, dst, 1, -1); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_DIV_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::divide(mat1, mat2, dst, 1, -1); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_DIV_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::divide(mat1, mat2, dst, 1, -1); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_BITWISE_AND_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::bitwise_and(mat1, mat2, dst, noArray()); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_BITWISE_AND_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::bitwise_and(mat1, mat2, dst, noArray()); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_BITWISE_OR_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::bitwise_or(mat1, mat2, dst, noArray()); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_BITWISE_OR_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::bitwise_or(mat1, mat2, dst, noArray()); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_BITWISE_XOR_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::bitwise_xor(mat1, mat2, dst, noArray()); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_BITWISE_XOR_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat1(GET_PARAM(0), GET_PARAM(1));
+    Mat mat2(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat1, WARMUP_RNG);
+    declare.in(mat2, WARMUP_RNG);
+    TEST_CYCLE() { cv::bitwise_xor(mat1, mat2, dst, noArray()); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(NPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    cv::cann::setDevice(DEVICE_ID);
+    TEST_CYCLE() { cv::cann::bitwise_not(mat, dst, noArray()); }
+    cv::cann::resetDevice();
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_NPU_MAT_SIZES, ARITHM_MAT_DEPTH))
+{
+    Mat mat(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    TEST_CYCLE() { cv::bitwise_not(mat, dst, noArray()); }
+    SANITY_CHECK_NOTHING();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannarithm/perf/perf_main.cpp b/modules/cannops/perf/perf_main.cpp
similarity index 51%
rename from modules/cannarithm/perf/perf_main.cpp
rename to modules/cannops/perf/perf_main.cpp
index 13cde8f491e..33503ac4158 100644
--- a/modules/cannarithm/perf/perf_main.cpp
+++ b/modules/cannops/perf/perf_main.cpp
@@ -3,27 +3,14 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "perf_precomp.hpp"
-#include "opencv2/cann_arithm.hpp"
+#include "opencv2/cann_interface.hpp"
 using namespace perf;
 
 class CannEnvironment : public ::testing::Environment
 {
 public:
     virtual ~CannEnvironment() = default;
-    virtual void SetUp() CV_OVERRIDE {
-        cv::cann::initAcl();
-
-        // for device warmup
-        Scalar s1(1,2,3), s2(4,5,6);
-        Mat src1(10, 10, CV_32SC3, s1), src2(10, 10, CV_32SC3, s2);
-        cv::cann::setDevice(0);
-
-        cv::cann::AclMat npu_src1, npu_src2, dst;
-        npu_src1.upload(src1);
-        npu_src2.upload(src2);
-        cv::cann::add(npu_src1, npu_src2, dst);
-        cv::cann::resetDevice();
-        }
+    virtual void SetUp() CV_OVERRIDE { cv::cann::initAcl(); }
     virtual void TearDown() CV_OVERRIDE { cv::cann::finalizeAcl(); }
 };
 
@@ -33,4 +20,4 @@ static void initTests()
     ::testing::AddGlobalTestEnvironment(cannEnv);
 }
 
-CV_PERF_TEST_MAIN("cannarithm", initTests())
+CV_PERF_TEST_MAIN("cannops", initTests())
diff --git a/modules/cannarithm/perf/perf_precomp.hpp b/modules/cannops/perf/perf_precomp.hpp
similarity index 89%
rename from modules/cannarithm/perf/perf_precomp.hpp
rename to modules/cannops/perf/perf_precomp.hpp
index d0ff9533235..59e2fa03d7b 100644
--- a/modules/cannarithm/perf/perf_precomp.hpp
+++ b/modules/cannops/perf/perf_precomp.hpp
@@ -9,12 +9,11 @@
 #include "opencv2/ts/ts_perf.hpp"
 #include "opencv2/cann.hpp"
 
-namespace opencv_test
-{
+#define DEVICE_ID 0
+
 using namespace perf;
 using namespace testing;
 using namespace cv;
 using namespace cv::cann;
-} // namespace opencv_test
 
 #endif
diff --git a/modules/cannarithm/samples/sample.cpp b/modules/cannops/samples/sample.cpp
similarity index 55%
rename from modules/cannarithm/samples/sample.cpp
rename to modules/cannops/samples/sample.cpp
index 772ca96f54f..ddf9d45adac 100644
--- a/modules/cannarithm/samples/sample.cpp
+++ b/modules/cannops/samples/sample.cpp
@@ -2,12 +2,13 @@
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 
-//g++ -o sample sample.cpp -I opencv/include/opencv4/ -L opencv/build/install/lib/ -l opencv_cannarithm -l opencv_core -l opencv_imgcodecs
+// g++ -o sample sample.cpp -I opencv/include/opencv4/ -L opencv/build/install/lib/ -l
+// opencv_cannops -l opencv_core -l opencv_imgcodecs
 
 #include <iostream>
 #include <opencv2/imgcodecs.hpp>
 #include <opencv2/cann.hpp>
-#include <opencv2/cann_arithm.hpp>
+#include <opencv2/cann_interface.hpp>
 
 int main()
 {
@@ -16,14 +17,14 @@ int main()
     cv::cann::initAcl();
     cv::cann::setDevice(0);
 
-    cv::cann::AclMat aclMat = cv::cann::AclMat();
-    aclMat.upload(img);
+    cv::cann::NpuMat npuMat;
+    npuMat.upload(img);
 
-    cv::cann::AclMat aclMatSum;
-    cv::cann::add(aclMat, aclMat, aclMatSum);
+    cv::cann::NpuMat npuMatSum;
+    cv::cann::add(npuMat, npuMat, npuMatSum);
     cv::Mat imgResult;
-    aclMatSum.download(imgResult);
-    std::cout<<imgResult<<std::endl;
+    npuMatSum.download(imgResult);
+    std::cout << imgResult << std::endl;
 
     cv::cann::resetDevice();
     cv::cann::finalizeAcl();
diff --git a/modules/cannarithm/samples/sample.py b/modules/cannops/samples/sample.py
similarity index 73%
rename from modules/cannarithm/samples/sample.py
rename to modules/cannops/samples/sample.py
index b769e83ad44..2f94b41c1f9 100644
--- a/modules/cannarithm/samples/sample.py
+++ b/modules/cannops/samples/sample.py
@@ -10,11 +10,11 @@
 cv2.cann.initAcl()
 cv2.cann.setDevice(0)
 
-aclMat = cv2.cann.AclMat()
-aclMat.upload(img)
+npuMat = cv2.cann.NpuMat()
+npuMat.upload(img)
 
-aclMatSum = cv2.cann.add(aclMat, aclMat)
-imgResult = aclMatSum.download()
+npuMatSum = cv2.cann.add(npuMat, npuMat)
+imgResult = npuMatSum.download()
 print(imgResult)
 
 cv2.cann.finalizeAcl()
diff --git a/modules/cannops/src/cann_call.cpp b/modules/cannops/src/cann_call.cpp
new file mode 100644
index 00000000000..adf393aa57d
--- /dev/null
+++ b/modules/cannops/src/cann_call.cpp
@@ -0,0 +1,578 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <acl/acl.h>
+#include <acl/acl_op_compiler.h>
+#include "precomp.hpp"
+#include "opencv2/core/private.hpp"
+namespace cv
+{
+namespace cann
+{
+/*******************************Acl Error Checker*****************************/
+// Raise cv::Error::AscendApiCallError, carrying aclGetRecentErrMsg() if any, unless err is ACL_SUCCESS.
+static inline void checkAclError(aclError err, const char* file, const int line, const char* func)
+{
+    if (err == ACL_SUCCESS)
+        return;
+    const char* recentMsg = aclGetRecentErrMsg();
+    cv::error(cv::Error::AscendApiCallError, recentMsg ? recentMsg : "", func, file, line);
+}
+
+// Raise cv::Error::AscendApiCallError, carrying aclGetRecentErrMsg() if any, when ptr is null.
+static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
+{
+    if (ptr != nullptr)
+        return;
+    const char* recentMsg = aclGetRecentErrMsg();
+    cv::error(cv::Error::AscendApiCallError, recentMsg ? recentMsg : "", func, file, line);
+}
+
+#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
+#define CV_ACL_SAFE_CALL_PTR(expr)                     \
+    ({                                                 \
+        auto ptr = (expr);                             \
+        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
+        ptr;                                           \
+    })
+
+/*****************************Acl Operator Attribute**************************/
+#define DEFINE_ATTR_BODY(FUNC)                                     \
+    void Acl##FUNC##Attribute::addAttr(aclopAttr* opAttr)          \
+    {                                                              \
+        CV_ACL_SAFE_CALL(aclopSetAttr##FUNC(opAttr, name, value)); \
+    }
+
+#define DEFINE_ATTR_LIST_BODY(FUNC)                                         \
+    void AclList##FUNC##Attribute::addAttr(aclopAttr* opAttr)               \
+    {                                                                       \
+        CV_ACL_SAFE_CALL(aclopSetAttrList##FUNC(opAttr, name, num, value)); \
+    }
+
+DEFINE_ATTR_BODY(Float);
+DEFINE_ATTR_BODY(String);
+DEFINE_ATTR_BODY(Int);
+DEFINE_ATTR_BODY(Bool);
+DEFINE_ATTR_LIST_BODY(Int);
+
+#undef DEFINE_ATTR_BODY
+#undef DEFINE_ATTR_LIST_BODY
+
+/******************************Acl Runtime Wrapper****************************/
+void aclrtMallocWarpper(void** data, size_t size)
+{
+    CV_ACL_SAFE_CALL(aclrtMalloc(data, size, ACL_MEM_MALLOC_HUGE_FIRST));
+}
+
+void aclrtFreeWarpper(void* data) { CV_ACL_SAFE_CALL(aclrtFree(data)); }
+// Host -> device copy of `size` bytes to dst + offset; issued on the stream when it is non-null.
+// TODO should define dstMax?
+void aclrtMemcpyWarpper(std::shared_ptr<uchar>& dst, size_t offset, const void* src, size_t size,
+                        AscendStream& stream)
+{
+    aclrtStream aclStream = AscendStreamAccessor::getStream(stream);
+    if (aclStream != nullptr)
+    {
+        CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst.get() + offset, size, src, size,
+                                          ACL_MEMCPY_HOST_TO_DEVICE, aclStream));
+        if (offset == 0)
+            stream.addTensorHolder(dst);
+        return;
+    }
+    CV_ACL_SAFE_CALL(aclrtMemcpy(dst.get() + offset, size, src, size, ACL_MEMCPY_HOST_TO_DEVICE));
+}
+
+// Device -> host copy of `size` bytes from src + offset; issued on the stream when it is non-null.
+void aclrtMemcpyWarpper(void* dst, const std::shared_ptr<uchar>& src, size_t offset, size_t size,
+                        AscendStream& stream)
+{
+    aclrtStream aclStream = AscendStreamAccessor::getStream(stream);
+    if (aclStream != nullptr)
+    {
+        CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst, size, src.get() + offset, size,
+                                          ACL_MEMCPY_DEVICE_TO_HOST, aclStream));
+        if (offset == 0)
+            stream.addTensorHolder(src);
+        return;
+    }
+    CV_ACL_SAFE_CALL(aclrtMemcpy(dst, size, src.get() + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
+}
+
+void aclrtMemcpyWarpper(std::shared_ptr<uchar>& dst, size_t dstOffset,
+                        const std::shared_ptr<uchar>& src, size_t srcOffset, size_t size,
+                        AscendStream& stream)
+{ // Device -> device copy of `size` bytes from src + srcOffset to dst + dstOffset.
+    aclrtStream rawStream = AscendStreamAccessor::getStream(stream);
+    if (rawStream == nullptr) // null stream: plain aclrtMemcpy
+        CV_ACL_SAFE_CALL(aclrtMemcpy(dst.get() + dstOffset, size, src.get() + srcOffset, size,
+                                     ACL_MEMCPY_DEVICE_TO_DEVICE));
+    else
+    { // otherwise the copy is issued on the stream with aclrtMemcpyAsync
+        CV_ACL_SAFE_CALL(aclrtMemcpyAsync(dst.get() + dstOffset, size, src.get() + srcOffset, size,
+                                          ACL_MEMCPY_DEVICE_TO_DEVICE, rawStream));
+        if (srcOffset == 0) // a buffer is registered with the stream only when its offset is 0
+            stream.addTensorHolder(src);
+        if (dstOffset == 0)
+            stream.addTensorHolder(dst);
+    }
+}
+
+void aclrtMemcpy2dWarpper(std::shared_ptr<uchar>& dst, size_t offset, size_t dpitch,
+                          const void* src, size_t spitch, size_t width, size_t length,
+                          AscendStream& stream)
+{ // Pitched host -> device copy into dst + offset; issued on the stream when it is non-null.
+    aclrtStream aclStream = AscendStreamAccessor::getStream(stream);
+    if (aclStream != nullptr)
+    {
+        CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst.get() + offset, dpitch, src, spitch, width, length,
+                                            ACL_MEMCPY_HOST_TO_DEVICE, aclStream));
+        stream.addTensorHolder(dst);
+    }
+    else
+        CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst.get() + offset, dpitch, src, spitch, width, length,
+                                       ACL_MEMCPY_HOST_TO_DEVICE));
+}
+
+void aclrtMemcpy2dWarpper(void* dst, size_t dpitch, const std::shared_ptr<uchar>& src,
+                          size_t offset, size_t spitch, size_t width, size_t length,
+                          AscendStream& stream)
+{ // Pitched device -> host copy from src + offset; issued on the stream when it is non-null.
+    aclrtStream aclStream = AscendStreamAccessor::getStream(stream);
+    if (aclStream != nullptr)
+    {
+        CV_ACL_SAFE_CALL(aclrtMemcpy2dAsync(dst, dpitch, src.get() + offset, spitch, width, length,
+                                            ACL_MEMCPY_DEVICE_TO_HOST, aclStream));
+        stream.addTensorHolder(src);
+    }
+    else
+        CV_ACL_SAFE_CALL(aclrtMemcpy2d(dst, dpitch, src.get() + offset, spitch, width, length,
+                                       ACL_MEMCPY_DEVICE_TO_HOST));
+}
+
+void aclrtMemsetWarpper(std::shared_ptr<uchar>& ptr, int32_t value, size_t count,
+                        AscendStream& stream)
+{ // Memset of `count` bytes at ptr with `value`; issued on the stream when it is non-null.
+    aclrtStream aclStream = AscendStreamAccessor::getStream(stream);
+    if (aclStream != nullptr)
+    {
+        CV_ACL_SAFE_CALL(aclrtMemsetAsync(ptr.get(), count, value, count, aclStream));
+        stream.addTensorHolder(ptr);
+    }
+    else
+        CV_ACL_SAFE_CALL(aclrtMemset(ptr.get(), count, value, count));
+}
+
+/**************************Acl attribute preparation**************************/
+// Scoped owner of the ACL descriptors, data buffers and attribute object of one operator call.
+struct CannPreparation
+{
+    CannPreparation() { opAttr_ = CV_ACL_SAFE_CALL_PTR(aclopCreateAttr()); }
+
+    // The members are raw handles destroyed once in the destructor; a copy would destroy them
+    // a second time, so copying is disabled.
+    CannPreparation(const CannPreparation&) = delete;
+    CannPreparation& operator=(const CannPreparation&) = delete;
+
+    // Destroys every stored descriptor and data buffer, then the attribute object.
+    virtual ~CannPreparation()
+    {
+        for (auto desc : inputDesc_)
+            aclDestroyTensorDesc(desc);
+        for (auto desc : outputDesc_)
+            aclDestroyTensorDesc(desc);
+        for (auto buf : inputBuffers_)
+            aclDestroyDataBuffer(buf);
+        for (auto buf : outputBuffers_)
+            aclDestroyDataBuffer(buf);
+        aclopDestroyAttr(opAttr_);
+    }
+
+    // Filled by the CANN_PREPARE_* macros; entries are owned by this object.
+    std::vector<aclDataBuffer*> inputBuffers_;
+    std::vector<aclDataBuffer*> outputBuffers_;
+    std::vector<aclTensorDesc*> inputDesc_;
+    std::vector<aclTensorDesc*> outputDesc_;
+    aclopAttr* opAttr_;
+};
+
+#define CANN_PREPARE_INPUTDESC(var, name, ...)                               \
+    do /* create an input descriptor, name it if non-empty, store it */      \
+    {                                                                        \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateTensorDesc(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+        {                                                                    \
+            if (name != nullptr && strlen(name) != 0)                        \
+                aclSetTensorDescName(_rPtr, name);                           \
+            var.inputDesc_.push_back(_rPtr);                                 \
+        }                                                                    \
+    } while (0)
+
+#define CANN_PREPARE_OUTPUTDESC(var, ...)                                    \
+    do /* create an output descriptor and store it */                        \
+    {                                                                        \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateTensorDesc(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.outputDesc_.push_back(_rPtr);                                \
+    } while (0)
+
+#define CANN_PREPARE_INPUTBUFFER(var, ...)                                   \
+    do /* wrap an input data buffer and store it */                          \
+    {                                                                        \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.inputBuffers_.push_back(_rPtr);                              \
+    } while (0)
+
+#define CANN_PREPARE_OUTPUTBUFFER(var, ...)                                  \
+    do /* wrap an output data buffer and store it */                         \
+    {                                                                        \
+        auto _rPtr = CV_ACL_SAFE_CALL_PTR(aclCreateDataBuffer(__VA_ARGS__)); \
+        if (_rPtr != nullptr)                                                \
+            var.outputBuffers_.push_back(_rPtr);                             \
+    } while (0)
+
+/********************************Ascend Tensor********************************/
+// Translate an OpenCV depth code into the ACL element type used for tensor descriptors.
+// Depths with no case below map to ACL_DT_UNDEFINED.
+static inline aclDataType getACLType(int opencvdepth)
+{
+    aclDataType aclType = ACL_DT_UNDEFINED;
+    switch (opencvdepth)
+    {
+        // clang-format off
+        // Integer depths.
+        case CV_8U:  aclType = ACL_UINT8;   break;
+        case CV_8S:  aclType = ACL_INT8;    break;
+        case CV_16U: aclType = ACL_UINT16;  break;
+        case CV_16S: aclType = ACL_INT16;   break;
+        case CV_32S: aclType = ACL_INT32;   break;
+        // Floating-point depths.
+        case CV_16F: aclType = ACL_FLOAT16; break;
+        case CV_32F: aclType = ACL_FLOAT;   break;
+        case CV_64F: aclType = ACL_DOUBLE;  break;
+        // clang-format on
+        default:
+            break;
+    }
+    return aclType;
+}
+
+AscendTensor::AscendTensor(std::shared_ptr<uchar> _data, size_t _dataSize, int64_t* _dims,
+                           size_t _dimSize, aclDataType _dtype, std::string _name,
+                           aclFormat _format)
+    : name(_name), data(_data), dataSize(_dataSize), dtype(_dtype), format(_format)
+{ // Builds a tensor over an existing buffer; the extents are copied from _dims below.
+    dims.assign(_dims, _dims + _dimSize); // copies _dimSize extents; _dims itself is not kept
+}
+
+// Describe a continuous NpuMat as a 4-D tensor that shares the matrix buffer (data is not copied).
+AscendTensor::AscendTensor(const NpuMat& npuMat, std::string _name, aclFormat _format)
+    : name(_name), data(npuMat.data), format(_format)
+{
+    // Ascend can't process with gaps in matrix.
+    CV_Assert(npuMat.isContinuous());
+    // Widen before multiplying: rows * cols evaluated in int can overflow for very large matrices.
+    dataSize = static_cast<size_t>(npuMat.rows) * npuMat.cols * npuMat.elemSize();
+    switch (_format)
+    {
+        case ACL_FORMAT_NHWC:
+        case ACL_FORMAT_ND:
+            dims.resize(4);
+            // Batch, default = 1.
+            dims[0] = 1;
+            // Default OpenCV image format = NHWC.
+            dims[1] = npuMat.rows;
+            dims[2] = npuMat.cols;
+            dims[3] = npuMat.channels();
+            break;
+        case ACL_FORMAT_NCHW:
+            dims.resize(4);
+            dims[0] = 1;
+            dims[1] = npuMat.channels();
+            dims[2] = npuMat.rows;
+            dims[3] = npuMat.cols;
+            break;
+        default:
+            CV_Error(Error::StsBadArg, "Unknown/unsupported matrix format");
+    }
+
+    dtype = getACLType(npuMat.depth());
+}
+
+/**********************************Device*************************************/
+void setDevice(int device_id)
+{ // Selects device_id, then creates an ACL context on it.
+    aclrtContext context = nullptr; // NOTE(review): handle is not kept or destroyed here - confirm
+    CV_ACL_SAFE_CALL(aclrtSetDevice(device_id));
+    CV_ACL_SAFE_CALL(aclrtCreateContext(&context, device_id));
+}
+
+void resetDevice() { CV_ACL_SAFE_CALL(aclrtResetDevice(getDevice())); }
+
+int32_t getDevice()
+{
+    int32_t deviceId;
+    CV_ACL_SAFE_CALL(aclrtGetDevice(&deviceId));
+    return deviceId;
+}
+
+void initAcl() { CV_ACL_SAFE_CALL(aclInit(nullptr)); }
+
+void finalizeAcl() { CV_ACL_SAFE_CALL(aclFinalize()); }
+
+class DefaultDeviceInitializer
+{
+public:
+    DefaultDeviceInitializer();
+    ~DefaultDeviceInitializer();
+
+    AscendStream& getNullAscendStream(int deviceId);
+
+private:
+    std::vector<Ptr<AscendStream>> streams_;
+    Mutex streams_mtx_;
+};
+
+DefaultDeviceInitializer::DefaultDeviceInitializer() {}
+
+DefaultDeviceInitializer::~DefaultDeviceInitializer() { streams_.clear(); }
+
+// Return the per-device wrapper around the null ACL stream, creating it on first use.
+AscendStream& DefaultDeviceInitializer::getNullAscendStream(int deviceId)
+{
+    AutoLock lock(streams_mtx_);
+
+    if (streams_.empty()) // first call: size the table from the reported device count
+    {
+        uint32_t deviceCount = 0;
+        CV_ACL_SAFE_CALL(aclrtGetDeviceCount(&deviceCount));
+        if (deviceCount > 0)
+            streams_.resize(deviceCount);
+    }
+
+    // Enforced in release builds as well: a bad id would index past the end of streams_.
+    CV_Assert(deviceId >= 0 && deviceId < static_cast<int>(streams_.size()));
+
+    if (streams_[deviceId].empty())
+    {
+        aclrtStream stream = nullptr;
+        Ptr<AscendStream::Impl> impl = makePtr<AscendStream::Impl>(stream);
+        streams_[deviceId] = Ptr<AscendStream>(new AscendStream(impl));
+    }
+    return *streams_[deviceId];
+}
+
+DefaultDeviceInitializer initializer;
+
+/***********************************Event*************************************/
+AscendEvent::Impl::Impl() : event(nullptr), ownEvent(true)
+{
+    CV_ACL_SAFE_CALL(aclrtCreateEvent(&event));
+}
+
+AscendEvent::Impl::Impl(aclrtEvent e) : event(e), ownEvent(false) {}
+
+AscendEvent::Impl::~Impl()
+{
+    // Destructors are implicitly noexcept, so the throwing CV_ACL_SAFE_CALL would terminate the
+    // process on failure. An owned event is released on a best-effort basis instead.
+    if (event && ownEvent)
+        (void)aclrtDestroyEvent(event);
+}
+
+aclrtEvent AscendEventAccessor::getEvent(const AscendEvent& event) { return event.impl_->event; }
+
+AscendEvent AscendEventAccessor::wrapEvent(aclrtEvent event)
+{
+    return AscendEvent(makePtr<AscendEvent::Impl>(event));
+}
+
+AscendEvent::AscendEvent() { impl_ = makePtr<Impl>(); }
+
+void AscendEvent::record(AscendStream& stream)
+{
+    CV_ACL_SAFE_CALL(aclrtRecordEvent(impl_->event, AscendStreamAccessor::getStream(stream)));
+}
+
+void AscendEvent::waitForComplete() const { CV_ACL_SAFE_CALL(aclrtSynchronizeEvent(impl_->event)); }
+
+/************************************Stream***********************************/
+void AscendStream::Impl::AddTensorHolder(const std::shared_ptr<uchar>& tensorData)
+{
+    tensorHolders.insert(tensorData);
+}
+
+AscendStream::Impl::Impl() : stream(nullptr), ownStream(true)
+{
+    CV_ACL_SAFE_CALL(aclrtCreateStream(&stream));
+}
+
+AscendStream::Impl::Impl(aclrtStream s) : stream(s), ownStream(false) {}
+
+aclrtStream AscendStreamAccessor::getStream(const AscendStream& stream)
+{
+    return stream.impl_->stream;
+}
+
+AscendStream AscendStreamAccessor::wrapStream(aclrtStream stream)
+{
+    return AscendStream(makePtr<AscendStream::Impl>(stream));
+}
+
+AscendStream wrapStream(size_t AscendStreamAddress)
+{
+    return AscendStreamAccessor::wrapStream(reinterpret_cast<aclrtStream>(AscendStreamAddress));
+}
+
+AscendStream::AscendStream() { impl_ = makePtr<Impl>(); }
+
+void AscendStream::waitForCompletion()
+{
+    CV_ACL_SAFE_CALL(aclrtSynchronizeStream(impl_->stream));
+    impl_->tensorHolders.clear();
+}
+
+void AscendStream::waitAscendEvent(const AscendEvent& event)
+{
+    CV_ACL_SAFE_CALL(aclrtStreamWaitEvent(impl_->stream, AscendEventAccessor::getEvent(event)));
+}
+
+AscendStream& AscendStream::Null()
+{
+    // getDevice() returns int32_t and the lookup takes int: pass the id through without an
+    // unsigned detour. The result is the null-stream wrapper of the current device.
+    return initializer.getNullAscendStream(getDevice());
+}
+
+void AscendStream::addTensorHolder(const std::shared_ptr<uchar>& holder)
+{
+    impl_->AddTensorHolder(holder);
+}
+
+/********************************Operator caller******************************/
+std::shared_ptr<uchar> mallocAndUpload(void* data, size_t size, AscendStream& stream,
+                                       NpuMat::Allocator* allocator)
+{ // Allocates `size` bytes via `allocator` and fills them from the host buffer `data`.
+    std::shared_ptr<uchar> ptr = allocator->allocate(size);
+    aclrtStream rawStream = AscendStreamAccessor::getStream(stream);
+    // NOTE(review): the async branch only issues the copy - confirm `data` outlives it.
+    if (rawStream == nullptr)
+        CV_ACL_SAFE_CALL(aclrtMemcpy(ptr.get(), size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+    else
+        CV_ACL_SAFE_CALL(
+            aclrtMemcpyAsync(ptr.get(), size, data, size, ACL_MEMCPY_HOST_TO_DEVICE, rawStream));
+    return ptr;
+}
+
+void callAscendOperator(const char* op, std::vector<AscendTensor>& srcs,
+                        std::vector<AscendTensor>& dsts, AscendStream& stream,
+                        std::vector<AclAttribute*>& attrs)
+{ // Runs ACL operator `op` on srcs -> dsts with `attrs`, through aclopCompileAndExecute.
+    CannPreparation prepare;
+    for (AclAttribute* attr : attrs)
+    {
+        attr->addAttr(prepare.opAttr_);
+    }
+    // One descriptor and one data buffer per input; the descriptor is named when name is set.
+    for (const AscendTensor& src : srcs)
+    {
+        CANN_PREPARE_INPUTDESC(prepare, src.name.c_str(), src.dtype, src.dims.size(),
+                               &src.dims.at(0), src.format);
+        CANN_PREPARE_INPUTBUFFER(prepare, src.data.get(), src.dataSize);
+    }
+    // Outputs get a descriptor and a data buffer as well; no name is attached to them.
+    for (const AscendTensor& dst : dsts)
+    {
+        CANN_PREPARE_OUTPUTDESC(prepare, dst.dtype, dst.dims.size(), &dst.dims.at(0), dst.format);
+        CANN_PREPARE_OUTPUTBUFFER(prepare, dst.data.get(), dst.dataSize);
+    }
+
+    aclrtStream rawStream = AscendStreamAccessor::getStream(stream);
+
+    CV_ACL_SAFE_CALL(aclopCompileAndExecute(
+        op, prepare.inputDesc_.size(), prepare.inputDesc_.data(), prepare.inputBuffers_.data(),
+        prepare.outputDesc_.size(), prepare.outputDesc_.data(), prepare.outputBuffers_.data(),
+        prepare.opAttr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, rawStream));
+    if (rawStream == nullptr) // null stream: synchronize before returning
+        CV_ACL_SAFE_CALL(aclrtSynchronizeStream(rawStream));
+    else
+    { // otherwise register the tensor buffers with the stream so they stay referenced
+        for (const AscendTensor& src : srcs)
+        {
+            stream.addTensorHolder(src.data);
+        }
+        for (const AscendTensor& dst : dsts)
+        {
+            stream.addTensorHolder(dst.data);
+        }
+    }
+}
+
+void callAscendOperator(const NpuMat& src, NpuMat& dst, const char* op, AscendStream& stream,
+                        std::vector<AclAttribute*>& attrs)
+{
+    std::vector<AscendTensor> srcTensors, dstTensors;
+    srcTensors.emplace_back(src);
+    dstTensors.emplace_back(dst);
+    callAscendOperator(op, srcTensors, dstTensors, stream, attrs);
+}
+
+void callAscendOperator(const NpuMat& src1, const NpuMat& src2, NpuMat& dst, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs)
+{
+    std::vector<AscendTensor> srcTensors, dstTensors;
+    srcTensors.emplace_back(src1);
+    srcTensors.emplace_back(src2);
+    dstTensors.emplace_back(dst);
+    callAscendOperator(op, srcTensors, dstTensors, stream, attrs);
+}
+
+// Run `op` with srcCount inputs taken from srcs and the single output dst.
+void callAscendOperator(const NpuMat* srcs, const size_t srcCount, NpuMat& dst, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs)
+{
+    std::vector<AscendTensor> srcTensors, dstTensors;
+    srcTensors.reserve(srcCount); // single allocation instead of growth inside the loop
+    for (size_t i = 0; i < srcCount; i++)
+        srcTensors.emplace_back(srcs[i]);
+    dstTensors.emplace_back(dst);
+    callAscendOperator(op, srcTensors, dstTensors, stream, attrs);
+}
+
+void callAscendOperator(const NpuMat& src, const Scalar& sc, bool inv, NpuMat& dst, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs)
+{ // Runs `op` on (src, sc), or on (sc, src) when inv is true, with dst as the output.
+    uchar rawData[32];
+    cv::scalarToRawData(sc, rawData, src.type(), 0);
+    std::shared_ptr<uchar> scPtr = mallocAndUpload(rawData, src.elemSize(), stream);
+    // The scalar is passed as a {1, 1, 1, channels} tensor with the same depth as src.
+    int64_t dims[] = {1, 1, 1, src.channels()};
+    AscendTensor scalarTensor(scPtr, src.elemSize(), dims, sizeof(dims) / sizeof(dims[0]),
+                              getACLType(src.depth()));
+    // NOTE(review): rawData is a stack array - confirm an async upload finishes before return.
+    std::vector<AscendTensor> srcTensors, dstTensors;
+
+    srcTensors.emplace_back(src);
+    srcTensors.push_back(scalarTensor);
+
+    if (inv)
+        std::swap(srcTensors[0], srcTensors[1]);
+
+    dstTensors.emplace_back(dst);
+    callAscendOperator(op, srcTensors, dstTensors, stream, attrs);
+}
+
+// Run `op` with the single input src and dstCount outputs taken from dsts.
+void callAscendOperator(const NpuMat& src, NpuMat* dsts, const size_t dstCount, const char* op,
+                        AscendStream& stream, std::vector<AclAttribute*>& attrs)
+{
+    std::vector<AscendTensor> srcTensors, dstTensors;
+    srcTensors.emplace_back(src);
+    dstTensors.reserve(dstCount); // single allocation instead of growth inside the loop
+    for (size_t i = 0; i < dstCount; i++)
+        dstTensors.emplace_back(dsts[i]);
+    callAscendOperator(op, srcTensors, dstTensors, stream, attrs);
+}
+
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannops/src/color.cpp b/modules/cannops/src/color.cpp
new file mode 100644
index 00000000000..0c6f9df74a5
--- /dev/null
+++ b/modules/cannops/src/color.cpp
@@ -0,0 +1,722 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+namespace cv
+{
+namespace cann
+{
+// Fills `mat` with the opaque alpha value for pixel depth `dtype` (dtype < 0: use mat's depth).
+static void matAlphaSet(NpuMat& mat, int dtype, AscendStream& stream)
+{
+    const int storageDepth = mat.depth();
+    if (dtype < 0)
+        dtype = storageDepth;
+    if (storageDepth == CV_8U || storageDepth == CV_16U)
+    {
+        // Setting every byte to 0xFF yields 255 for 8-bit and 65535 for 16-bit elements.
+        size_t byteCount = mat.rows * mat.step;
+        aclrtMemsetWarpper(mat.data, 255, byteCount, stream);
+        return;
+    }
+
+    // Other storage depths: 1.0 for CV_32F, otherwise the maximum of the 8- or 16-bit range.
+    if (dtype == CV_32F)
+        mat.setTo(1.0f, stream);
+    else
+        mat.setTo((dtype == CV_8U ? (1 << 8) : (1 << 16)) - 1, stream);
+}
+
+inline void checkImg(const NpuMat& mat) // input validation shared by the colour conversions
+{
+    int depth = mat.depth();
+    CV_Assert(!mat.empty()); // the matrix must not be empty
+    CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32F); // only these depths pass
+}
+
+// Reorders/resizes the channels of a 3- or 4-channel image: optionally exchanges channels 0 and 2
+// (swapBlue) and emits `dcn` channels, adding an opaque alpha plane when a 3-channel input must
+// become 4 channels.
+inline void cvtBGRtoBGR(InputArray& _src, OutputArray& _dst, int dcn, bool swapBlue,
+                        AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    checkImg(src);
+    CV_Assert(src.channels() == 3 || src.channels() == 4);
+
+    NpuMat planes[4];
+    split(src, planes, stream);
+    if (swapBlue)
+        std::swap(planes[0], planes[2]);
+
+    // Only synthesise alpha when the source does not already carry a fourth channel.
+    const bool needAlpha = (dcn == 4) && (src.channels() != 4);
+    if (needAlpha)
+    {
+        planes[3].create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1));
+        matAlphaSet(planes[3], -1, stream);
+    }
+    merge(planes, dcn, _dst, stream);
+}
+
+// TODO duplicated code
+static const float B2YF = 0.114f; // blue weight in the gray sum computed by cvtBGRtoGray
+static const float G2YF = 0.587f; // green weight
+static const float R2YF = 0.299f; // red weight
+
+// Converts a 3- or 4-channel BGR image (RGB when swapBlue) to a single gray channel using
+// gray = 0.114 * B + 0.587 * G + 0.299 * R. A fourth (alpha) channel gets weight 0 and is not
+// summed. The arithmetic runs in CV_32F; the output keeps the depth of the input.
+inline void cvtBGRtoGray(InputArray& _src, OutputArray& _dst, int, bool swapBlue,
+                         AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    checkImg(src);
+    CV_Assert(src.channels() == 3 || src.channels() == 4);
+
+    const bool isFloatInput = (src.depth() == CV_32F);
+    float weights[] = {B2YF, G2YF, R2YF};
+    // NOTE(review): convertTo is called without `stream`; confirm it runs on the intended stream.
+    NpuMat floatSrc;
+    if (isFloatInput)
+        floatSrc = src;
+    else
+        src.convertTo(floatSrc, CV_32F);
+
+    // Weights are listed for B, G, R; an RGB input needs the outer two exchanged.
+    if (swapBlue)
+        std::swap(weights[0], weights[2]);
+
+    Scalar sc = {weights[0], weights[1], weights[2], 0};
+    NpuMat weighted;
+    multiply(floatSrc, sc, weighted, 1, -1, stream);
+
+    NpuMat planes[4];
+    split(weighted, planes, stream);
+
+    NpuMat dst = getOutputMat(_dst, src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1), stream);
+
+    // AddN with N = 3 sums the first three weighted planes.
+    AclIntAttribute planeCount("N", 3);
+    std::vector<AclAttribute*> attrs{&planeCount};
+
+    if (isFloatInput)
+        callAscendOperator(planes, 3, dst, "AddN", stream, attrs);
+    else
+    {
+        // Sum into a float buffer first, then Cast to the output depth.
+        floatSrc.create(weighted.rows, weighted.cols, CV_MAKE_TYPE(weighted.depth(), 1));
+        callAscendOperator(planes, 3, floatSrc, "AddN", stream, attrs);
+
+        // Do not use convertTo here: dst.data would be overwritten.
+        callAscendOperator(floatSrc, dst, "Cast", stream);
+    }
+    syncOutput(dst, _dst, stream);
+}
+
+// Expands a single-channel image to `dcn` channels by using the gray plane for each of the
+// three colour channels; for dcn == 4 an opaque alpha plane is appended.
+inline void cvtGraytoBGR(InputArray& _src, OutputArray& _dst, int dcn, bool, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    checkImg(src);
+    CV_Assert(src.channels() == 1);
+
+    NpuMat planes[4];
+    for (int c = 0; c != 3; ++c)
+        planes[c] = src;
+
+    if (dcn == 4)
+    {
+        // The fourth plane is a fresh single-channel matrix of the same size and depth.
+        planes[3].create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1));
+        matAlphaSet(planes[3], -1, stream);
+    }
+
+    merge(planes, dcn, _dst, stream);
+}
+// 3x3 RGB -> XYZ coefficients, row by row; cvtBGRtoXYZ mirrors each row for BGR input.
+static const float RGB2XYZ_D65[] = {0.412453, 0.357580, 0.180423, 0.212671, 0.715160,
+                                    0.072169, 0.019334, 0.119193, 0.950227};
+// 3x3 XYZ -> RGB coefficients, row by row; cvtXYZtoBGR exchanges rows 0 and 2 for BGR output.
+static const float XYZ2RGB_D65[] = {3.240479, -1.53715, -0.498535, -0.969256, 1.875991,
+                                    0.041556, 0.055648, -0.204043, 1.057311};
+
+// Multiplies every pixel of a 3-channel image by the 3x3 matrix `matrix` (9 floats) with
+// BatchMatMulV2. Integer inputs are processed in CV_32F, clamped to the range of their depth
+// and cast back; the output has the type of the input.
+inline void matMulRGB(InputArray& _src, OutputArray& _dst, float* matrix, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    checkImg(src);
+    CV_Assert(src.channels() == 3);
+
+    const bool isFloatInput = (src.depth() == CV_32F);
+    NpuMat floatSrc;
+    if (isFloatInput)
+        floatSrc = src;
+    else
+        src.convertTo(floatSrc, CV_32F);
+
+    // TODO async!!!
+    // NOTE(review): `matrix` is caller-owned host memory; confirm upload() is done with it
+    // before the caller's buffer goes out of scope.
+    Mat hostMatrix(1, 3, CV_32FC3, matrix);
+    NpuMat deviceMatrix;
+    deviceMatrix.upload(hostMatrix, stream);
+
+    AclBoolAttribute adjX1("adj_x1", false);
+    AclBoolAttribute adjX2("adj_x2", true);
+    std::vector<AclAttribute*> matMulAttr{&adjX1, &adjX2};
+
+    NpuMat dst = getOutputMat(_dst, src.rows, src.cols, src.type(), stream);
+    if (isFloatInput)
+        callAscendOperator(floatSrc, deviceMatrix, dst, "BatchMatMulV2", stream, matMulAttr);
+    else
+    {
+        NpuMat product(floatSrc.size(), floatSrc.type()), clamped(floatSrc.size(), floatSrc.type());
+        callAscendOperator(floatSrc, deviceMatrix, product, "BatchMatMulV2", stream, matMulAttr);
+        // Clamp to [0, 255] or [0, 65535]: THRESH_TRUNC caps the top, THRESH_TOZERO the bottom.
+        uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1;
+        threshold(product, clamped, thresh, 0, 2 /*THRESH_TRUNC*/, stream);
+        threshold(clamped, product, 0, 0, 3 /*THRESH_TOZERO*/, stream);
+        callAscendOperator(product, dst, "Cast", stream);
+    }
+    syncOutput(dst, _dst, stream);
+}
+
+// TODO should deal with overflow. set 255 instead of cut off.
+// Converts a BGR image (RGB when swapBlue) to XYZ by multiplying every pixel with the
+// RGB2XYZ_D65 matrix; the unnamed int argument (requested channel count) is ignored.
+inline void cvtBGRtoXYZ(InputArray& src, OutputArray& dst, int, bool swapBlue, AscendStream& stream)
+{
+    float coeffs[9];
+    memcpy(coeffs, RGB2XYZ_D65, sizeof(coeffs));
+    // The table columns are ordered R, G, B, so mirror every row when the input is B, G, R.
+    if (!swapBlue)
+        for (int row = 0; row < 3; ++row)
+            std::swap(coeffs[3 * row], coeffs[3 * row + 2]);
+    matMulRGB(src, dst, coeffs, stream);
+}
+
+// Converts XYZ to BGR (RGB when swapBlue) with the XYZ2RGB_D65 matrix; dcn == 4 additionally
+// appends an opaque alpha channel.
+inline void cvtXYZtoBGR(InputArray& src, OutputArray& dst, int dcn, bool swapBlue,
+                        AscendStream& stream)
+{
+    float coeffs[9];
+    memcpy(coeffs, XYZ2RGB_D65, 9 * sizeof(float));
+    if (!swapBlue) // table rows are R, G, B: exchange rows 0 and 2 for B, G, R output
+        for (int col = 0; col < 3; ++col)
+            std::swap(coeffs[col], coeffs[col + 6]);
+
+    if (dcn == 4)
+    {
+        NpuMat RGB[4], tempMat1;
+        matMulRGB(src, tempMat1, coeffs, stream);
+        split(tempMat1, RGB, stream);
+
+        // The alpha plane copies the geometry and type of the first colour plane.
+        RGB[3].create(RGB[0].rows, RGB[0].cols, RGB[0].type());
+        matAlphaSet(RGB[3], -1, stream);
+        merge(RGB, 4, dst, stream);
+    }
+    else
+        matMulRGB(src, dst, coeffs, stream);
+}
+
+// TODO duplicated code
+static const float YCRF = 0.713f; // Cr gain applied to (R - Y) by BGR2YCrCb / RGB2YCrCb
+static const float YCBF = 0.564f; // Cb gain applied to (B - Y)
+static const float R2VF = 0.877f; // V gain applied to (R - Y) by BGR2YUV / RGB2YUV
+static const float B2UF = 0.492f; // U gain applied to (B - Y)
+// Converts a 3-channel BGR image (RGB when swapBlue) into a luma plane and two chroma planes:
+//   Y = result of cvtBGRtoGray, C1 = (R - Y) * coeffs[0] + delta, C2 = (B - Y) * coeffs[1] + delta
+// Planes are written as Y, C1, C2 (Y, C2, C1 when yuvOrder). delta is 128 for 8U, 32768 for 16U
+// and 0.5 for 32F; integer results are clamped to the range of their depth.
+inline void cvtBGRtoYCrCb(InputArray& _src, OutputArray& _dst, float* coeffs, bool swapBlue,
+                          bool yuvOrder, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    checkImg(src);
+    CV_Assert(src.channels() == 3);
+
+    const int blueIdx = swapBlue ? 2 : 0;
+    const int redIdx = blueIdx ^ 2;
+    const int depth = src.depth();
+    const bool isFloatInput = (depth == CV_32F);
+    float delta = (depth == CV_8U) ? 128 : ((depth == CV_16U) ? 32768 : 0.5);
+
+    NpuMat floatSrc;
+    if (isFloatInput)
+        floatSrc = src;
+    else
+        src.convertTo(floatSrc, CV_32F);
+
+    NpuMat YCrCb[3], RGB[3];
+    split(floatSrc, RGB, stream);
+    cvtBGRtoGray(floatSrc, YCrCb[0], 1, swapBlue, stream);
+    YCrCb[1].create(YCrCb[0].rows, YCrCb[0].cols, YCrCb[0].type());
+    YCrCb[2].create(YCrCb[0].rows, YCrCb[0].cols, YCrCb[0].type());
+
+    const int planeType = CV_MAKE_TYPE(floatSrc.depth(), 1);
+    NpuMat diff(floatSrc.size(), planeType), scaled(floatSrc.size(), planeType);
+    // First chroma plane, from the red channel.
+    callAscendOperator(RGB[redIdx], YCrCb[0], diff, "Sub", stream);
+    muls(diff, coeffs[0], scaled, stream);
+    adds(scaled, delta, YCrCb[1], stream);
+
+    // Second chroma plane, from the blue channel.
+    callAscendOperator(RGB[blueIdx], YCrCb[0], diff, "Sub", stream);
+    muls(diff, coeffs[1], scaled, stream);
+    adds(scaled, delta, YCrCb[2], stream);
+
+    if (yuvOrder)
+        std::swap(YCrCb[1], YCrCb[2]);
+
+    if (isFloatInput)
+        merge(YCrCb, 3, _dst, stream);
+    else
+    {
+        NpuMat merged(floatSrc.size(), floatSrc.type()), clamped(floatSrc.size(), floatSrc.type());
+        merge(YCrCb, 3, merged, stream);
+        uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1;
+        threshold(merged, clamped, thresh, 0, 2 /*THRESH_TRUNC*/, stream);
+        threshold(clamped, merged, 0, 0, 3 /*THRESH_TOZERO*/, stream);
+        NpuMat dst = getOutputMat(_dst, src.rows, src.cols, src.type(), stream);
+        callAscendOperator(merged, dst, "Cast", stream);
+        syncOutput(dst, _dst, stream);
+    }
+}
+// Gains for the inverse transform, in the order cvtYCrCbtoBGR reads them from `coeffs`.
+static const float CR2RF = 1.403f; // Cr contribution to R
+static const float CR2GF = -0.714f; // Cr contribution to G
+static const float CB2GF = -0.344f; // Cb contribution to G
+static const float CB2BF = 1.773f; // Cb contribution to B
+// YUV counterparts used by YUV2BGR / YUV2RGB (V is handled like Cr, U like Cb).
+static const float V2RF = 1.140f;
+static const float V2GF = -0.581f;
+static const float U2GF = -0.395f;
+static const float U2BF = 2.032f;
+
+// Converts a 3-channel Y/Cr/Cb image (Y/U/V plane order when yuvOrder) to BGR, or RGB when
+// swapBlue, with `dcn` output channels. With Cr' and Cb' being the chroma planes minus delta:
+//   R = Y + Cr' * coeffs[0]
+//   G = Y + Cr' * coeffs[1] + Cb' * coeffs[2]
+//   B = Y + Cb' * coeffs[3]
+// dcn == 4 appends an opaque alpha plane; integer results are clamped to their depth's range.
+inline void cvtYCrCbtoBGR(InputArray& _src, OutputArray& _dst, int dcn, float* coeffs,
+                          bool swapBlue, bool yuvOrder, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    checkImg(src);
+    CV_Assert(src.channels() == 3);
+
+    const int blueIdx = swapBlue ? 2 : 0;
+    const int redIdx = blueIdx ^ 2;
+    const int depth = src.depth();
+    const bool isFloatInput = (depth == CV_32F);
+    float delta = (depth == CV_8U) ? 128 : ((depth == CV_16U) ? 32768 : 0.5);
+
+    NpuMat floatSrc;
+    if (isFloatInput)
+        floatSrc = src;
+    else
+        src.convertTo(floatSrc, CV_32F);
+
+    NpuMat YCrCb[3], RGB[4];
+    split(floatSrc, YCrCb, stream);
+    if (yuvOrder) // bring the chroma planes into the order used below
+        std::swap(YCrCb[1], YCrCb[2]);
+    const int planeType = CV_MAKE_TYPE(floatSrc.depth(), 1);
+    for (int c = 0; c < 3; ++c)
+        RGB[c].create(floatSrc.rows, floatSrc.cols, planeType);
+    NpuMat scaled(floatSrc.size(), planeType), partialG(floatSrc.size(), planeType),
+        cbCentered(floatSrc.size(), planeType), crCentered(floatSrc.size(), planeType);
+
+    adds(YCrCb[1], (0.0f - delta), crCentered, stream);
+    adds(YCrCb[2], (0.0f - delta), cbCentered, stream);
+    // Red plane.
+    muls(crCentered, coeffs[0], scaled, stream);
+    callAscendOperator(YCrCb[0], scaled, RGB[redIdx], "Add", stream);
+    // Green plane (two contributions).
+    muls(crCentered, coeffs[1], scaled, stream);
+    callAscendOperator(YCrCb[0], scaled, partialG, "Add", stream);
+    muls(cbCentered, coeffs[2], scaled, stream);
+    callAscendOperator(partialG, scaled, RGB[1], "Add", stream);
+    // Blue plane.
+    muls(cbCentered, coeffs[3], scaled, stream);
+    callAscendOperator(YCrCb[0], scaled, RGB[blueIdx], "Add", stream);
+
+    if (dcn == 4)
+    {
+        RGB[3].create(RGB[0].rows, RGB[0].cols, RGB[0].type());
+        matAlphaSet(RGB[3], src.depth(), stream);
+    }
+
+    if (isFloatInput)
+        merge(RGB, dcn, _dst, stream);
+    else
+    {
+        const int mergedType = CV_MAKE_TYPE(floatSrc.depth(), dcn);
+        NpuMat merged(floatSrc.size(), mergedType), clamped(floatSrc.size(), mergedType);
+        merge(RGB, dcn, merged, stream);
+        uint16_t thresh = (src.depth() == CV_8U ? (1 << 8) : (1 << 16)) - 1;
+        threshold(merged, clamped, thresh, 0, 2 /*THRESH_TRUNC*/, stream);
+        threshold(clamped, merged, 0, 0, 3 /*THRESH_TOZERO*/, stream);
+        NpuMat dst = getOutputMat(_dst, src.rows, src.cols, CV_MAKE_TYPE(src.depth(), dcn), stream);
+        callAscendOperator(merged, dst, "Cast", stream);
+        syncOutput(dst, _dst, stream);
+    }
+}
+// BGR -> BGRA: channel order kept, 4 output channels; the int argument (dcn) is unused.
+inline void BGR2BGRA(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoBGR(src, dst, 4, false, stream);
+}
+// BGRA -> BGR: channel order kept, 3 output channels; the int argument (dcn) is unused.
+inline void BGRA2BGR(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoBGR(src, dst, 3, false, stream);
+}
+// BGR -> RGBA: channels 0 and 2 exchanged, 4 output channels; the int argument is unused.
+inline void BGR2RGBA(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoBGR(src, dst, 4, true, stream);
+}
+// RGBA -> BGR: channels 0 and 2 exchanged, 3 output channels; the int argument is unused.
+inline void RGBA2BGR(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoBGR(src, dst, 3, true, stream);
+}
+// BGR -> RGB: channels 0 and 2 exchanged, 3 output channels; the int argument is unused.
+inline void BGR2RGB(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoBGR(src, dst, 3, true, stream);
+}
+// BGRA -> RGBA: channels 0 and 2 exchanged, 4 output channels; the int argument is unused.
+inline void BGRA2RGBA(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoBGR(src, dst, 4, true, stream);
+}
+// BGR -> gray via cvtBGRtoGray with the weights in B, G, R order; the int argument is unused.
+inline void BGR2GRAY(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoGray(src, dst, 1, false, stream);
+}
+// RGB -> gray via cvtBGRtoGray with the outer weights exchanged; the int argument is unused.
+inline void RGB2GRAY(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoGray(src, dst, 1, true, stream);
+}
+// Gray -> BGR: 3 output channels via cvtGraytoBGR; the int argument (dcn) is unused.
+inline void GRAY2BGR(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtGraytoBGR(src, dst, 3, false, stream);
+}
+// Gray -> BGRA: 4 output channels (opaque alpha) via cvtGraytoBGR; the int argument is unused.
+inline void GRAY2BGRA(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtGraytoBGR(src, dst, 4, false, stream);
+}
+// BGRA -> gray: same call as BGR2GRAY (cvtBGRtoGray accepts 3 or 4 input channels).
+inline void BGRA2GRAY(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoGray(src, dst, 1, false, stream);
+}
+// RGBA -> gray: same call as RGB2GRAY (cvtBGRtoGray accepts 3 or 4 input channels).
+inline void RGBA2GRAY(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoGray(src, dst, 1, true, stream);
+}
+// BGR -> XYZ via cvtBGRtoXYZ (swapBlue = false); the int argument (dcn) is unused.
+inline void BGR2XYZ(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoXYZ(src, dst, 3, false, stream);
+}
+// RGB -> XYZ via cvtBGRtoXYZ (swapBlue = true); the int argument (dcn) is unused.
+inline void RGB2XYZ(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    cvtBGRtoXYZ(src, dst, 3, true, stream);
+}
+
+// XYZ -> BGR; a non-positive dcn selects the default of 3 output channels.
+inline void XYZ2BGR(InputArray src, OutputArray& dst, int dcn, AscendStream& stream)
+{
+    const int outChannels = (dcn > 0) ? dcn : 3;
+    cvtXYZtoBGR(src, dst, outChannels, false, stream);
+}
+
+// XYZ -> RGB; a non-positive dcn selects the default of 3 output channels.
+inline void XYZ2RGB(InputArray src, OutputArray& dst, int dcn, AscendStream& stream)
+{
+    const int outChannels = (dcn > 0) ? dcn : 3;
+    cvtXYZtoBGR(src, dst, outChannels, true, stream);
+}
+
+// BGR -> YCrCb (plane order Y, Cr, Cb). The int argument (requested channel count) is unused.
+// The gains are passed as {Cr gain, Cb gain}.
+inline void BGR2YCrCb(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    float gains[] = {YCRF, YCBF};
+    cvtBGRtoYCrCb(src, dst, gains, false, false, stream);
+}
+
+// RGB -> YCrCb (plane order Y, Cr, Cb). The int argument (requested channel count) is unused.
+// The gains are passed as {Cr gain, Cb gain}.
+inline void RGB2YCrCb(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    float gains[] = {YCRF, YCBF};
+    cvtBGRtoYCrCb(src, dst, gains, true, false, stream);
+}
+
+// YCrCb -> BGR. dcn selects the number of output channels: values <= 0 mean 3, and 4 appends
+// an opaque alpha channel.
+inline void YCrCb2BGR(InputArray src, OutputArray& dst, int dcn, AscendStream& stream)
+{
+    // Gain order read by cvtYCrCbtoBGR: Cr->R, Cr->G, Cb->G, Cb->B.
+    float gains[] = {CR2RF, CR2GF, CB2GF, CB2BF};
+    // Fall back to 3 channels when the caller did not request a count.
+    const int outChannels = (dcn > 0) ? dcn : 3;
+
+    cvtYCrCbtoBGR(src, dst, outChannels, gains, false, false, stream);
+}
+
+// YCrCb -> RGB. dcn selects the number of output channels: values <= 0 mean 3, and 4 appends
+// an opaque alpha channel.
+inline void YCrCb2RGB(InputArray src, OutputArray& dst, int dcn, AscendStream& stream)
+{
+    // Gain order read by cvtYCrCbtoBGR: Cr->R, Cr->G, Cb->G, Cb->B.
+    float gains[] = {CR2RF, CR2GF, CB2GF, CB2BF};
+    // Fall back to 3 channels when the caller did not request a count.
+    const int outChannels = (dcn > 0) ? dcn : 3;
+
+    cvtYCrCbtoBGR(src, dst, outChannels, gains, true, false, stream);
+}
+
+// BGR -> YUV (plane order Y, U, V). The int argument (requested channel count) is unused.
+// The gains are passed as {V gain, U gain}; yuvOrder = true puts U before V in the output.
+inline void BGR2YUV(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    float gains[] = {R2VF, B2UF};
+    cvtBGRtoYCrCb(src, dst, gains, false, true, stream);
+}
+
+// RGB -> YUV (plane order Y, U, V). The int argument (requested channel count) is unused.
+// The gains are passed as {V gain, U gain}; yuvOrder = true puts U before V in the output.
+inline void RGB2YUV(InputArray src, OutputArray& dst, int, AscendStream& stream)
+{
+    float gains[] = {R2VF, B2UF};
+    cvtBGRtoYCrCb(src, dst, gains, true, true, stream);
+}
+
+// YUV -> BGR. dcn selects the number of output channels: values <= 0 mean 3, and 4 appends
+// an opaque alpha channel.
+inline void YUV2BGR(InputArray src, OutputArray& dst, int dcn, AscendStream& stream)
+{
+    // Gain order read by cvtYCrCbtoBGR: V->R, V->G, U->G, U->B.
+    float gains[] = {V2RF, V2GF, U2GF, U2BF};
+    // Fall back to 3 channels when the caller did not request a count.
+    const int outChannels = (dcn > 0) ? dcn : 3;
+
+    cvtYCrCbtoBGR(src, dst, outChannels, gains, false, true, stream);
+}
+
+// YUV -> RGB. dcn selects the number of output channels: values <= 0 mean 3, and 4 appends
+// an opaque alpha channel.
+inline void YUV2RGB(InputArray src, OutputArray& dst, int dcn, AscendStream& stream)
+{
+    // Gain order read by cvtYCrCbtoBGR: V->R, V->G, U->G, U->B.
+    float gains[] = {V2RF, V2GF, U2GF, U2BF};
+    // Fall back to 3 channels when the caller did not request a count.
+    const int outChannels = (dcn > 0) ? dcn : 3;
+
+    cvtYCrCbtoBGR(src, dst, outChannels, gains, true, true, stream);
+}
+
+// Dispatches a colour conversion by its numeric code, which indexes the table below.
+// Codes without an implementation here hold a null entry and raise Error::StsBadFlag.
+void cvtColor(InputArray src, OutputArray dst, int code, int dcn, AscendStream& stream)
+{
+    typedef void (*func_t)(InputArray& src, OutputArray& dst, int dcn, AscendStream& stream);
+    static const func_t funcs[] = {
+        BGR2BGRA,  // CV_BGR2BGRA    =0
+        BGRA2BGR,  // CV_BGRA2BGR    =1
+        BGR2RGBA,  // CV_BGR2RGBA    =2
+        RGBA2BGR,  // CV_RGBA2BGR    =3
+        BGR2RGB,   // CV_BGR2RGB     =4
+        BGRA2RGBA, // CV_BGRA2RGBA   =5
+
+        BGR2GRAY,  // CV_BGR2GRAY    =6
+        RGB2GRAY,  // CV_RGB2GRAY    =7
+        GRAY2BGR,  // CV_GRAY2BGR    =8
+        GRAY2BGRA, // CV_GRAY2BGRA   =9
+        BGRA2GRAY, // CV_BGRA2GRAY   =10
+        RGBA2GRAY, // CV_RGBA2GRAY   =11
+
+        0, // CV_BGR2BGR565  =12
+        0, // CV_RGB2BGR565  =13
+        0, // CV_BGR5652BGR  =14
+        0, // CV_BGR5652RGB  =15
+        0, // CV_BGRA2BGR565 =16
+        0, // CV_RGBA2BGR565 =17
+        0, // CV_BGR5652BGRA =18
+        0, // CV_BGR5652RGBA =19
+
+        0, // CV_GRAY2BGR565 =20
+        0, // CV_BGR5652GRAY =21
+
+        0, // CV_BGR2BGR555  =22
+        0, // CV_RGB2BGR555  =23
+        0, // CV_BGR5552BGR  =24
+        0, // CV_BGR5552RGB  =25
+        0, // CV_BGRA2BGR555 =26
+        0, // CV_RGBA2BGR555 =27
+        0, // CV_BGR5552BGRA =28
+        0, // CV_BGR5552RGBA =29
+
+        0, // CV_GRAY2BGR555 =30
+        0, // CV_BGR5552GRAY =31
+
+        BGR2XYZ, // CV_BGR2XYZ     =32
+        RGB2XYZ, // CV_RGB2XYZ     =33
+        XYZ2BGR, // CV_XYZ2BGR     =34
+        XYZ2RGB, // CV_XYZ2RGB     =35
+
+        BGR2YCrCb, // CV_BGR2YCrCb   =36
+        RGB2YCrCb, // CV_RGB2YCrCb   =37
+        YCrCb2BGR, // CV_YCrCb2BGR   =38
+        YCrCb2RGB, // CV_YCrCb2RGB   =39
+
+        0, // CV_BGR2HSV     =40
+        0, // CV_RGB2HSV     =41
+
+        0, //                =42
+        0, //                =43
+
+        0, // CV_BGR2Lab     =44
+        0, // CV_RGB2Lab     =45
+
+        0, // CV_BayerBG2BGR =46
+        0, // CV_BayerGB2BGR =47
+        0, // CV_BayerRG2BGR =48
+        0, // CV_BayerGR2BGR =49
+
+        0, // CV_BGR2Luv     =50
+        0, // CV_RGB2Luv     =51
+
+        0, // CV_BGR2HLS     =52
+        0, // CV_RGB2HLS     =53
+
+        0, // CV_HSV2BGR     =54
+        0, // CV_HSV2RGB     =55
+
+        0, // CV_Lab2BGR     =56
+        0, // CV_Lab2RGB     =57
+        0, // CV_Luv2BGR     =58
+        0, // CV_Luv2RGB     =59
+
+        0, // CV_HLS2BGR     =60
+        0, // CV_HLS2RGB     =61
+
+        0, // CV_BayerBG2BGR_VNG =62
+        0, // CV_BayerGB2BGR_VNG =63
+        0, // CV_BayerRG2BGR_VNG =64
+        0, // CV_BayerGR2BGR_VNG =65
+
+        0, // CV_BGR2HSV_FULL = 66
+        0, // CV_RGB2HSV_FULL = 67
+        0, // CV_BGR2HLS_FULL = 68
+        0, // CV_RGB2HLS_FULL = 69
+
+        0, // CV_HSV2BGR_FULL = 70
+        0, // CV_HSV2RGB_FULL = 71
+        0, // CV_HLS2BGR_FULL = 72
+        0, // CV_HLS2RGB_FULL = 73
+
+        0, // CV_LBGR2Lab     = 74
+        0, // CV_LRGB2Lab     = 75
+        0, // CV_LBGR2Luv     = 76
+        0, // CV_LRGB2Luv     = 77
+
+        0, // CV_Lab2LBGR     = 78
+        0, // CV_Lab2LRGB     = 79
+        0, // CV_Luv2LBGR     = 80
+        0, // CV_Luv2LRGB     = 81
+
+        BGR2YUV, // CV_BGR2YUV      = 82
+        RGB2YUV, // CV_RGB2YUV      = 83
+        YUV2BGR, // CV_YUV2BGR      = 84
+        YUV2RGB, // CV_YUV2RGB      = 85
+
+        0, // CV_BayerBG2GRAY = 86
+        0, // CV_BayerGB2GRAY = 87
+        0, // CV_BayerRG2GRAY = 88
+        0, // CV_BayerGR2GRAY = 89
+
+        // YUV 4:2:0 formats family
+        0, // CV_YUV2RGB_NV12 = 90,
+        0, // CV_YUV2BGR_NV12 = 91,
+        0, // CV_YUV2RGB_NV21 = 92,
+        0, // CV_YUV2BGR_NV21 = 93,
+
+        0, // CV_YUV2RGBA_NV12 = 94,
+        0, // CV_YUV2BGRA_NV12 = 95,
+        0, // CV_YUV2RGBA_NV21 = 96,
+        0, // CV_YUV2BGRA_NV21 = 97,
+
+        0, // CV_YUV2RGB_YV12 = 98,
+        0, // CV_YUV2BGR_YV12 = 99,
+        0, // CV_YUV2RGB_IYUV = 100,
+        0, // CV_YUV2BGR_IYUV = 101,
+
+        0, // CV_YUV2RGBA_YV12 = 102,
+        0, // CV_YUV2BGRA_YV12 = 103,
+        0, // CV_YUV2RGBA_IYUV = 104,
+        0, // CV_YUV2BGRA_IYUV = 105,
+
+        0, // CV_YUV2GRAY_420 = 106,
+
+        // YUV 4:2:2 formats family
+        0, // CV_YUV2RGB_UYVY = 107,
+        0, // CV_YUV2BGR_UYVY = 108,
+        0, // //CV_YUV2RGB_VYUY = 109,
+        0, // //CV_YUV2BGR_VYUY = 110,
+
+        0, // CV_YUV2RGBA_UYVY = 111,
+        0, // CV_YUV2BGRA_UYVY = 112,
+        0, // //CV_YUV2RGBA_VYUY = 113,
+        0, // //CV_YUV2BGRA_VYUY = 114,
+
+        0, // CV_YUV2RGB_YUY2 = 115,
+        0, // CV_YUV2BGR_YUY2 = 116,
+        0, // CV_YUV2RGB_YVYU = 117,
+        0, // CV_YUV2BGR_YVYU = 118,
+
+        0, // CV_YUV2RGBA_YUY2 = 119,
+        0, // CV_YUV2BGRA_YUY2 = 120,
+        0, // CV_YUV2RGBA_YVYU = 121,
+        0, // CV_YUV2BGRA_YVYU = 122,
+
+        0, // CV_YUV2GRAY_UYVY = 123,
+        0, // CV_YUV2GRAY_YUY2 = 124,
+
+        // alpha premultiplication
+        0, // CV_RGBA2mRGBA = 125,
+        0, // CV_mRGBA2RGBA = 126,
+
+        0, // CV_COLORCVT_MAX  = 127
+    };
+
+    // Reject negative codes as well: they would index in front of the table.
+    CV_Assert(code >= 0 && code < static_cast<int>(sizeof(funcs) / sizeof(funcs[0])));
+    func_t func = funcs[code];
+    if (func == 0)
+        CV_Error(Error::StsBadFlag, "Unknown/unsupported color conversion code");
+    func(src, dst, dcn, stream);
+}
+
+} // namespace cann
+} // namespace cv
\ No newline at end of file
diff --git a/modules/cannops/src/core.cpp b/modules/cannops/src/core.cpp
new file mode 100644
index 00000000000..398394e6714
--- /dev/null
+++ b/modules/cannops/src/core.cpp
@@ -0,0 +1,205 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+
+namespace cv
+{
+namespace cann
+{
+// Translates a layout name into the corresponding aclFormat value.
+// Accepts exactly "NCHW" and "NHWC"; any other string raises Error::StsBadArg.
+static inline aclFormat getAclFormat(const char* type)
+{
+    static const struct
+    {
+        const char* name;
+        aclFormat format;
+    } knownFormats[] = {{"NCHW", ACL_FORMAT_NCHW}, {"NHWC", ACL_FORMAT_NHWC}};
+
+    for (const auto& entry : knownFormats)
+        if (strcmp(type, entry.name) == 0)
+            return entry.format;
+    CV_Error(Error::StsBadArg, "Unknown/unsupported matrix format");
+}
+
+// Re-lays out `src` from tensor format `from` to format `to` with the TransData operator.
+void transData(const NpuMat& src, NpuMat& dst, const char* from, const char* to,
+               AscendStream& stream)
+{
+    AclStringAttribute srcFormat("src_format", from);
+    AclStringAttribute dstFormat("dst_format", to);
+    std::vector<AclAttribute*> attrs{&srcFormat, &dstFormat};
+    std::vector<AscendTensor> inputs, outputs;
+    inputs.emplace_back(src, "", getAclFormat(from));
+    outputs.emplace_back(dst, "", getAclFormat(to));
+    callAscendOperator("TransData", inputs, outputs, stream, attrs);
+}
+
+// Concatenates n single-channel matrices along tensor axis 3 (ConcatD, concat_dim = 3) into
+// `dst`. Does nothing when `src` is null or n < 2. All inputs must share depth and size.
+void merge(const NpuMat* src, size_t n, NpuMat& dst, AscendStream& stream)
+{
+    if (src == nullptr || n < 2)
+        return;
+
+    int depth = src->depth();
+    int rows = src->rows;
+    int cols = src->cols;
+
+    // Every plane, including the first, must be single-channel with the same depth and size.
+    for (size_t i = 0; i < n; i++)
+    {
+        CV_Assert(src[i].depth() == depth && src[i].channels() == 1);
+        CV_Assert(src[i].rows == rows && src[i].cols == cols);
+    }
+
+    AclIntAttribute concatDim("concat_dim", 3);
+    std::vector<AclAttribute*> attrs{&concatDim};
+
+    std::vector<AscendTensor> srcTensors, dstTensors;
+    // Each input tensor is named x<i>.
+    for (size_t i = 0; i < n; i++)
+        srcTensors.emplace_back(src[i], "x" + std::to_string(i));
+    dstTensors.emplace_back(dst);
+
+    callAscendOperator("ConcatD", srcTensors, dstTensors, stream, attrs);
+}
+// merge() into an OutputArray. A null/empty source or n < 2 is a no-op that leaves _dst untouched.
+void merge(const NpuMat* src, size_t n, OutputArray _dst, AscendStream& stream)
+{
+    if (src == nullptr || n < 2) return; // checked before src is dereferenced below
+    NpuMat dst = getOutputMat(_dst, src->rows, src->cols, CV_MAKE_TYPE(src->depth(), n), stream);
+    merge(src, n, dst, stream);
+    syncOutput(dst, _dst, stream);
+}
+void merge(const std::vector<NpuMat>& src, OutputArray dst, AscendStream& stream)
+{
+    merge(src.data(), src.size(), dst, stream); // data() is valid on an empty vector, &src[0] is not
+}
+
+// Splits `src` into src.channels() single-channel planes dst[0..cn-1] using SplitD on tensor
+// axis 3. The planes are (re)created here; an empty src or a null dst returns immediately.
+void split(const NpuMat& src, NpuMat* dst, AscendStream& stream)
+{
+    if (src.empty() || dst == nullptr)
+        return;
+
+    const int planeCount = src.channels();
+    AclIntAttribute splitDim("split_dim", 3);
+    AclIntAttribute numSplit("num_split", planeCount);
+    for (int plane = 0; plane < planeCount; ++plane)
+        dst[plane].create(src.rows, src.cols, CV_MAKE_TYPE(src.depth(), 1));
+
+    std::vector<AclAttribute*> attrs{&splitDim, &numSplit};
+    callAscendOperator(src, dst, planeCount, "SplitD", stream, attrs);
+}
+// InputArray front end: obtains the NpuMat for _src via getInputMat, then splits it into dst.
+void split(InputArray _src, NpuMat* dst, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    split(src, dst, stream);
+}
+// Splits _src into a vector that is resized to one plane per channel.
+void split(InputArray _src, std::vector<NpuMat>& dst, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    dst.resize(src.channels());
+    split(src, dst.data(), stream); // reuse `src` (no second getInputMat); data() is safe when empty
+}
+
+// Permutes the axes of `src` into `dst` with TransposeD; `perm` must hold 4 axis indices.
+void transpose(const NpuMat& src, int64_t* perm, NpuMat& dst, AscendStream& stream)
+{
+    AclListIntAttribute axisOrder("perm", 4, perm);
+    std::vector<AclAttribute*> attrs{&axisOrder};
+    std::vector<AscendTensor> inputs, outputs;
+    inputs.emplace_back(src);
+    outputs.emplace_back(dst);
+    callAscendOperator("TransposeD", inputs, outputs, stream, attrs);
+}
+
+// Transposes a 2-D matrix: the output has src.cols rows and src.rows columns. The work is done
+// by the permutation overload with axis order {0, 2, 1, 3}.
+void transpose(InputArray _src, OutputArray _dst, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    NpuMat dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+    int64_t axisOrder[] = {0, 2, 1, 3};
+    transpose(src, axisOrder, dst, stream);
+    syncOutput(dst, _dst, stream);
+}
+
+// Reverses `src` along the axes in `asixs` (ReverseV2, axes passed as a 1-D INT32 tensor).
+void flip(const NpuMat& src, std::vector<int32_t>& asixs, NpuMat& dst, AscendStream& stream)
+{
+    const size_t axisCount = asixs.size();
+    const size_t axisBytes = axisCount * sizeof(int32_t);
+    std::shared_ptr<uchar> axisData = mallocAndUpload(&asixs.at(0), axisBytes, stream);
+    int64_t axisShape[] = {static_cast<int64_t>(axisCount)};
+    AscendTensor axisTensor(axisData, axisBytes, axisShape, 1, ACL_INT32);
+    std::vector<AscendTensor> inputs, outputs;
+    inputs.emplace_back(src);
+    inputs.push_back(std::move(axisTensor));
+    outputs.emplace_back(dst);
+    callAscendOperator("ReverseV2", inputs, outputs, stream, emptyattr);
+}
+
+// Flips a 2-D image. The sign of flipCode selects the tensor axes handed to ReverseV2:
+//   flipCode == 0 -> axis 1
+//   flipCode  > 0 -> axis 2
+//   flipCode  < 0 -> axes 1 and 2
+void flip(InputArray _src, OutputArray _dst, int flipCode, AscendStream& stream)
+{
+    NpuMat src = getInputMat(_src, stream);
+    NpuMat dst = getOutputMat(_dst, src.rows, src.cols, src.type(), stream);
+
+    // Axis 1 is reversed for zero and negative codes, axis 2 for every non-zero code.
+    const bool reverseAxis1 = (flipCode <= 0);
+    const bool reverseAxis2 = (flipCode != 0);
+
+    std::vector<int32_t> axes;
+    if (reverseAxis1)
+        axes.push_back(1);
+    if (reverseAxis2)
+        axes.push_back(2);
+
+    flip(src, axes, dst, stream);
+    syncOutput(dst, _dst, stream);
+}
+
+// Rotates a 2-D array by a multiple of 90 degrees, composed from transpose() and flip():
+//   ROTATE_90_CLOCKWISE        = transpose, then flip(1)
+//   ROTATE_180                 = flip(-1)
+//   ROTATE_90_COUNTERCLOCKWISE = transpose, then flip(0)
+// Any other rotateMode performs no rotation.
+void rotate(InputArray _src, OutputArray _dst, int rotateMode, AscendStream& stream)
+{
+    CV_Assert(_src.dims() <= 2);
+    NpuMat src = getInputMat(_src, stream), dst, tempMat;
+    switch (rotateMode)
+    {
+        case ROTATE_90_CLOCKWISE:
+            dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+            transpose(src, tempMat, stream);
+            flip(tempMat, dst, 1, stream);
+            break;
+        case ROTATE_180:
+            dst = getOutputMat(_dst, src.rows, src.cols, src.type(), stream);
+            flip(src, dst, -1, stream);
+            break;
+        case ROTATE_90_COUNTERCLOCKWISE:
+            dst = getOutputMat(_dst, src.cols, src.rows, src.type(), stream);
+            // Reuse the matrix obtained above rather than converting _src a second time.
+            transpose(src, tempMat, stream);
+            flip(tempMat, dst, 0, stream);
+            break;
+        default:
+            break;
+    }
+    syncOutput(dst, _dst, stream);
+}
+
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp
new file mode 100644
index 00000000000..907c995ce45
--- /dev/null
+++ b/modules/cannops/src/element_operations.cpp
@@ -0,0 +1,240 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+namespace cv
+{
+namespace cann
+{
+// Shared back end of the element-wise operators: runs the Ascend operator `op` on
+// mat/mat, mat/scalar, scalar/mat or a single mat, then applies the mask and the scale.
+static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask,
+                      float scale, int dtype, const char* op, AscendStream& stream)
+{
+    const bool isScalar1 = (_src1.kind() == _InputArray::MATX);
+    const bool isScalar2 = (_src2.kind() == _InputArray::MATX);
+
+    if (isScalar1 && isScalar2)
+        CV_Error(Error::StsBadArg, "At least one matrix parameter should be passed.");
+
+    NpuMat src1, src2;
+    Mat scalar;
+
+    if (!isScalar1)
+        src1 = getInputMat(_src1, stream);
+    if (!isScalar2)
+        src2 = getInputMat(_src2, stream);
+    // A MATX operand is not staged on the NPU; it is read on the host as a small Mat.
+    if (isScalar1)
+        scalar = _src1.getMat();
+    else if (isScalar2)
+        scalar = _src2.getMat();
+
+    const int sdepth = src1.empty() ? src2.depth() : src1.depth();
+    const int cn = src1.empty() ? src2.channels() : src1.channels();
+    const Size size = src1.empty() ? src2.size() : src1.size();
+
+    if (dtype < 0)
+        dtype = sdepth;
+
+    const int ddepth = CV_MAT_DEPTH(dtype);
+    CV_Assert(sdepth <= CV_16F && ddepth <= CV_16F);
+    CV_Assert(!scalar.empty() || src2.empty() ||
+              (src2.depth() == src1.depth() && src2.size() == src1.size()));
+    // The scalar operand (at most 4 values) is converted to doubles stored in `val`.
+    Scalar val;
+
+    if (!scalar.empty())
+    {
+        CV_Assert(scalar.total() <= 4);
+        scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
+    }
+
+    NpuMat dst = getOutputMat(_dst, size.height, size.width, CV_MAKE_TYPE(ddepth, cn), stream);
+
+    if (isScalar1)
+        callAscendOperator(src2, val, true, dst, op, stream);
+    else if (isScalar2)
+        callAscendOperator(src1, val, false, dst, op, stream);
+    else
+    {
+        if (src2.empty())
+            callAscendOperator(src1, dst, op, stream);
+        else
+            callAscendOperator(src1, src2, dst, op, stream);
+    }
+    // Mask: mask / mask (presumably 1 where set, 0 elsewhere - confirm) is multiplied into dst.
+    NpuMat mask = getInputMat(_mask, stream);
+    if (!mask.empty())
+    {
+        int mtype = mask.type();
+        CV_Assert((mtype == CV_8UC1 || mtype == CV_8SC1) && mask.size() == size);
+        NpuMat onesMask, castedMask;
+        onesMask.create(mask.rows, mask.cols, mask.type());
+        callAscendOperator(mask, mask, onesMask, "Div", stream);
+        onesMask.convertTo(castedMask, dst.depth(), stream);
+        callAscendOperator(dst, castedMask, dst, "Mul", stream);
+    }
+
+    if (scale != 1)
+        muls(dst, scale, dst, stream);
+
+    syncOutput(dst, _dst, stream);
+}
+
+void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype,
+         AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream); // scale 1: no rescaling pass
+}
+
+void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream); // scale 1: no rescaling pass
+}
+
+void multiply(InputArray src1, InputArray src2, OutputArray dst, float scale, int dtype,
+              AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Mul", stream); // unmasked
+}
+
+void divide(InputArray src1, InputArray src2, OutputArray dst, float scale, int dtype,
+            AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, noArray(), scale, dtype, "Div", stream); // unmasked
+}
+
+void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
+                 AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream); // -1: keep source depth
+}
+
+void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
+                AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream); // -1: keep source depth
+}
+
+void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask,
+                 AscendStream& stream)
+{
+    arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream); // -1: keep source depth
+}
+
+void bitwise_not(InputArray src, OutputArray dst, InputArray mask, AscendStream& stream)
+{
+    arithm_op(src, noArray(), dst, mask, 1, -1, "Invert", stream); // unary: no second operand
+}
+
+void addWeighted(InputArray _src1, double alpha, InputArray _src2, double beta, double gamma,
+                 OutputArray _dst, int dtype, AscendStream& stream)
+{
+    // dst = src1 * alpha + src2 * beta + gamma, issued as four operator calls on `stream`.
+    NpuMat src1 = getInputMat(_src1, stream);
+    NpuMat src2 = getInputMat(_src2, stream);
+
+    // A negative dtype means "same depth as the first input".
+    dtype = (dtype < 0) ? src1.depth() : dtype;
+
+    CV_Assert(src2.depth() == src1.depth() && src2.size() == src1.size() &&
+              src1.channels() == src2.channels());
+
+    const int type = CV_MAKE_TYPE(dtype, src1.channels());
+    const Size size = src1.size();
+    NpuMat dst = getOutputMat(_dst, src1.rows, src1.cols, type, stream);
+
+    // TODO Consider overflow, should extend type or not?
+    NpuMat scaled1(size, type), scaled2(size, type), blended(size, type);
+    muls(src1, alpha, scaled1, stream);
+    muls(src2, beta, scaled2, stream);
+    callAscendOperator(scaled1, scaled2, blended, "Add", stream);
+    adds(blended, gamma, dst, stream);
+
+    syncOutput(dst, _dst, stream);
+}
+
+double threshold(NpuMat& src, NpuMat& dst, double thresh, double maxval, int type,
+                 AscendStream& stream)
+{
+    // Applies a fixed-level threshold to src, writes the result into dst and returns thresh.
+    // `type` takes the numeric cv::ThresholdTypes values so that imgproc is not a dependency.
+    CV_Assert(type <= 4 /*THRESH_TOZERO_INV*/);
+
+    NpuMat threshMat(src.size(), src.type());
+    // Raw "Threshold" operator output; presumably 1 above thresh and 0 elsewhere - confirm.
+    AclFloatAttribute attr("threshold", (float)thresh);
+    std::vector<AclAttribute*> attrs{&attr};
+    callAscendOperator(src, threshMat, "Threshold", stream, attrs);
+
+    // THRESH_*_INV and THRESH_TRUNC need the complement of threshMat (ones - threshMat).
+    // THRESH_BINARY_INV = 1, THRESH_TRUNC = 2, THRESH_TOZERO_INV = 4.
+    if (type == 1 || type == 2 || type == 4)
+    {
+        NpuMat threshInvMat(src.size(), src.type());
+        NpuMat ones(src.size(), src.type());
+        Scalar s(1, 1, 1, 1);
+        ones.setTo(s, stream);
+        callAscendOperator(ones, threshMat, threshInvMat, "Sub", stream);
+
+        if (type == 1)
+        {
+            muls(threshInvMat, maxval, dst, stream); // complement * maxval
+        }
+        else if (type == 2)
+        {
+            NpuMat ToZeroInvMat(src.size(), src.type());
+            NpuMat TruncMat(src.size(), src.type());
+            callAscendOperator(threshInvMat, src, ToZeroInvMat, "Mul", stream); // complement * src
+            muls(threshMat, thresh, TruncMat, stream);                          // threshMat * thresh
+            callAscendOperator(ToZeroInvMat, TruncMat, dst, "Add", stream);
+        }
+        else
+        {
+            callAscendOperator(threshInvMat, src, dst, "Mul", stream); // complement * src
+        }
+    }
+    else
+    {
+        if (type == 0) /* THRESH_BINARY = 0 */
+        {
+            muls(threshMat, maxval, dst, stream);
+        }
+        else if (type == 3) /* THRESH_TOZERO = 3 */
+        {
+            callAscendOperator(threshMat, src, dst, "Mul", stream);
+        }
+        else
+        {
+            CV_Error(Error::AscendApiCallError, "Unknown/unsupported threshold type");
+        }
+    }
+    return thresh;
+}
+
+double threshold(InputArray _src, OutputArray _dst, double thresh, double maxval, int type,
+                 AscendStream& stream)
+{
+    NpuMat npuSrc = getInputMat(_src, stream); // stage the input on the NPU
+    NpuMat npuDst = getOutputMat(_dst, npuSrc.rows, npuSrc.cols, npuSrc.type(), stream);
+    const double result = threshold(npuSrc, npuDst, thresh, maxval, type, stream);
+    syncOutput(npuDst, _dst, stream); // copies back only when _dst is not an NpuMat
+    return result;
+}
+
+#define OpScalar(name, op)                                                        \
+    void name(const NpuMat& arr, float scalar, NpuMat& dst, AscendStream& stream) \
+    {                                                                             \
+        AclFloatAttribute attr("value", scalar);                                  \
+        std::vector<AclAttribute*> attrs{&attr};                                  \
+        callAscendOperator(arr, dst, #op, stream, attrs);                         \
+    }
+// Generates name(arr, scalar, dst, stream): runs operator `op` with scalar as attribute "value".
+OpScalar(muls, Muls);
+OpScalar(adds, Adds);
+
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannops/src/npumat.cpp b/modules/cannops/src/npumat.cpp
new file mode 100644
index 00000000000..0332b891ec9
--- /dev/null
+++ b/modules/cannops/src/npumat.cpp
@@ -0,0 +1,276 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "precomp.hpp"
+#include <iostream>
+
+namespace
+{
+class DefaultAllocator : public cv::cann::NpuMat::Allocator
+{
+public:
+    std::shared_ptr<uchar> allocate(size_t size) CV_OVERRIDE;
+    bool allocate(cv::cann::NpuMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE;
+};
+// Returns `size` bytes obtained from aclrtMallocWarpper; the deleter frees them again.
+std::shared_ptr<uchar> DefaultAllocator::allocate(size_t size)
+{
+    uchar* buffer = nullptr;
+    cv::cann::aclrtMallocWarpper(reinterpret_cast<void**>(&buffer), size);
+    return std::shared_ptr<uchar>(buffer, [](void* p) { cv::cann::aclrtFreeWarpper(p); });
+}
+// Gives `mat` a tightly packed rows x cols buffer: the row pitch equals one row of elements.
+bool DefaultAllocator::allocate(cv::cann::NpuMat* mat, int rows, int cols, size_t elemSize)
+{
+    const size_t rowBytes = elemSize * cols;
+    mat->data = allocate(rowBytes * rows);
+    mat->step = rowBytes;
+    return true;
+}
+
+DefaultAllocator cannDefaultAllocator;
+cv::cann::NpuMat::Allocator* g_defaultAllocator = &cannDefaultAllocator;
+} // namespace
+
+namespace cv
+{
+namespace cann
+{
+NpuMat::Allocator* NpuMat::defaultAllocator() { return g_defaultAllocator; }
+// Replaces the allocator returned by defaultAllocator(); a null pointer is rejected.
+void NpuMat::setDefaultAllocator(NpuMat::Allocator* allocator)
+{
+    CV_Assert(allocator != 0);
+    g_defaultAllocator = allocator;
+}
+
+// TODO: this function is copied from matrix.cpp, where it is a local symbol and therefore
+// cannot be referenced from here.
+static int updateContinuityFlag(int flags, int dims, const int* size, const size_t* step)
+{
+    int i, j;
+    for (i = 0; i < dims; i++)
+    {
+        if (size[i] > 1)
+            break;
+    }
+    // i is now the first dimension larger than 1 (or dims if there is none).
+    uint64 t = (uint64)size[std::min(i, dims - 1)] * CV_MAT_CN(flags);
+    for (j = dims - 1; j > i; j--)
+    {
+        t *= size[j];
+        if (step[j] * size[j] < step[j - 1])
+            break;
+    }
+    // Continuous only if the loop ran to completion (no gap found) and t still fits in an int.
+    if (j <= i && t == (uint64)(int)t)
+        return flags | Mat::CONTINUOUS_FLAG;
+    return flags & ~Mat::CONTINUOUS_FLAG;
+}
+
+void NpuMat::updateContinuityFlag()
+{
+    int sz[] = {rows, cols};             // the matrix is described as a 2-D array
+    size_t steps[] = {step, elemSize()}; // row pitch, then the size of one element
+    flags = cv::cann::updateContinuityFlag(flags, 2, sz, steps);
+}
+
+void NpuMat::create(int _rows, int _cols, int _type)
+{
+    CV_DbgAssert(_rows >= 0 && _cols >= 0);
+    // Allocates a _rows x _cols buffer of _type unless the matrix already has that shape.
+    _type &= Mat::TYPE_MASK;
+
+    if (rows == _rows && cols == _cols && type() == _type && data)
+        return;
+    // A zero-sized request skips the branch below and leaves the current members untouched.
+    if (_rows > 0 && _cols > 0)
+    {
+        flags = Mat::MAGIC_VAL + _type;
+        rows = _rows;
+        cols = _cols;
+
+        const size_t esz = elemSize();
+        // The allocator is expected to fill in `data` and `step` of this matrix.
+        bool allocSuccess = allocator->allocate(this, rows, cols, esz);
+
+        if (!allocSuccess)
+        {
+            // The custom allocator failed: switch to the default allocator and retry.
+            allocator = defaultAllocator();
+            allocSuccess = allocator->allocate(this, rows, cols, esz);
+            CV_Assert(allocSuccess);
+        }
+        // A row pitch equal to the row payload means there is no padding between rows.
+        if (esz * cols == step)
+            flags |= Mat::CONTINUOUS_FLAG;
+
+        datastart = data.get();
+        dataend = data.get() + step * (rows - 1) + cols * esz;
+    }
+}
+
+void NpuMat::upload(InputArray arr) { upload(arr, AscendStream::Null()); }
+// Resizes this matrix to match `arr` and copies its rows into the buffer with a 2-D copy.
+void NpuMat::upload(InputArray arr, AscendStream& stream)
+{
+    Mat mat = arr.getMat();
+    CV_DbgAssert(!mat.empty());
+    create(mat.rows, mat.cols, mat.type());
+    aclrtMemcpy2dWarpper(data, 0, step, mat.data, mat.step[0], cols * elemSize(), rows, stream);
+}
+
+void NpuMat::download(OutputArray dst) const { download(dst, AscendStream::Null()); }
+// Creates _dst with this matrix's size and type, then copies the rows out with a 2-D copy.
+void NpuMat::download(OutputArray _dst, AscendStream& stream) const
+{
+    CV_DbgAssert(!empty());
+
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat();
+    aclrtMemcpy2dWarpper(dst.data, dst.step[0], data, 0, step, cols * elemSize(), rows, stream);
+}
+
+NpuMat::NpuMat(int rows_, int cols_, int type_, Scalar& s_, NpuMat::Allocator* allocator_)
+    : flags(0), rows(rows_), cols(cols_), step(0), datastart(0), dataend(0), allocator(allocator_)
+{
+    create(rows_, cols_, type_); // allocate with the supplied allocator ...
+    setTo(s_);                   // ... then fill through the single-argument setTo
+}
+// Same as the constructor above, with the shape given as a Size (width x height).
+NpuMat::NpuMat(Size size_, int type_, Scalar& s_, NpuMat::Allocator* allocator_)
+    : flags(0), rows(size_.height), cols(size_.width), step(0), datastart(0), dataend(0),
+      allocator(allocator_)
+{
+    create(size_.height, size_.width, type_);
+    setTo(s_);
+}
+
+NpuMat::NpuMat(InputArray _m, const Rect& roi) : NpuMat(_m, roi, AscendStream::Null()) {}
+// Deep-copies the `roi` region of _m into a new packed buffer, staged through host memory.
+NpuMat::NpuMat(InputArray _m, const Rect& roi, AscendStream& stream)
+    : rows(roi.height), cols(roi.width), allocator(defaultAllocator())
+{
+    NpuMat m = getInputMat(_m, stream);
+    step = m.step;
+    data = m.data;
+    flags = m.flags;
+    CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y &&
+              0 <= roi.height && roi.y + roi.height <= m.rows);
+    size_t esz = CV_ELEM_SIZE(flags); // already covers every channel of one element
+    size_t sizeMem = esz * roi.width * roi.height;
+    size_t offset = roi.y * m.step + roi.x * esz;
+    // RAII staging buffer: no leak if a copy throws. NOTE(review): must outlive async copies.
+    std::unique_ptr<void, decltype(&free)> hostBuf(malloc(sizeMem), &free);
+    void* dst = hostBuf.get();
+    size_t dpitch = roi.width * esz;
+    std::shared_ptr<uchar> dstDevice = allocator->allocate(sizeMem);
+    aclrtMemcpy2dWarpper(dst, dpitch, data, offset, step, dpitch, roi.height, stream);
+    aclrtMemcpy2dWarpper(dstDevice, 0, dpitch, dst, dpitch, dpitch, roi.height, stream);
+    data = dstDevice;
+    step = dpitch;
+    updateContinuityFlag();
+}
+
+NpuMat& NpuMat::setTo(const Scalar& sc) { return setTo(sc, AscendStream::Null()); }
+// Sets every element to `sc`: the buffer is zeroed, then "Add" writes 0 + sc into a new matrix.
+NpuMat& NpuMat::setTo(const Scalar& sc, AscendStream& stream)
+{
+    size_t totalBytes = (size_t)rows * cols * elemSize();
+    if (totalBytes == 0)
+        return *this;
+
+    aclrtMemsetWarpper(data, 0, totalBytes, stream);
+
+    NpuMat dst(rows, cols, type());
+    // TODO use AssignAdd to avoid memcpy, or use broadcast.
+    callAscendOperator(*this, sc, false, dst, "Add", stream);
+    swap(dst);
+
+    return *this;
+}
+
+NpuMat& NpuMat::setTo(float sc) { return setTo(sc, AscendStream::Null()); }
+// Sets every element to `sc`: the buffer is zeroed, then adds() writes 0 + sc into a new matrix.
+NpuMat& NpuMat::setTo(float sc, AscendStream& stream)
+{
+    size_t totalBytes = (size_t)rows * cols * elemSize();
+    if (totalBytes == 0)
+        return *this;
+
+    aclrtMemsetWarpper(data, 0, totalBytes, stream);
+
+    NpuMat dst(rows, cols, type());
+    adds(*this, sc, dst, stream);
+    swap(dst);
+
+    return *this;
+}
+
+void NpuMat::convertTo(NpuMat& dst, int rtype) const
+{
+    convertTo(dst, rtype, AscendStream::Null());
+}
+// Converts into `dst` with depth _rtype and the same channel count, using the "Cast" operator.
+void NpuMat::convertTo(NpuMat& dst, int _rtype, AscendStream& _stream) const
+{
+    int cn = channels();
+    dst.create(rows, cols, CV_MAKE_TYPE(_rtype, cn));
+    callAscendOperator(*this, dst, "Cast", _stream);
+}
+
+static NpuMat getNpuMat(InputArray arr)
+{
+    _InputArray::KindFlag k = arr.kind();
+    if (k == _InputArray::NPU_MAT)
+    {
+        const cann::NpuMat* n_mat = (const cann::NpuMat*)arr.getObj();
+        return *n_mat;
+    }
+    // An array of kind NONE maps to an empty NpuMat.
+    if (k == _InputArray::NONE)
+        return cann::NpuMat();
+    // Every other array kind is rejected.
+    CV_Error(cv::Error::StsNotImplemented, "getNpuMat is available only for cann::NpuMat");
+}
+
+NpuMat getInputMat(InputArray _src, AscendStream& stream)
+{
+    // An NpuMat argument is handed back directly; no data is transferred.
+    if (_src.kind() == _InputArray::NPU_MAT)
+        return getNpuMat(_src);
+
+    // Any other non-empty array is uploaded on `stream`; an empty one yields an
+    // empty NpuMat.
+    NpuMat uploaded;
+    if (!_src.empty())
+        uploaded.upload(_src, stream);
+    return uploaded;
+}
+
+NpuMat getOutputMat(OutputArray _dst, int rows, int cols, int type, AscendStream& stream)
+{
+    CV_UNUSED(stream);
+    // NpuMat destination: (re)allocate the caller's matrix in place and return a copy of it.
+    if (_dst.kind() == _InputArray::NPU_MAT)
+    {
+        cann::NpuMat* target = static_cast<cann::NpuMat*>(_dst.getObj());
+        target->create(rows, cols, type);
+        return getNpuMat(_dst);
+    }
+    // Any other destination: compute into a freshly created matrix.
+    NpuMat scratch;
+    scratch.create(rows, cols, type);
+    return scratch;
+}
+
+void syncOutput(const NpuMat& dst, OutputArray _dst, AscendStream& stream)
+{
+    // Nothing is copied back when the destination is itself an NpuMat.
+    if (_dst.kind() == _InputArray::NPU_MAT)
+        return;
+    dst.download(_dst, stream);
+}
+} // namespace cann
+} // namespace cv
diff --git a/modules/cannarithm/src/precomp.hpp b/modules/cannops/src/precomp.hpp
similarity index 67%
rename from modules/cannarithm/src/precomp.hpp
rename to modules/cannops/src/precomp.hpp
index 1541ec80a69..8411cc40407 100644
--- a/modules/cannarithm/src/precomp.hpp
+++ b/modules/cannops/src/precomp.hpp
@@ -5,12 +5,10 @@
 #ifndef __OPENCV_PRECOMP_H__
 #define __OPENCV_PRECOMP_H__
 
-#include <acl/acl.h>
-#include <acl/acl_op_compiler.h>
 #include "opencv2/cann.hpp"
-#include "opencv2/cann_prepare.hpp"
-#include "opencv2/acl_stream_accessor.hpp"
+#include "opencv2/stream_accessor.hpp"
 #include "opencv2/cann_call.hpp"
-#include "opencv2/cann_arithm.hpp"
+#include "opencv2/cann_interface.hpp"
+#include "opencv2/cann_private.hpp"
 
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cannops/test/test_core.cpp b/modules/cannops/test/test_core.cpp
new file mode 100644
index 00000000000..fca24133ca5
--- /dev/null
+++ b/modules/cannops/test/test_core.cpp
@@ -0,0 +1,135 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include <vector>
+
+namespace opencv_test
+{
+namespace
+{
+TEST(IMGPROC, MERGE)
+{
+    Mat m1 = (Mat_<uchar>(2, 2) << 1, 4, 7, 10);
+    Mat m2 = (Mat_<uchar>(2, 2) << 2, 5, 8, 11);
+    Mat m3 = (Mat_<uchar>(2, 2) << 3, 6, 9, 12);
+    Mat channels[3] = {m1, m2, m3};
+    Mat m;
+    cv::merge(channels, 3, m);
+
+    cv::cann::setDevice(0);
+
+    NpuMat a1, a2, a3;
+    a1.upload(m1);
+    a2.upload(m2);
+    a3.upload(m3);
+    NpuMat aclChannels[3] = {a1, a2, a3};
+    std::vector<NpuMat> aclChannelsVector;
+    aclChannelsVector.push_back(a1);
+    aclChannelsVector.push_back(a2);
+    aclChannelsVector.push_back(a3);
+
+    Mat checker1, checker2;
+    cv::cann::merge(aclChannels, 3, checker1);
+    cv::cann::merge(aclChannelsVector, checker2);
+
+    EXPECT_MAT_NEAR(m, checker1, 0.0);
+    EXPECT_MAT_NEAR(m, checker2, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(IMGPROC, SPLIT)
+{
+    char d[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+    Mat m(2, 2, CV_8UC3, d);
+    Mat channels[3];
+    cv::split(m, channels);
+
+    cv::cann::setDevice(0);
+
+    NpuMat aclChannels[3];
+    std::vector<NpuMat> aclChannelsVector;
+
+    cv::cann::split(m, aclChannels);
+    cv::cann::split(m, aclChannelsVector);
+
+    Mat checker1[3], checker2[3];
+    aclChannels[0].download(checker1[0]);
+    aclChannels[1].download(checker1[1]);
+    aclChannels[2].download(checker1[2]);
+
+    aclChannelsVector[0].download(checker2[0]);
+    aclChannelsVector[1].download(checker2[1]);
+    aclChannelsVector[2].download(checker2[2]);
+
+    EXPECT_MAT_NEAR(channels[0], checker1[0], 0.0);
+    EXPECT_MAT_NEAR(channels[1], checker1[1], 0.0);
+    EXPECT_MAT_NEAR(channels[2], checker1[2], 0.0);
+
+    EXPECT_MAT_NEAR(channels[0], checker2[0], 0.0);
+    EXPECT_MAT_NEAR(channels[1], checker2[1], 0.0);
+    EXPECT_MAT_NEAR(channels[2], checker2[2], 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(IMGPROC, TRANSPOSE)
+{
+    Mat cpuMat = randomMat(10, 10, CV_32SC3), cpuRetMat, checker;
+    cv::transpose(cpuMat, cpuRetMat);
+    cv::cann::transpose(cpuMat, checker);
+
+    EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0);
+}
+
+TEST(IMGPROC, FLIP)
+{
+    Mat cpuMat = randomMat(10, 10, CV_32SC3), cpuRetMat, checker;
+
+    // Flip codes in their original order: 0, then a positive code, then a negative one.
+    const int flipCodes[] = {0, 1, -1};
+    for (int flipCode : flipCodes)
+    {
+        // CPU reference first ...
+        cv::flip(cpuMat, cpuRetMat, flipCode);
+
+        // ... then the NPU result, which has to match it exactly.
+        cv::cann::flip(cpuMat, checker, flipCode);
+        EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0);
+    }
+}
+
+TEST(IMGPROC, ROTATE)
+{
+    Mat cpuRetMat, checker, cpuMat = randomMat(3, 5, CV_16S, 0.0, 255.0);
+    // Each rotate mode is checked for an exact match against the CPU implementation.
+    int rotateMode = 0; // ROTATE_90_CLOCKWISE
+    cv::rotate(cpuMat, cpuRetMat, rotateMode);
+    cv::cann::rotate(cpuMat, checker, rotateMode);
+    EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0);
+
+    rotateMode = 1; // ROTATE_180
+    cv::rotate(cpuMat, cpuRetMat, rotateMode);
+    cv::cann::rotate(cpuMat, checker, rotateMode);
+    EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0);
+
+    rotateMode = 2; // ROTATE_90_COUNTERCLOCKWISE
+    cv::rotate(cpuMat, cpuRetMat, rotateMode);
+    cv::cann::rotate(cpuMat, checker, rotateMode);
+    EXPECT_MAT_NEAR(cpuRetMat, checker, 0.0);
+}
+
+TEST(CORE, CROP)
+{
+    Mat cpuOpRet, checker, cpuMat = randomMat(6, 6, CV_32SC3, 0.0, 255.0);
+    Rect b(1, 2, 4, 4);
+    Mat cropped_cv(cpuMat, b);
+    NpuMat cropped_cann(cpuMat, b);
+    cropped_cann.download(checker);
+    EXPECT_MAT_NEAR(cropped_cv, checker, 1e-10);
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannops/test/test_cvtcolor.cpp b/modules/cannops/test/test_cvtcolor.cpp
new file mode 100644
index 00000000000..70dc11f297f
--- /dev/null
+++ b/modules/cannops/test/test_cvtcolor.cpp
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+void cvtColorTest(int code, int cn, int dcn = 3, float diff = 0.0f)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    Mat cpuRet, npuRet;
+
+    // One random 512x512 input per tested depth, generated up front in this order.
+    const Mat inputs[] = {
+        randomMat(512, 512, CV_MAKETYPE(CV_8U, cn), 0.0f, 255.0f),
+        randomMat(512, 512, CV_MAKETYPE(CV_16U, cn), 0.0f, 65535.0f),
+        randomMat(512, 512, CV_MAKETYPE(CV_32F, cn), 0.0f, 65535.0f),
+    };
+
+    // The NPU conversion is compared with the CPU reference using tolerance `diff`.
+    for (const Mat& img : inputs)
+    {
+        cv::cvtColor(img, cpuRet, code, dcn);
+        cv::cann::cvtColor(img, npuRet, code, dcn);
+        EXPECT_MAT_NEAR(cpuRet, npuRet, diff);
+    }
+
+    cv::cann::resetDevice();
+}
+
+TEST(CVT_COLOR, BGR2BGRA) { cvtColorTest(COLOR_BGR2BGRA, 3, 4); }
+TEST(CVT_COLOR, BGRA2BGR) { cvtColorTest(COLOR_BGRA2BGR, 4); }
+TEST(CVT_COLOR, BGR2RGBA) { cvtColorTest(COLOR_BGR2RGBA, 3, 4); }
+TEST(CVT_COLOR, RGBA2BGR) { cvtColorTest(COLOR_RGBA2BGR, 4); }
+TEST(CVT_COLOR, BGR2RGB) { cvtColorTest(COLOR_BGR2RGB, 3); }
+TEST(CVT_COLOR, BGRA2RGBA) { cvtColorTest(COLOR_BGRA2RGBA, 4, 4); }
+
+// Due to parameter accuracy issues, the calculation results have certain accuracy differences.
+TEST(CVT_COLOR, BGR2GRAY) { cvtColorTest(COLOR_BGR2GRAY, 3, 1, 10.0f); }
+TEST(CVT_COLOR, RGB2GRAY) { cvtColorTest(COLOR_RGB2GRAY, 3, 1, 10.0f); }
+TEST(CVT_COLOR, GRAY2BGR) { cvtColorTest(COLOR_GRAY2BGR, 1); }
+TEST(CVT_COLOR, GRAY2BGRA) { cvtColorTest(COLOR_GRAY2BGRA, 1, 4); }
+TEST(CVT_COLOR, BGRA2GRAY) { cvtColorTest(COLOR_BGRA2GRAY, 4, 1, 10.0f); }
+TEST(CVT_COLOR, RGBA2GRAY) { cvtColorTest(COLOR_RGBA2GRAY, 4, 1, 10.0f); }
+
+TEST(CVT_COLOR, BGR2XYZ) { cvtColorTest(COLOR_BGR2XYZ, 3, 3, 50.0f); }
+TEST(CVT_COLOR, RGB2XYZ) { cvtColorTest(COLOR_RGB2XYZ, 3, 3, 50.0f); }
+TEST(CVT_COLOR, XYZ2BGR) { cvtColorTest(COLOR_XYZ2BGR, 3, 3, 150.0f); }
+TEST(CVT_COLOR, XYZ2RGB) { cvtColorTest(COLOR_XYZ2RGB, 3, 3, 150.0f); }
+TEST(CVT_COLOR, XYZ2BGR_DC4) { cvtColorTest(COLOR_XYZ2BGR, 3, 4, 150.0f); }
+TEST(CVT_COLOR, XYZ2RGB_DC4) { cvtColorTest(COLOR_XYZ2RGB, 3, 4, 150.0f); }
+
+TEST(CVT_COLOR, BGR2YCrCb) { cvtColorTest(COLOR_BGR2YCrCb, 3, 3, 10.0f); }
+TEST(CVT_COLOR, RGB2YCrCb) { cvtColorTest(COLOR_RGB2YCrCb, 3, 3, 10.0f); }
+TEST(CVT_COLOR, YCrCb2BGR) { cvtColorTest(COLOR_YCrCb2BGR, 3, 3, 10.0f); }
+TEST(CVT_COLOR, YCrCb2RGB) { cvtColorTest(COLOR_YCrCb2RGB, 3, 3, 10.0f); }
+TEST(CVT_COLOR, YCrCb2BGR_DC4) { cvtColorTest(COLOR_YCrCb2BGR, 3, 4, 10.0f); }
+TEST(CVT_COLOR, YCrCb2RGB_DC4) { cvtColorTest(COLOR_YCrCb2RGB, 3, 4, 10.0f); }
+
+TEST(CVT_COLOR, BGR2YUV) { cvtColorTest(COLOR_BGR2YUV, 3, 3, 10.0f); }
+TEST(CVT_COLOR, RGB2YUV) { cvtColorTest(COLOR_RGB2YUV, 3, 3, 10.0f); }
+TEST(CVT_COLOR, YUV2BGR) { cvtColorTest(COLOR_YUV2BGR, 3, 3, 10.0f); }
+TEST(CVT_COLOR, YUV2RGB) { cvtColorTest(COLOR_YUV2RGB, 3, 3, 10.0f); }
+TEST(CVT_COLOR, YUV2BGR_DC4) { cvtColorTest(COLOR_YUV2BGR, 3, 4, 10.0f); }
+TEST(CVT_COLOR, YUV2RGB_DC4) { cvtColorTest(COLOR_YUV2RGB, 3, 4, 10.0f); }
+
+} // namespace
+} // namespace opencv_test
\ No newline at end of file
diff --git a/modules/cannops/test/test_element_operations.cpp b/modules/cannops/test/test_element_operations.cpp
new file mode 100644
index 00000000000..4364a1ebe23
--- /dev/null
+++ b/modules/cannops/test/test_element_operations.cpp
@@ -0,0 +1,259 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+#include <iostream>
+
+namespace opencv_test
+{
+namespace
+{
+template <typename FCV, typename FCANN, typename... PARAMS>
+void testMatOpMat(FCV cvFunc, FCANN cannFunc, PARAMS... param)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    Mat mat1 = randomMat(10, 10, CV_32SC3);
+    Mat mat2 = randomMat(10, 10, CV_32SC3);
+    Mat cpuDst, check;
+    // CPU reference, then the same operation on the null stream; results must be identical.
+    cvFunc(mat1, mat2, cpuDst, param...);
+    cannFunc(mat1, mat2, check, param..., AscendStream::Null());
+    EXPECT_MAT_NEAR(cpuDst, check, 0.0);
+    // Repeat on an explicit stream; the result is compared only after the stream completed.
+    AscendStream stream;
+    cannFunc(mat1, mat2, check, param..., stream);
+    stream.waitForCompletion();
+    EXPECT_MAT_NEAR(cpuDst, check, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(ELEMENTWISE_OP, MAT_ADD_MAT) { testMatOpMat(cv::add, cv::cann::add, noArray(), -1); }
+
+TEST(ELEMENTWISE_OP, MAT_SUB_MAT) { testMatOpMat(cv::subtract, cv::cann::subtract, noArray(), -1); }
+
+TEST(ELEMENTWISE_OP, MAT_MUL_MAT) { testMatOpMat(cv::multiply, cv::cann::multiply, 1, -1); }
+
+/*
+ * TODO cv::divide rounds each element with cvRound while the Ascend DIV op floors each element.
+ * To make the test case pass, integers are used for all matrices and scalars; fixme once
+ * Ascend supports rounding elements.
+ */
+/*
+TEST(ELEMENTWISE_OP, MAT_DIV_MAT)
+{
+
+    testMatOpMat([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype)
+                 { cv::divide(src1, src2, dst, scale, dtype); },
+                 cv::cann::divide, 1, -1);
+}
+*/
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_MAT)
+{
+    testMatOpMat(cv::bitwise_and, cv::cann::bitwise_and, noArray());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_MAT)
+{
+    testMatOpMat(cv::bitwise_or, cv::cann::bitwise_or, noArray());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_MAT)
+{
+    testMatOpMat(cv::bitwise_xor, cv::cann::bitwise_xor, noArray());
+}
+
+TEST(ELEMENTWISE_OP, MAT_ADD_MAT_WITH_MASK_AND_DTYPE)
+{
+    testMatOpMat(cv::add, cv::cann::add, genMask(), CV_32SC3);
+}
+
+TEST(ELEMENTWISE_OP, MAT_SUB_MAT_WITH_MASK_AND_DTYPE)
+{
+    testMatOpMat(cv::subtract, cv::cann::subtract, genMask(), CV_32SC3);
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_MAT_WITH_MASK)
+{
+    testMatOpMat(cv::bitwise_and, cv::cann::bitwise_and, genMask());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_MAT_WITH_MASK)
+{
+    testMatOpMat(cv::bitwise_or, cv::cann::bitwise_or, genMask());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_MAT_WITH_MASK)
+{
+    testMatOpMat(cv::bitwise_xor, cv::cann::bitwise_xor, genMask());
+}
+
+/* Ascend Mul will cast scale to integer first if the matrix dtype is integer.
+ * The result then does not match; fixme after the Ascend op is updated.
+ */
+float randomScale = randomInterger();
+TEST(ELEMENTWISE_OP, MAT_MUL_MAT_WITH_SCALE)
+{
+    testMatOpMat(cv::multiply, cv::cann::multiply, randomScale, -1);
+}
+
+/*
+TEST(ELEMENTWISE_OP, MAT_DIV_MAT_WITH_SCALE)
+{
+    testMatOpMat([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale, int dtype)
+                 { cv::divide(src1, src2, dst, scale, dtype); },
+                 cv::cann::divide, randomScale, -1);
+}
+*/
+
+template <typename FCV, typename FCANN, typename... PARAMS>
+void testMatOpScalar(FCV cvFunc, FCANN cannFunc, PARAMS... param)
+{
+    Scalar scalar = randomScalar();
+    Mat mat(10, 10, CV_32SC3, randomScalar());
+    Mat cpuDst1, cpuDst2, checker1, checker2;
+    // The CPU reference uses a constant matrix filled with `scalar`, once on each side.
+    cvFunc(Mat(10, 10, CV_32SC3, scalar), mat, cpuDst1, param...);
+    cvFunc(mat, Mat(10, 10, CV_32SC3, scalar), cpuDst2, param...);
+    cv::cann::setDevice(DEVICE_ID);
+    // The NPU call receives the Scalar itself, first on the null stream ...
+    cannFunc(scalar, mat, checker1, param..., AscendStream::Null());
+    cannFunc(mat, scalar, checker2, param..., AscendStream::Null());
+    EXPECT_MAT_NEAR(cpuDst1, checker1, 0.0);
+    EXPECT_MAT_NEAR(cpuDst2, checker2, 0.0);
+    // ... then on an explicit stream, compared only after the stream has completed.
+    AscendStream stream;
+    cannFunc(scalar, mat, checker1, param..., stream);
+    cannFunc(mat, scalar, checker2, param..., stream);
+    stream.waitForCompletion();
+    EXPECT_MAT_NEAR(cpuDst1, checker1, 0.0);
+    EXPECT_MAT_NEAR(cpuDst2, checker2, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(ELEMENTWISE_OP, MAT_ADD_SCALAR) { testMatOpScalar(cv::add, cv::cann::add, noArray(), -1); }
+
+TEST(ELEMENTWISE_OP, MAT_SUB_SCALAR)
+{
+    testMatOpScalar(cv::subtract, cv::cann::subtract, noArray(), -1);
+}
+
+TEST(ELEMENTWISE_OP, MAT_MUL_SCALAR) { testMatOpScalar(cv::multiply, cv::cann::multiply, 1, -1); }
+
+/*
+TEST(ELEMENTWISE_OP, MAT_DIV_SCALAR)
+{
+    testMatOpScalar([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale,
+                       int dtype) { cv::divide(src1, src2, dst, scale, dtype); },
+                    cv::cann::divide, 1, -1);
+}
+*/
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_SCALAR)
+{
+    testMatOpScalar(cv::bitwise_and, cv::cann::bitwise_and, noArray());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_SCALAR)
+{
+    testMatOpScalar(cv::bitwise_or, cv::cann::bitwise_or, noArray());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_SCALAR)
+{
+    testMatOpScalar(cv::bitwise_xor, cv::cann::bitwise_xor, noArray());
+}
+
+TEST(ELEMENTWISE_OP, MAT_ADD_SCALAR_WITH_MASK_AND_DETYPE)
+{
+    testMatOpScalar(cv::add, cv::cann::add, genMask(), CV_32SC3);
+}
+
+TEST(ELEMENTWISE_OP, MAT_SUB_SCALAR_WITH_MASK_AND_DETYPE)
+{
+    testMatOpScalar(cv::subtract, cv::cann::subtract, genMask(), CV_32SC3);
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_AND_SCALAR_WITH_MASK)
+{
+    testMatOpScalar(cv::bitwise_and, cv::cann::bitwise_and, genMask());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_OR_SCALAR_WITH_MASK)
+{
+    testMatOpScalar(cv::bitwise_or, cv::cann::bitwise_or, genMask());
+}
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_XOR_SCALAR_WITH_MASK)
+{
+    testMatOpScalar(cv::bitwise_xor, cv::cann::bitwise_xor, genMask());
+}
+
+TEST(ELEMENTWISE_OP, MAT_MUL_SCALAR_WITH_SCALE)
+{
+    testMatOpScalar(cv::multiply, cv::cann::multiply, randomScale, -1);
+}
+
+/*
+TEST(ELEMENTWISE_OP, MAT_DIV_SCALAR_WITH_SCALE)
+{
+    testMatOpScalar([](const cv::Mat& src1, const cv::Mat& src2, cv::Mat& dst, double scale,
+                       int dtype) { cv::divide(src1, src2, dst, scale, dtype); },
+                    cv::cann::divide, randomScale, -1);
+}
+*/
+
+TEST(ELEMENTWISE_OP, MAT_BITWISE_NOT_1)
+{
+    Mat cpuOpRet, checker, cpuMat = randomMat(10, 10, CV_32SC3);
+
+    cv::cann::setDevice(DEVICE_ID);
+
+    cv::bitwise_not(cpuMat, cpuOpRet);
+    cv::cann::bitwise_not(cpuMat, checker);
+    EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+// TODO random test matrix
+TEST(ELEMENTWISE_OP, MAT_ADD_WEIGHTED_1)
+{
+    Mat cpuOpRet, checker, cpuMat1 = Mat::ones(5, 5, CV_32S), cpuMat2 = Mat::ones(5, 5, CV_32S);
+
+    cv::cann::setDevice(DEVICE_ID);
+
+    cv::addWeighted(cpuMat1, 2, cpuMat2, 3, 5, cpuOpRet);
+    cv::cann::addWeighted(cpuMat1, 2, cpuMat2, 3, 5, checker);
+    EXPECT_MAT_NEAR(cpuOpRet, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(ELEMENTWISE_OP, MAT_THRESHOLD_1)
+{
+    Mat cpuOpRet, checker, cpuMat = randomMat(10, 10, CV_16SC3, 0.0, 255.0);
+
+    NpuMat npuMat, npuMat16F, aclOpRet, aclOpRet16S;
+    cv::cann::setDevice(DEVICE_ID);
+    npuMat.upload(cpuMat);
+    npuMat.convertTo(npuMat16F, CV_16F);
+
+    for (int i = 0; i <= 4; i++)
+    {
+        cv::threshold(cpuMat, cpuOpRet, 128, 250, i);
+        cv::cann::threshold(npuMat16F, aclOpRet, 128, 250, i);
+        aclOpRet.convertTo(aclOpRet16S, CV_16S);
+        aclOpRet16S.download(checker);
+
+        EXPECT_MAT_NEAR(cpuOpRet, checker, 1e-10);
+    }
+
+    cv::cann::resetDevice();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannarithm/test/test_main.cpp b/modules/cannops/test/test_main.cpp
similarity index 93%
rename from modules/cannarithm/test/test_main.cpp
rename to modules/cannops/test/test_main.cpp
index 14bd66005ec..202c6af27ee 100644
--- a/modules/cannarithm/test/test_main.cpp
+++ b/modules/cannops/test/test_main.cpp
@@ -18,4 +18,4 @@ static void initTests()
     ::testing::AddGlobalTestEnvironment(cannEnv);
 }
 
-CV_TEST_MAIN("cannarithm", initTests());
+CV_TEST_MAIN("cannops", initTests());
diff --git a/modules/cannops/test/test_npumat.cpp b/modules/cannops/test/test_npumat.cpp
new file mode 100644
index 00000000000..7e40afda184
--- /dev/null
+++ b/modules/cannops/test/test_npumat.cpp
@@ -0,0 +1,146 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+class DummyAllocator : public NpuMat::Allocator
+{
+public:
+    // Size-only overload: performs no allocation and yields an empty pointer.
+    std::shared_ptr<uchar> allocate(size_t size) CV_OVERRIDE
+    {
+        CV_UNUSED(size);
+        return std::shared_ptr<uchar>();
+    }
+    // Matrix overload: installs the fixed fake address 0x12345 with a deleter that does nothing.
+    bool allocate(cv::cann::NpuMat* mat, int rows, int cols, size_t elemSize) CV_OVERRIDE
+    {
+        CV_UNUSED(rows); CV_UNUSED(cols); CV_UNUSED(elemSize);
+        mat->data = std::shared_ptr<uchar>((uchar*)0x12345, [](void*) {});
+        return true;
+    }
+};
+
+TEST(NpuMat, Construct)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    // 1 Default constructor: a fresh NpuMat must reference the current default allocator.
+    NpuMat defaultNpuMat;
+    NpuMat::Allocator* defaultAllocator = NpuMat::defaultAllocator();
+    ASSERT_EQ(defaultNpuMat.allocator, defaultAllocator);
+
+    // 2 get & set allocator. EXPECT (non-fatal) so the stack-local dummy is always uninstalled below.
+    DummyAllocator dummyAllocator;
+    NpuMat::setDefaultAllocator(&dummyAllocator);
+    EXPECT_EQ(NpuMat::defaultAllocator(), &dummyAllocator);
+    NpuMat::setDefaultAllocator(defaultAllocator);
+
+    // 3 constructs NpuMat of the specified size and type
+    NpuMat specifiedSizeNpuMat1(5, 6, CV_8UC3);
+    NpuMat specifiedSizeNpuMat2(Size(300, 200), CV_64F);
+
+    ASSERT_EQ(specifiedSizeNpuMat1.rows, 5);
+    ASSERT_EQ(specifiedSizeNpuMat1.cols, 6);
+    ASSERT_EQ(specifiedSizeNpuMat1.depth(), CV_8U);
+    ASSERT_EQ(specifiedSizeNpuMat1.channels(), 3);
+
+    ASSERT_EQ(specifiedSizeNpuMat2.cols, 300);
+    ASSERT_EQ(specifiedSizeNpuMat2.rows, 200);
+    ASSERT_EQ(specifiedSizeNpuMat2.depth(), CV_64F);
+    ASSERT_EQ(specifiedSizeNpuMat2.channels(), 1);
+
+    // 4 constructs NpuMat and fills it with the specified value s
+    srand((unsigned int)(time(NULL)));
+    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+
+    Mat scalarToMat(7, 8, CV_8UC3, sc);
+    NpuMat scalarToNpuMat1(7, 8, CV_8UC3, sc);
+    Mat scalarToMatChecker;
+    scalarToNpuMat1.download(scalarToMatChecker);
+
+    EXPECT_MAT_NEAR(scalarToMat, scalarToMatChecker, 0.0);
+
+    NpuMat scalarToNpuMat2(Size(123, 345), CV_32S);
+
+    ASSERT_EQ(scalarToNpuMat1.rows, 7);
+    ASSERT_EQ(scalarToNpuMat1.cols, 8);
+    ASSERT_EQ(scalarToNpuMat1.depth(), CV_8U);
+    ASSERT_EQ(scalarToNpuMat1.channels(), 3);
+
+    ASSERT_EQ(scalarToNpuMat2.cols, 123);
+    ASSERT_EQ(scalarToNpuMat2.rows, 345);
+    ASSERT_EQ(scalarToNpuMat2.depth(), CV_32S);
+    ASSERT_EQ(scalarToNpuMat2.channels(), 1);
+
+    // 5 builds NpuMat from host memory (local is named hostMat so it does not hide randomMat()).
+    Scalar sc2(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+    Mat hostMat(7, 8, CV_8UC3, sc2);
+    InputArray arr = hostMat;
+
+    NpuMat fromInputArray(arr, AscendStream::Null());
+    Mat hostMatChecker;
+    fromInputArray.download(hostMatChecker);
+    EXPECT_MAT_NEAR(hostMat, hostMatChecker, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(NpuMat, Assignment)
+{
+    // Copy-assignment must carry over rows, cols, type and the data pointer set by the allocator.
+    DummyAllocator fakeAllocator;
+    NpuMat target;
+    NpuMat source(3, 4, CV_8SC1, &fakeAllocator);
+    target = source;
+    ASSERT_EQ(target.rows, 3);
+    ASSERT_EQ(target.cols, 4);
+    ASSERT_EQ(target.depth(), CV_8S);
+    ASSERT_EQ(target.channels(), 1);
+    ASSERT_EQ(target.data.get(), (uchar*)0x12345);
+}
+
+TEST(NpuMat, SetTo)
+{
+    // Filling an NpuMat via setTo() must match a host Mat constructed from the same scalar.
+    cv::cann::setDevice(DEVICE_ID);
+    srand((unsigned int)(time(NULL)));
+    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+
+    NpuMat npuMat(2, 2, CV_8UC4);
+    npuMat.setTo(sc);
+    Mat mat(2, 2, CV_8UC4, sc);
+    Mat checker;
+    npuMat.download(checker);
+
+    EXPECT_MAT_NEAR(mat, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+TEST(NpuMat, ConvertTo)
+{
+    // NpuMat::convertTo from CV_8UC4 to CV_16S must match a CV_16SC4 host Mat of the same scalar.
+    cv::cann::setDevice(DEVICE_ID);
+    srand((unsigned int)(time(NULL)));
+    Scalar sc(rand() % 256, rand() % 256, rand() % 256, rand() % 256);
+
+    NpuMat npuMat(2, 2, CV_8UC4, sc);
+    NpuMat convertedNpuMat;
+    npuMat.convertTo(convertedNpuMat, CV_16S);
+    Mat mat(2, 2, CV_16SC4, sc);
+    Mat checker;
+    convertedNpuMat.download(checker);
+
+    EXPECT_MAT_NEAR(mat, checker, 0.0);
+
+    cv::cann::resetDevice();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannarithm/test/test_precomp.hpp b/modules/cannops/test/test_precomp.hpp
similarity index 64%
rename from modules/cannarithm/test/test_precomp.hpp
rename to modules/cannops/test/test_precomp.hpp
index e95abb86e1c..439ad70af0d 100644
--- a/modules/cannarithm/test/test_precomp.hpp
+++ b/modules/cannops/test/test_precomp.hpp
@@ -7,10 +7,21 @@
 
 #include "opencv2/ts.hpp"
 #include "opencv2/cann.hpp"
+#include "opencv2/ts/cuda_test.hpp"
+#include "opencv2/cann_interface.hpp"
 
+using namespace cv;
 using namespace cv::cann;
 #undef EXPECT_MAT_NEAR
 #define EXPECT_MAT_NEAR(m1, m2, eps) EXPECT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps)
 #define ASSERT_MAT_NEAR(m1, m2, eps) ASSERT_PRED_FORMAT3(cvtest::assertMatNear, m1, m2, eps)
 
-#endif
+#define DEVICE_ID 0 // Device index that the tests pass to cv::cann::setDevice().
+// Shared test helpers; the definitions are in test_utils.cpp.
+Mat randomMat(int w, int h, int dtype, float min = 1.0f, float max = 10.0f);
+Scalar randomScalar();
+float randomNum();
+int randomInterger();
+Mat genMask();
+
+#endif //__OPENCV_TEST_PRECOMP_HPP__
diff --git a/modules/cannops/test/test_utils.cpp b/modules/cannops/test/test_utils.cpp
new file mode 100644
index 00000000000..2141ebf906a
--- /dev/null
+++ b/modules/cannops/test/test_utils.cpp
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "test_precomp.hpp"
+
+// Random generators. NOTE(review): randomMat passes w as the Mat row count and h as cols - confirm intended.
+Mat randomMat(int w, int h, int dtype, float min, float max)
+{
+    Mat rnMat(w, h, dtype);
+    RNG rng(getTickCount()); // a new generator seeded from the tick counter on every call
+    rng.fill(rnMat, RNG::UNIFORM, min, max); // uniform fill using min and max as the bounds
+    return rnMat;
+}
+Scalar randomScalar()
+{
+    RNG generator(getTickCount()); // fresh generator seeded from the tick counter
+    Scalar values; // filled in place below
+    generator.fill(values, RNG::UNIFORM, 1.0, 5.0);
+    return values;
+}
+float randomNum()
+{
+    // One double drawn from RNG::uniform(1.0, 5.0), converted to float.
+    RNG generator(getTickCount());
+    return float(generator.uniform(1.0, 5.0));
+}
+
+int randomInterger()
+{
+    // RNG::uniform(int, int) already yields an int, so it is returned without a float round-trip.
+    RNG rng(getTickCount());
+    return rng.uniform(1, 5);
+}
+
+Mat genMask()
+{
+    Mat result(Size(10, 10), CV_8UC1, Scalar(0)); // all-zero 10x10 single-channel mask
+    rectangle(result, cv::Rect(5, 5, 3, 3), Scalar(255), cv::FILLED); // filled 3x3 square at (5, 5)
+    return result;
+}