Commit

[NVIDIA] Add new activation, comparison and reduce operations (openvinotoolkit#705)

* [NVIDIA] Adds abs, cos, cosh, exp, gelu, sin, sinh, sqrt, log activations

* [NVIDIA] Adds greater_equal, less_equal, equal, not_equal operations

* Missing licenses

* Adds hswish

* Small update

* Adds ReduceMax,ReduceMean,ReduceMin,ReduceProd operations

* Support reduction operations with keep_dims == false

* Update StridedSlice to support i32 input

* Adds Mish activation

* Update cuda_opset.md

* Adds Elu activation

* Disable ShuffleChannelsFusion

* Update cuda_opset.md

* Fix ReduceTransformation for the case when Reduce doesn't reduce dims

* ELU: double alpha => float

* Update ReduceTransformation
nkogteva authored Sep 1, 2023
1 parent 394a8cf commit 322c19f
Showing 93 changed files with 3,184 additions and 436 deletions.
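
For reference, the less common of the new activations compute the following element-wise functions; these are the standard textbook definitions, not something spelled out in the diff itself:

```latex
\mathrm{GELU}(x)   = \tfrac{x}{2}\,\bigl(1 + \operatorname{erf}(x/\sqrt{2})\bigr)
\mathrm{HSwish}(x) = x \cdot \tfrac{1}{6}\,\min(\max(x + 3,\, 0),\, 6)
\mathrm{Mish}(x)   = x \cdot \tanh\bigl(\ln(1 + e^{x})\bigr)
\mathrm{ELU}(x)    = \begin{cases} x, & x \ge 0 \\ \alpha\,(e^{x} - 1), & x < 0 \end{cases}
```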
60 changes: 32 additions & 28 deletions modules/nvidia_plugin/docs/cuda_opset.md

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions modules/nvidia_plugin/src/cuda/dnn.hpp
@@ -241,6 +241,39 @@ class DnnReduceAddDescriptor : public DnnReduceTensorDescriptor {
}
};

class DnnReduceMulDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceMulDescriptor(cudnnDataType_t compType) {
set(CUDNN_REDUCE_TENSOR_MUL,
compType,
CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES);
}
};

class DnnReduceMinDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceMinDescriptor(cudnnDataType_t compType) {
set(CUDNN_REDUCE_TENSOR_MIN,
compType,
CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES);
}
};

class DnnReduceMaxDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceMaxDescriptor(cudnnDataType_t compType) {
set(CUDNN_REDUCE_TENSOR_MAX,
compType,
CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES);
}
};

class DnnReduceAvgDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceAvgDescriptor(cudnnDataType_t compType) {
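For context, each of these wrappers configures a `cudnnReduceTensorDescriptor_t` that is later consumed by `cudnnReduceTensor`. A minimal host-side sketch of that flow, assuming a `.get()` accessor for the raw descriptor and pre-built tensor descriptors (names here are illustrative, not from this commit):

```cpp
// Hypothetical call site: ReduceMax over an f32 tensor.
// aDesc/cDesc are cudnnTensorDescriptor_t for the input/output shapes.
CUDA::DnnReduceMaxDescriptor reduceDesc{CUDNN_DATA_FLOAT};

size_t wsSize = 0;
cudnnGetReductionWorkspaceSize(handle, reduceDesc.get(), aDesc, cDesc, &wsSize);

const float alpha = 1.0f, beta = 0.0f;
cudnnReduceTensor(handle,
                  reduceDesc.get(),
                  nullptr, 0,             // no indices: CUDNN_REDUCE_TENSOR_NO_INDICES
                  workspace, wsSize,      // device scratch buffer of wsSize bytes
                  &alpha, aDesc, inPtr,   // input tensor
                  &beta, cDesc, outPtr);  // output receives the reduced values
```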
101 changes: 98 additions & 3 deletions modules/nvidia_plugin/src/cuda/math.cuh
@@ -78,7 +78,7 @@ inline __device__ T max(T x, T y) {

template <typename T>
inline __device__ T exp(T x) {
return ::exp(x);
return static_cast<T>(::exp(static_cast<float>(x)));
}

template <typename T>
@@ -88,12 +88,47 @@ inline __device__ T pow(T x, T y) {

template <typename T>
inline __device__ T sqrt(T a) {
return ::sqrt(a);
return static_cast<T>(::sqrtf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T abs(T a) {
return ::abs(a);
return static_cast<T>(::fabsf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T tanh(T a) {
return static_cast<T>(::tanhf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T erff(T a) {
return ::erff(a);
}

template <typename T>
inline __device__ T sin(T a) {
return static_cast<T>(::sinf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T sinh(T a) {
return static_cast<T>(::sinhf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T cos(T a) {
return static_cast<T>(::cosf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T cosh(T a) {
return static_cast<T>(::coshf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T log(T a) {
return static_cast<T>(::logf(static_cast<float>(a)));
}

#ifdef __CUDACC__
@@ -123,6 +158,21 @@ inline __device__ __half abs<__half>(__half x) {
return ::__habs(x);
}

template <>
inline __device__ __half sin<__half>(__half x) {
return ::hsin(x);
}

template <>
inline __device__ __half cos<__half>(__half x) {
return ::hcos(x);
}

template <>
inline __device__ __half log<__half>(__half x) {
return ::hlog(x);
}

#else // defined (CUDA_HAS_HALF_MATH)

inline __device__ __half floor(__half x) { return floor(static_cast<float>(x)); }
@@ -154,6 +204,21 @@ inline __device__ __half abs<__half>(__half x) {
return ::abs(static_cast<float>(x));
}

template <>
inline __device__ __half sin<__half>(__half x) {
return ::sin(static_cast<float>(x));
}

template <>
inline __device__ __half cos<__half>(__half x) {
return ::cos(static_cast<float>(x));
}

template <>
inline __device__ __half log<__half>(__half x) {
return ::log(static_cast<float>(x));
}

#endif // defined (CUDA_HAS_HALF_MATH)

/* ================================================= */
@@ -186,6 +251,21 @@ inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) {
return ::__habs(x);
}

template <>
inline __device__ __nv_bfloat16 sin<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hsin(x);
}

template <>
inline __device__ __nv_bfloat16 cos<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hcos(x);
}

template <>
inline __device__ __nv_bfloat16 log<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hlog(x);
}

#else // defined (CUDA_HAS_BF16_MATH)

inline __device__ __nv_bfloat16 floor(__nv_bfloat16 x) { return floor(static_cast<float>(x)); }
@@ -217,6 +297,21 @@ inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) {
return abs<float>(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 sin<__nv_bfloat16>(__nv_bfloat16 x) {
return ::sin(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 cos<__nv_bfloat16>(__nv_bfloat16 x) {
return ::cos(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 log<__nv_bfloat16>(__nv_bfloat16 x) {
return ::log(static_cast<float>(x));
}

#endif // defined (CUDA_HAS_BF16_MATH)
#endif // defined (CUDA_HAS_BF16_TYPE)
/* ================================================= */
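Taken together, these overloads and specializations let a single kernel template cover `float`, `__half`, and `__nv_bfloat16`. A minimal sketch of the intended use (an illustrative kernel, not part of the commit; the include path is assumed):

```cuda
#include "cuda/math.cuh"  // assumed path, relative to modules/nvidia_plugin/src

// Each thread applies the wrapper, which resolves to hsin() where native
// half/bf16 math is available and to the float fallback otherwise.
template <typename T>
__global__ void sin_kernel(const T* in, T* out, size_t n) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = CUDA::math::sin(in[i]);
    }
}
```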
2 changes: 1 addition & 1 deletion modules/nvidia_plugin/src/cuda_op_buffers_extractor.hpp
@@ -150,7 +150,7 @@ class OperationBuffersExtractor {
*/
template <typename TNode>
static std::size_t GetTensorByteSize(const TNode& node) {
return node.get_element_type().size() * shape_size(node.get_shape());
return node.get_element_type().size() * std::max(std::size_t(1), shape_size(node.get_shape()));
}

/**
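The new `std::max` guard matters for shapes that contain a zero dimension, where `ov::shape_size` collapses to zero; the extractor then still reserves one element's worth of bytes. A small illustration of the arithmetic (assumed interpretation of the change):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

#include "openvino/core/shape.hpp"

void shape_size_guard_example() {
    // ov::shape_size is the product of the dimensions, so any zero
    // dimension zeroes the whole product.
    ov::Shape s{2, 0, 3};
    assert(ov::shape_size(s) == 0);
    // With the guard, GetTensorByteSize still reports one element's bytes.
    assert(std::max(std::size_t(1), ov::shape_size(s)) == std::size_t(1));
}
```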
29 changes: 29 additions & 0 deletions modules/nvidia_plugin/src/kernels/abs.cu
@@ -0,0 +1,29 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "abs.hpp"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

namespace cumath = CUDA::math;

template <typename T>
struct AbsOpImpl {
__device__ static inline T op(T x) {
return cumath::abs(x);
}
};

Abs::Abs(Type_t element_type, size_t max_threads_per_block, size_t num_elements)
: impl_{element_type, max_threads_per_block, num_elements} {}

void Abs::operator()(cudaStream_t stream, const void* in0, void* out) const {
impl_(stream, in0, out);
}

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov
33 changes: 33 additions & 0 deletions modules/nvidia_plugin/src/kernels/abs.hpp
@@ -0,0 +1,33 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "details/cuda_type_traits.hpp"
#include "details/elementwise_unary.cuh"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

template <typename T>
struct AbsOpImpl;
/**
* Elementwise Abs operation
*/
class Abs {
public:
Abs(Type_t element_type, size_t max_threads_per_block, size_t num_elements);

void operator()(cudaStream_t stream, const void* in0, void* out) const;

private:
ElementwiseUnary<AllElementTypesSwitch, AbsOpImpl> impl_;
};

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov
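
A hypothetical call site for this wrapper, with the element-type enumerator, launch geometry, and buffer names all assumed rather than taken from the diff:

```cpp
using namespace ov::nvidia_gpu::kernel;

// Sketch: enqueue element-wise Abs over n float32 values already on the device.
Abs abs_op{Type_t::f32,                  // assumed enumerator for float32
           /*max_threads_per_block=*/256,
           /*num_elements=*/n};
abs_op(stream, device_in, device_out);   // stream is a cudaStream_t
```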


36 changes: 36 additions & 0 deletions modules/nvidia_plugin/src/kernels/comparison.cu
@@ -53,11 +53,31 @@ template <typename T>
struct OpImpl<T, Comparison::Op_t::GREATER> {
__device__ static inline bool op(T left, T right) { return left > right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::GREATER_EQUAL> {
__device__ static inline bool op(T left, T right) { return left >= right; }
};
template <typename T>
struct OpImpl<T, Comparison::Op_t::LESS> {
__device__ static inline bool op(T left, T right) { return left < right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::LESS_EQUAL> {
__device__ static inline bool op(T left, T right) { return left <= right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::EQUAL> {
__device__ static inline bool op(T left, T right) { return left == right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::NOT_EQUAL> {
__device__ static inline bool op(T left, T right) { return left != right; }
};

Comparison::Comparison(Op_t op_type, Type_t element_type, size_t max_size, size_t num_blocks, size_t threads_per_block)
: op_type_{op_type},
element_type_{element_type},
@@ -136,10 +156,26 @@ void Comparison::Call(Comparison::Op_t type,
Call<T, Comparison::Op_t::GREATER>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::GREATER_EQUAL:
Call<T, Comparison::Op_t::GREATER_EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::LESS:
Call<T, Comparison::Op_t::LESS>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::LESS_EQUAL:
Call<T, Comparison::Op_t::LESS_EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::EQUAL:
Call<T, Comparison::Op_t::EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::NOT_EQUAL:
Call<T, Comparison::Op_t::NOT_EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
default:
throw_ov_exception(fmt::format("Input operation = {} is not supported by Comparison operation !!",
static_cast<Type_t>(type)));
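The switch above is the usual runtime-to-compile-time dispatch: each `Op_t` case instantiates `Call` with the matching `OpImpl` specialization, so the comparison inlines into the kernel with no per-element branching on the op type. A hypothetical construction of one of the new variants (the enumerator and launch-geometry values are assumptions):

```cpp
using ov::nvidia_gpu::kernel::Comparison;
namespace kernel = ov::nvidia_gpu::kernel;

// Element-wise a <= b over broadcast-compatible inputs; the output is
// boolean, one byte per element.
Comparison less_equal{Comparison::Op_t::LESS_EQUAL,
                      kernel::Type_t::f32,   // assumed enumerator
                      max_size, num_blocks, threads_per_block};
```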
2 changes: 1 addition & 1 deletion modules/nvidia_plugin/src/kernels/comparison.hpp
@@ -12,7 +12,7 @@ namespace kernel {

class Comparison {
public:
enum class Op_t { GREATER, LESS };
enum class Op_t { GREATER, LESS, EQUAL, GREATER_EQUAL, LESS_EQUAL, NOT_EQUAL };

Comparison(Op_t op, Type_t element_type, size_t max_size, size_t num_blocks, size_t threads_per_block);
Comparison(Comparison&&) = default;
29 changes: 29 additions & 0 deletions modules/nvidia_plugin/src/kernels/cos.cu
@@ -0,0 +1,29 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "cos.hpp"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

namespace cumath = CUDA::math;

template <typename T>
struct CosOpImpl {
__device__ static inline T op(T x) {
return cumath::cos(x);
}
};

Cos::Cos(Type_t element_type, size_t max_threads_per_block, size_t num_elements)
: impl_{element_type, max_threads_per_block, num_elements} {}

void Cos::operator()(cudaStream_t stream, const void* in0, void* out) const {
impl_(stream, in0, out);
}

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov
33 changes: 33 additions & 0 deletions modules/nvidia_plugin/src/kernels/cos.hpp
@@ -0,0 +1,33 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "details/cuda_type_traits.hpp"
#include "details/elementwise_unary.cuh"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

template <typename T>
struct CosOpImpl;
/**
* Elementwise Cos operation
*/
class Cos {
public:
Cos(Type_t element_type, size_t max_threads_per_block, size_t num_elements);

void operator()(cudaStream_t stream, const void* in0, void* out) const;

private:
ElementwiseUnary<AllElementTypesSwitch, CosOpImpl> impl_;
};

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov

