Commit

[NVIDIA] Add new activation, comparison and reduce operations (openvinotoolkit#705)

* [NVIDIA] Adds abs, cos, cosh, exp, gelu, sin, sinh, sqrt, log activations

* [NVIDIA] Adds greater_equal, less_equal, equal, not_equal operations

* Missing licenses

* Adds hswish

* Small update

* Adds ReduceMax,ReduceMean,ReduceMin,ReduceProd operations

* Support reduction operations with keep_dims == false

* Update StridedSlice to support i32 input

* Adds Mish activation

* Update cuda_opset.md

* Adds Elu activation

* Disable ShuffleChannelsFusion

* Update cuda_opset.md

* Fix ReduceTransformation for the case when Reduce doesn't reduce dims

* ELU: double alpha => float

* Update ReduceTransformation
nkogteva authored Sep 1, 2023
1 parent 394a8cf commit 322c19f
Showing 93 changed files with 3,184 additions and 436 deletions.
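
For reference, the less common of the new activations compute the following element-wise functions; these are the standard textbook definitions, not something spelled out in the diff itself:

```latex
\mathrm{GELU}(x)   = \tfrac{x}{2}\,\bigl(1 + \operatorname{erf}(x/\sqrt{2})\bigr)
\mathrm{HSwish}(x) = x \cdot \tfrac{1}{6}\,\min(\max(x + 3,\, 0),\, 6)
\mathrm{Mish}(x)   = x \cdot \tanh\bigl(\ln(1 + e^{x})\bigr)
\mathrm{ELU}(x)    = \begin{cases} x, & x \ge 0 \\ \alpha\,(e^{x} - 1), & x < 0 \end{cases}
```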
60 changes: 32 additions & 28 deletions modules/nvidia_plugin/docs/cuda_opset.md

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions modules/nvidia_plugin/src/cuda/dnn.hpp
@@ -241,6 +241,39 @@ class DnnReduceAddDescriptor : public DnnReduceTensorDescriptor {
}
};

class DnnReduceMulDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceMulDescriptor(cudnnDataType_t compType) {
set(CUDNN_REDUCE_TENSOR_MUL,
compType,
CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES);
}
};

class DnnReduceMinDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceMinDescriptor(cudnnDataType_t compType) {
set(CUDNN_REDUCE_TENSOR_MIN,
compType,
CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES);
}
};

class DnnReduceMaxDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceMaxDescriptor(cudnnDataType_t compType) {
set(CUDNN_REDUCE_TENSOR_MAX,
compType,
CUDNN_PROPAGATE_NAN,
CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES);
}
};

class DnnReduceAvgDescriptor : public DnnReduceTensorDescriptor {
public:
explicit DnnReduceAvgDescriptor(cudnnDataType_t compType) {
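For context, each of these wrappers configures a `cudnnReduceTensorDescriptor_t` that is later consumed by `cudnnReduceTensor`. A minimal host-side sketch of that flow, assuming a `.get()` accessor for the raw descriptor and pre-built tensor descriptors (names here are illustrative, not from this commit):

```cpp
// Hypothetical call site: ReduceMax over an f32 tensor.
// aDesc/cDesc are cudnnTensorDescriptor_t for the input/output shapes.
CUDA::DnnReduceMaxDescriptor reduceDesc{CUDNN_DATA_FLOAT};

size_t wsSize = 0;
cudnnGetReductionWorkspaceSize(handle, reduceDesc.get(), aDesc, cDesc, &wsSize);

const float alpha = 1.0f, beta = 0.0f;
cudnnReduceTensor(handle,
                  reduceDesc.get(),
                  nullptr, 0,             // no indices: CUDNN_REDUCE_TENSOR_NO_INDICES
                  workspace, wsSize,      // device scratch buffer of wsSize bytes
                  &alpha, aDesc, inPtr,   // input tensor
                  &beta, cDesc, outPtr);  // output receives the reduced values
```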
101 changes: 98 additions & 3 deletions modules/nvidia_plugin/src/cuda/math.cuh
@@ -78,7 +78,7 @@ inline __device__ T max(T x, T y) {

template <typename T>
inline __device__ T exp(T x) {
return ::exp(x);
return static_cast<T>(::exp(static_cast<float>(x)));
}

template <typename T>
@@ -88,12 +88,47 @@ inline __device__ T pow(T x, T y) {

template <typename T>
inline __device__ T sqrt(T a) {
return ::sqrt(a);
return static_cast<T>(::sqrtf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T abs(T a) {
return ::abs(a);
return static_cast<T>(::fabsf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T tanh(T a) {
return static_cast<T>(::tanhf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T erff(T a) {
return ::erff(a);
}

template <typename T>
inline __device__ T sin(T a) {
return static_cast<T>(::sinf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T sinh(T a) {
return static_cast<T>(::sinhf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T cos(T a) {
return static_cast<T>(::cosf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T cosh(T a) {
return static_cast<T>(::coshf(static_cast<float>(a)));
}

template <typename T>
inline __device__ T log(T a) {
return static_cast<T>(::logf(static_cast<float>(a)));
}

#ifdef __CUDACC__
@@ -123,6 +158,21 @@ inline __device__ __half abs<__half>(__half x) {
return ::__habs(x);
}

template <>
inline __device__ __half sin<__half>(__half x) {
return ::hsin(x);
}

template <>
inline __device__ __half cos<__half>(__half x) {
return ::hcos(x);
}

template <>
inline __device__ __half log<__half>(__half x) {
return ::hlog(x);
}

#else // defined (CUDA_HAS_HALF_MATH)

inline __device__ __half floor(__half x) { return floor(static_cast<float>(x)); }
@@ -154,6 +204,21 @@ inline __device__ __half abs<__half>(__half x) {
return ::abs(static_cast<float>(x));
}

template <>
inline __device__ __half sin<__half>(__half x) {
return ::sin(static_cast<float>(x));
}

template <>
inline __device__ __half cos<__half>(__half x) {
return ::cos(static_cast<float>(x));
}

template <>
inline __device__ __half log<__half>(__half x) {
return ::log(static_cast<float>(x));
}

#endif // defined (CUDA_HAS_HALF_MATH)

/* ================================================= */
@@ -186,6 +251,21 @@ inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) {
return ::__habs(x);
}

template <>
inline __device__ __nv_bfloat16 sin<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hsin(x);
}

template <>
inline __device__ __nv_bfloat16 cos<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hcos(x);
}

template <>
inline __device__ __nv_bfloat16 log<__nv_bfloat16>(__nv_bfloat16 x) {
return ::hlog(x);
}

#else // defined (CUDA_HAS_BF16_MATH)

inline __device__ __nv_bfloat16 floor(__nv_bfloat16 x) { return floor(static_cast<float>(x)); }
@@ -217,6 +297,21 @@ inline __device__ __nv_bfloat16 abs<__nv_bfloat16>(__nv_bfloat16 x) {
return abs<float>(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 sin<__nv_bfloat16>(__nv_bfloat16 x) {
return ::sin(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 cos<__nv_bfloat16>(__nv_bfloat16 x) {
return ::cos(static_cast<float>(x));
}

template <>
inline __device__ __nv_bfloat16 log<__nv_bfloat16>(__nv_bfloat16 x) {
return ::log(static_cast<float>(x));
}

#endif // defined (CUDA_HAS_BF16_MATH)
#endif // defined (CUDA_HAS_BF16_TYPE)
/* ================================================= */
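Taken together, these overloads and specializations let a single kernel template cover `float`, `__half`, and `__nv_bfloat16`. A minimal sketch of the intended use (an illustrative kernel, not part of the commit; the include path is assumed):

```cuda
#include "cuda/math.cuh"  // assumed path, relative to modules/nvidia_plugin/src

// Each thread applies the wrapper, which resolves to hsin() where native
// half/bf16 math is available and to the float fallback otherwise.
template <typename T>
__global__ void sin_kernel(const T* in, T* out, size_t n) {
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = CUDA::math::sin(in[i]);
    }
}
```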
2 changes: 1 addition & 1 deletion modules/nvidia_plugin/src/cuda_op_buffers_extractor.hpp
@@ -150,7 +150,7 @@ class OperationBuffersExtractor {
*/
template <typename TNode>
static std::size_t GetTensorByteSize(const TNode& node) {
return node.get_element_type().size() * shape_size(node.get_shape());
return node.get_element_type().size() * std::max(std::size_t(1), shape_size(node.get_shape()));
}

/**
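The new `std::max` guard matters for shapes that contain a zero dimension, where `ov::shape_size` collapses to zero; the extractor then still reserves one element's worth of bytes. A small illustration of the arithmetic (assumed interpretation of the change):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

#include "openvino/core/shape.hpp"

void shape_size_guard_example() {
    // ov::shape_size is the product of the dimensions, so any zero
    // dimension zeroes the whole product.
    ov::Shape s{2, 0, 3};
    assert(ov::shape_size(s) == 0);
    // With the guard, GetTensorByteSize still reports one element's bytes.
    assert(std::max(std::size_t(1), ov::shape_size(s)) == std::size_t(1));
}
```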
29 changes: 29 additions & 0 deletions modules/nvidia_plugin/src/kernels/abs.cu
@@ -0,0 +1,29 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "abs.hpp"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

namespace cumath = CUDA::math;

template <typename T>
struct AbsOpImpl {
__device__ static inline T op(T x) {
return cumath::abs(x);
}
};

Abs::Abs(Type_t element_type, size_t max_threads_per_block, size_t num_elements)
: impl_{element_type, max_threads_per_block, num_elements} {}

void Abs::operator()(cudaStream_t stream, const void* in0, void* out) const {
impl_(stream, in0, out);
}

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov
33 changes: 33 additions & 0 deletions modules/nvidia_plugin/src/kernels/abs.hpp
@@ -0,0 +1,33 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "details/cuda_type_traits.hpp"
#include "details/elementwise_unary.cuh"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

template <typename T>
struct AbsOpImpl;
/**
* Elementwise Abs operation
*/
class Abs {
public:
Abs(Type_t element_type, size_t max_threads_per_block, size_t num_elements);

void operator()(cudaStream_t stream, const void* in0, void* out) const;

private:
ElementwiseUnary<AllElementTypesSwitch, AbsOpImpl> impl_;
};

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov
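
A hypothetical call site for this wrapper, with the element-type enumerator, launch geometry, and buffer names all assumed rather than taken from the diff:

```cpp
using namespace ov::nvidia_gpu::kernel;

// Sketch: enqueue element-wise Abs over n float32 values already on the device.
Abs abs_op{Type_t::f32,                  // assumed enumerator for float32
           /*max_threads_per_block=*/256,
           /*num_elements=*/n};
abs_op(stream, device_in, device_out);   // stream is a cudaStream_t
```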


36 changes: 36 additions & 0 deletions modules/nvidia_plugin/src/kernels/comparison.cu
@@ -53,11 +53,31 @@ template <typename T>
struct OpImpl<T, Comparison::Op_t::GREATER> {
__device__ static inline bool op(T left, T right) { return left > right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::GREATER_EQUAL> {
__device__ static inline bool op(T left, T right) { return left >= right; }
};
template <typename T>
struct OpImpl<T, Comparison::Op_t::LESS> {
__device__ static inline bool op(T left, T right) { return left < right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::LESS_EQUAL> {
__device__ static inline bool op(T left, T right) { return left <= right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::EQUAL> {
__device__ static inline bool op(T left, T right) { return left == right; }
};

template <typename T>
struct OpImpl<T, Comparison::Op_t::NOT_EQUAL> {
__device__ static inline bool op(T left, T right) { return left != right; }
};

Comparison::Comparison(Op_t op_type, Type_t element_type, size_t max_size, size_t num_blocks, size_t threads_per_block)
: op_type_{op_type},
element_type_{element_type},
@@ -136,10 +156,26 @@ void Comparison::Call(Comparison::Op_t type,
Call<T, Comparison::Op_t::GREATER>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::GREATER_EQUAL:
Call<T, Comparison::Op_t::GREATER_EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::LESS:
Call<T, Comparison::Op_t::LESS>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::LESS_EQUAL:
Call<T, Comparison::Op_t::LESS_EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::EQUAL:
Call<T, Comparison::Op_t::EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
case Comparison::Op_t::NOT_EQUAL:
Call<T, Comparison::Op_t::NOT_EQUAL>(
stream, left_src, right_src, left_brcst_offsets, right_brcst_offsets, output_sizes, dst);
break;
default:
throw_ov_exception(fmt::format("Input operation = {} is not supported by Comparison operation !!",
static_cast<Type_t>(type)));
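The switch above is the usual runtime-to-compile-time dispatch: each `Op_t` case instantiates `Call` with the matching `OpImpl` specialization, so the comparison inlines into the kernel with no per-element branching on the op type. A hypothetical construction of one of the new variants (the enumerator and launch-geometry values are assumptions):

```cpp
using ov::nvidia_gpu::kernel::Comparison;
namespace kernel = ov::nvidia_gpu::kernel;

// Element-wise a <= b over broadcast-compatible inputs; the output is
// boolean, one byte per element.
Comparison less_equal{Comparison::Op_t::LESS_EQUAL,
                      kernel::Type_t::f32,   // assumed enumerator
                      max_size, num_blocks, threads_per_block};
```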
2 changes: 1 addition & 1 deletion modules/nvidia_plugin/src/kernels/comparison.hpp
@@ -12,7 +12,7 @@ namespace kernel {

class Comparison {
public:
enum class Op_t { GREATER, LESS };
enum class Op_t { GREATER, LESS, EQUAL, GREATER_EQUAL, LESS_EQUAL, NOT_EQUAL };

Comparison(Op_t op, Type_t element_type, size_t max_size, size_t num_blocks, size_t threads_per_block);
Comparison(Comparison&&) = default;
29 changes: 29 additions & 0 deletions modules/nvidia_plugin/src/kernels/cos.cu
@@ -0,0 +1,29 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "cos.hpp"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

namespace cumath = CUDA::math;

template <typename T>
struct CosOpImpl {
__device__ static inline T op(T x) {
return cumath::cos(x);
}
};

Cos::Cos(Type_t element_type, size_t max_threads_per_block, size_t num_elements)
: impl_{element_type, max_threads_per_block, num_elements} {}

void Cos::operator()(cudaStream_t stream, const void* in0, void* out) const {
impl_(stream, in0, out);
}

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov
33 changes: 33 additions & 0 deletions modules/nvidia_plugin/src/kernels/cos.hpp
@@ -0,0 +1,33 @@
// Copyright (C) 2021-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "details/cuda_type_traits.hpp"
#include "details/elementwise_unary.cuh"

namespace ov {
namespace nvidia_gpu {
namespace kernel {

template <typename T>
struct CosOpImpl;
/**
* Elementwise Cos operation
*/
class Cos {
public:
Cos(Type_t element_type, size_t max_threads_per_block, size_t num_elements);

void operator()(cudaStream_t stream, const void* in0, void* out) const;

private:
ElementwiseUnary<AllElementTypesSwitch, CosOpImpl> impl_;
};

} // namespace kernel
} // namespace nvidia_gpu
} // namespace ov

