diff --git a/sycl/include/CL/sycl/builtins_esimd.hpp b/sycl/include/CL/sycl/builtins_esimd.hpp index 5b64428c685b3..8142129fa83a1 100644 --- a/sycl/include/CL/sycl/builtins_esimd.hpp +++ b/sycl/include/CL/sycl/builtins_esimd.hpp @@ -28,7 +28,7 @@ cos(__ESIMD_NS::simd x) __NOEXC { #ifdef __SYCL_DEVICE_ONLY__ return __ESIMD_NS::detail::ocl_cos(x.data()); #else - return __esimd_cos(x.data()); + return __esimd_cos(x.data()); #endif // __SYCL_DEVICE_ONLY__ } @@ -39,7 +39,7 @@ sin(__ESIMD_NS::simd x) __NOEXC { #ifdef __SYCL_DEVICE_ONLY__ return __ESIMD_NS::detail::ocl_sin(x.data()); #else - return __esimd_sin(x.data()); + return __esimd_sin(x.data()); #endif // __SYCL_DEVICE_ONLY__ } @@ -50,7 +50,7 @@ exp(__ESIMD_NS::simd x) __NOEXC { #ifdef __SYCL_DEVICE_ONLY__ return __ESIMD_NS::detail::ocl_exp(x.data()); #else - return __esimd_exp(x.data()); + return __esimd_exp(x.data()); #endif // __SYCL_DEVICE_ONLY__ } @@ -61,7 +61,7 @@ log(__ESIMD_NS::simd x) __NOEXC { #ifdef __SYCL_DEVICE_ONLY__ return __ESIMD_NS::detail::ocl_log(x.data()); #else - return __esimd_log(x.data()); + return __esimd_log(x.data()); #endif // __SYCL_DEVICE_ONLY__ } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp index c2e7aed5fad2e..c2214a1dce372 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/elem_type_traits.hpp @@ -133,6 +133,10 @@ template struct element_type_traits { // Whether a value or clang vector value the raw element type can be used // directly as operand to std C++ operations. static inline constexpr bool use_native_cpp_ops = true; + // W/A for MSVC compiler problems which thinks + // std::is_floating_point_v<_Float16> is false; so require new element types + // implementations to state "is floating point" trait explicitly + static inline constexpr bool is_floating_point = false; }; // Element type traits specialization for C++ standard element type. @@ -141,8 +145,19 @@ struct element_type_traits>> { using RawT = T; using EnclosingCppT = T; static inline constexpr bool use_native_cpp_ops = true; + static inline constexpr bool is_floating_point = std::is_floating_point_v; }; +#ifdef __SYCL_DEVICE_ONLY__ +template <> struct element_type_traits<_Float16, void> { + using RawT = _Float16; + using EnclosingCppT = _Float16; + __SYCL_DEPRECATED("use sycl::half as element type") + static inline constexpr bool use_native_cpp_ops = true; + static inline constexpr bool is_floating_point = true; +}; +#endif + // --- Type conversions // Low-level conversion functions to and from a wrapper element type. @@ -563,7 +578,7 @@ class WrapperElementTypeProxy { // the wrapper floating-point types such as sycl::half. template static inline constexpr bool is_generic_floating_point_v = - std::is_floating_point_v::EnclosingCppT>; + element_type_traits::is_floating_point; // @{ // Get computation type of a binary operator given its operand types: @@ -664,6 +679,8 @@ struct element_type_traits>> { // operations on half type. static inline constexpr bool use_native_cpp_ops = false; #endif // __SYCL_DEVICE_ONLY__ + + static inline constexpr bool is_floating_point = true; }; using half_raw = __raw_t; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/host_util.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/host_util.hpp index d1562a2405d68..b4590ac25a8f1 100755 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/host_util.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/host_util.hpp @@ -12,8 +12,11 @@ #ifndef __SYCL_DEVICE_ONLY__ +#include #include +#include + #define SIMDCF_ELEMENT_SKIP(i) __SYCL_INLINE_NAMESPACE(cl) { @@ -46,7 +49,12 @@ static long long abs(long long a) { } } -template struct satur { +template struct satur; + +template +struct satur>> { + static_assert(!__SEIEED::is_wrapper_elem_type_v); + template static RT saturate(const T val, const int flags) { if ((flags & sat_is_on) == 0) { return (RT)val; @@ -72,35 +80,29 @@ template struct satur { } }; -template <> struct satur { - template static float saturate(const T val, const int flags) { - if ((flags & sat_is_on) == 0) { - return (float)val; - } - - if (val < 0.) { - return 0; - } else if (val > 1.) { - return 1.; - } else { - return (float)val; - } - } -}; - -template <> struct satur { - template static double saturate(const T val, const int flags) { - if ((flags & sat_is_on) == 0) { - return (double)val; +// Host implemenation of saturation for FP types, including non-standarad +// wrapper types such as sycl::half. Template parameters are defined in terms +// of user-level types (sycl::half), function parameter and return types - +// in terms of raw bit representation type(_Float16 for half on device). +template +struct satur>> { + template + static __SEIEED::__raw_t saturate(const __SEIEED::__raw_t raw_src, + const int flags) { + Tsrc src = __SEIEED::bitcast_to_wrapper_type(raw_src); + + // perform comparison on user type! + if ((flags & sat_is_on) == 0 || (src >= 0 && src <= 1)) { + // convert_scalar accepts/returns user types - need to bitcast + Tdst dst = __SEIEED::convert_scalar(src); + return __SEIEED::bitcast_to_raw_type(dst); } - - if (val < 0.) { - return 0; - } else if (val > 1.) { - return 1.; - } else { - return (double)val; + if (src < 0) { + return __SEIEED::bitcast_to_raw_type(Tdst{0}); } + assert(src > 1); + return __SEIEED::bitcast_to_raw_type(Tdst{1}); } }; @@ -116,6 +118,10 @@ template <> struct SetSatur { static unsigned int set() { return sat_is_on; } }; +// TODO replace restype_ex with detail::computation_type_t and represent half +// as sycl::half rather than 'using half = sycl::detail::half_impl::half;' +// above + // used for intermediate type in dp4a emulation template struct restype_ex { private: @@ -430,36 +436,6 @@ template <> struct fptype { static const bool value = true; }; template struct dftype { static const bool value = false; }; template <> struct dftype { static const bool value = true; }; -template struct esimdtype; -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { - static const bool value = true; -}; -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { static const bool value = true; }; - -template <> struct esimdtype { - static const bool value = true; -}; - template struct bytetype; template <> struct bytetype { static const bool value = true; }; template <> struct bytetype { static const bool value = true; }; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 5b7a080cdbe90..4cfe11d268532 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -13,95 +13,97 @@ #include #include +#include #include #include #include #include +#define __ESIMD_raw_vec_t(T, SZ) \ + __SEIEED::vector_type_t<__SEIEED::__raw_t, SZ> +#define __ESIMD_cpp_vec_t(T, SZ) \ + __SEIEED::vector_type_t<__SEIEED::__cpp_t, SZ> + // saturation intrinsics template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fptoui_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_fptoui_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fptosi_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_fptosi_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uutrunc_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_uutrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ustrunc_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ustrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sutrunc_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sutrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sstrunc_sat(__SEIEED::vector_type_t src); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sstrunc_sat(__ESIMD_raw_vec_t(T1, SZ) src); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_abs(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_abs(__ESIMD_raw_vec_t(T, SZ) src0); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ssshl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ssshl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sushl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sushl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_usshl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_usshl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uushl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_uushl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ssshl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ssshl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sushl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sushl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_usshl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_usshl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uushl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_uushl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_rol(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_rol(__ESIMD_raw_vec_t(T1, SZ) src0, __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ror(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ror(__ESIMD_raw_vec_t(T1, SZ) src0, __ESIMD_raw_vec_t(T1, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_umulh(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_umulh(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_smulh(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_smulh(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -109,99 +111,84 @@ __esimd_frc(__SEIEED::vector_type_t src0); /// 3 kinds of max template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fmax(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_fmax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_umax(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_umax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_smax(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_smax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_lzd(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_lzd(__ESIMD_raw_vec_t(T, SZ) src0); /// 3 kinds of min template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fmin(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_fmin(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_umin(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_umin(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_smin(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_smin(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_bfrev(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_bfrev(__ESIMD_raw_vec_t(T1, SZ) src0); template __ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_cbit(__SEIEED::vector_type_t src0); + __esimd_cbit(__ESIMD_raw_vec_t(T, SZ) src0); template -__ESIMD_INTRIN __SEIEED::vector_type_t __esimd_bfi( - __SEIEED::vector_type_t src0, __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2, __SEIEED::vector_type_t src3); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_bfi(__ESIMD_raw_vec_t(T0, SZ) src0, __ESIMD_raw_vec_t(T0, SZ) src1, + __ESIMD_raw_vec_t(T0, SZ) src2, __ESIMD_raw_vec_t(T0, SZ) src3); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sbfe(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sbfe(__ESIMD_raw_vec_t(T0, SZ) src0, __ESIMD_raw_vec_t(T0, SZ) src1, + __ESIMD_raw_vec_t(T0, SZ) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fbl(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_fbl(__ESIMD_raw_vec_t(T0, SZ) src0); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sfbh(__SEIEED::vector_type_t src0); +__ESIMD_INTRIN __ESIMD_raw_vec_t(int, SZ) + __esimd_sfbh(__ESIMD_raw_vec_t(T0, SZ) src0); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ufbh(__SEIEED::vector_type_t src0); - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_inv(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_log(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_exp(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sqrt(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_sqrt(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_rsqrt(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sin(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_cos(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_pow(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_div(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); +__ESIMD_INTRIN __ESIMD_raw_vec_t(uint32_t, SZ) + __esimd_ufbh(__ESIMD_raw_vec_t(T0, SZ) src0); + +#define __ESIMD_UNARY_EXT_MATH_INTRIN(name) \ + template \ + __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) \ + __esimd_##name(__ESIMD_raw_vec_t(T, SZ) src) + +__ESIMD_UNARY_EXT_MATH_INTRIN(inv); +__ESIMD_UNARY_EXT_MATH_INTRIN(log); +__ESIMD_UNARY_EXT_MATH_INTRIN(exp); +__ESIMD_UNARY_EXT_MATH_INTRIN(sqrt); +__ESIMD_UNARY_EXT_MATH_INTRIN(ieee_sqrt); +__ESIMD_UNARY_EXT_MATH_INTRIN(rsqrt); +__ESIMD_UNARY_EXT_MATH_INTRIN(sin); +__ESIMD_UNARY_EXT_MATH_INTRIN(cos); + +#undef __ESIMD_UNARY_EXT_MATH_INTRIN + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_pow(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1); + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_ieee_div(__ESIMD_raw_vec_t(T, SZ) src0, + __ESIMD_raw_vec_t(T, SZ) src1); template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -216,14 +203,6 @@ template __ESIMD_INTRIN __SEIEED::vector_type_t __esimd_rndz(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_sqrt(__SEIEED::vector_type_t src0); -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_div(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1); - template __ESIMD_INTRIN uint32_t __esimd_pack_mask(__SEIEED::vector_type_t src0); @@ -233,71 +212,68 @@ __ESIMD_INTRIN __SEIEED::vector_type_t __esimd_unpack_mask(uint32_t src0); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uudp4a(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_uudp4a(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_usdp4a(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_usdp4a(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sudp4a(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_sudp4a(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ssdp4a(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_ssdp4a(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uudp4a_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_uudp4a_sat(__ESIMD_raw_vec_t(T2, N) src0, + __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_usdp4a_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_usdp4a_sat(__ESIMD_raw_vec_t(T2, N) src0, + __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sudp4a_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_sudp4a_sat(__ESIMD_raw_vec_t(T2, N) src0, + __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ssdp4a_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2); - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_dp4(__SEIEED::vector_type_t v1, - __SEIEED::vector_type_t v2) +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_ssdp4a_sat(__ESIMD_raw_vec_t(T2, N) src0, + __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2); + +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __esimd_dp4(__ESIMD_raw_vec_t(T, N) v1, __ESIMD_raw_vec_t(T, N) v2) #ifdef __SYCL_DEVICE_ONLY__ - ; + ; #else { - __SEIEED::vector_type_t retv; + if constexpr (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; + __ESIMD_raw_vec_t(T, N) retv; for (auto i = 0; i != N; i += 4) { - Ty dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) + - (v1[i + 2] * v2[i + 2]) + (v1[i + 3] * v2[i + 3]); + T dp = (v1[i] * v2[i]) + (v1[i + 1] * v2[i + 1]) + (v1[i + 2] * v2[i + 2]) + + (v1[i + 3] * v2[i + 3]); retv[i] = dp; retv[i + 1] = dp; retv[i + 2] = dp; retv[i + 3] = dp; } - return retv; + return retv.data(); } #endif // __SYCL_DEVICE_ONLY__ @@ -320,9 +296,9 @@ __ESIMD_INTRIN int __esimd_lane_id(); #define ESIMD_MATH_INTRINSIC_IMPL(type, func) \ template \ - __ESIMD_INTRIN __SEIEED::vector_type_t ocl_##func( \ - __SEIEED::vector_type_t src0) { \ - __SEIEED::vector_type_t retv; \ + __ESIMD_INTRIN __ESIMD_raw_vec_t(type, SZ) \ + ocl_##func(__ESIMD_raw_vec_t(type, SZ) src0) { \ + __ESIMD_raw_vec_t(type, SZ) retv; \ __ESIMD_SIMT_BEGIN(SZ, lane) \ retv[lane] = sycl::func(src0[lane]); \ __ESIMD_SIMT_END \ @@ -336,6 +312,7 @@ namespace intel { namespace experimental { namespace esimd { namespace detail { +// TODO support half vectors in std sycl math functions. ESIMD_MATH_INTRINSIC_IMPL(float, sin) ESIMD_MATH_INTRINSIC_IMPL(float, cos) ESIMD_MATH_INTRINSIC_IMPL(float, exp) @@ -354,6 +331,31 @@ ESIMD_MATH_INTRINSIC_IMPL(float, log) #else // __SYCL_DEVICE_ONLY__ +// Typical implementation of a generic intrinsic supporting non-standard +// types (half, bfloat*,...) should be like this: +// - user type information is encoded in template parameters, but function +// parameters and return type are raw types +// - before use, parameters are converted to EnclosingCppT +// - return value is calculated using the converted parameters, +// but before return it is converted back to the user type and is bitcast +// (that's what .data() basically does) to the raw type +// +// template +// __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) __esimd_intrin( +// __ESIMD_raw_vec_t(T, SZ) raw_src0, __ESIMD_raw_vec_t(T, SZ) raw_src1) { +// +// simd ret; +// simd src0{raw_src0}; +// simd src1{raw_src1}; +// ret = function_of(src0, src1); +// return ret.data(); +// +// TODO Not following this approach in some of the intrinsics, and performing +// calculations on the raw type will lead to runtime compuation error. A guard +// if (__SEIEED::is_wrapper_elem_type_v) __ESIMD_UNSUPPORTED_ON_HOST; +// is temporarily used for now, until wrapper types are supported by these +// intrinsics. + template inline T extract(const uint32_t &width, const uint32_t &offset, uint32_t src, const uint32_t &sign_extend) { @@ -369,89 +371,34 @@ inline T extract(const uint32_t &width, const uint32_t &offset, uint32_t src, return ret; } -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); - } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fptoui_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); - } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fptosi_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); - } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uutrunc_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); - } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ustrunc_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); - } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sutrunc_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); +#define __ESIMD_DEFAULT_HOST_SATURATE_INTRIN(name) \ + template \ + __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) \ + __esimd_##name(__ESIMD_raw_vec_t(T1, SZ) src) { \ + __ESIMD_raw_vec_t(T0, SZ) retv; \ + for (int i = 0; i < SZ; i++) { \ + SIMDCF_ELEMENT_SKIP(i); \ + retv[i] = __SEIEEED::satur::template saturate(src[i], 1); \ + } \ + return retv; \ } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sstrunc_sat(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = __SEIEEED::satur::saturate(src[i], 1); - } - return retv; -}; +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(sat) +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(fptoui_sat) +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(fptosi_sat) +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(uutrunc_sat) +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(ustrunc_sat) +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(sutrunc_sat) +__ESIMD_DEFAULT_HOST_SATURATE_INTRIN(sstrunc_sat) template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_abs(__SEIEED::vector_type_t src0) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_abs(__ESIMD_raw_vec_t(T, SZ) src0) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::abstype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -463,15 +410,17 @@ __esimd_abs(__SEIEED::vector_type_t src0) { retv[i] = ret; } return retv; -}; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ssshl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ssshl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -479,14 +428,17 @@ __esimd_ssshl(__SEIEED::vector_type_t src0, retv[i] = ret; } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sushl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sushl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -494,14 +446,17 @@ __esimd_sushl(__SEIEED::vector_type_t src0, retv[i] = ret; } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_usshl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_usshl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -509,14 +464,17 @@ __esimd_usshl(__SEIEED::vector_type_t src0, retv[i] = ret; } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uushl(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_uushl(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -524,84 +482,102 @@ __esimd_uushl(__SEIEED::vector_type_t src0, retv[i] = ret; } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ssshl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ssshl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); ret = src0.get(i) << src1.get(i); - retv[i] = __SEIEEED::satur::saturate(ret, 1); + retv[i] = __SEIEEED::satur::template saturate(ret, 1); } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sushl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_sushl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); ret = src0.get(i) << src1.get(i); - retv[i] = __SEIEEED::satur::saturate(ret, 1); + retv[i] = __SEIEEED::satur::template saturate(ret, 1); } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_usshl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_usshl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); ret = src0.get(i) << src1.get(i); - retv[i] = __SEIEEED::satur::saturate(ret, 1); + retv[i] = __SEIEEED::satur::template saturate(ret, 1); } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_uushl_sat(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_uushl_sat(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); ret = src0.get(i) << src1.get(i); - retv[i] = __SEIEEED::satur::saturate(ret, 1); + retv[i] = __SEIEEED::satur::template saturate(ret, 1); } return retv; -}; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_rol(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1){}; +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_rol(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + __ESIMD_UNSUPPORTED_ON_HOST; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ror(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1){}; +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_ror(__ESIMD_raw_vec_t(T1, SZ) src0, + __ESIMD_raw_vec_t(T1, SZ) src1) { + __ESIMD_UNSUPPORTED_ON_HOST; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_umulh(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_umulh(__ESIMD_raw_vec_t(T, SZ) src0, + __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { unsigned long long temp; @@ -613,11 +589,13 @@ __esimd_umulh(__SEIEED::vector_type_t src0, } template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_smulh(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_smulh(__ESIMD_raw_vec_t(T, SZ) src0, + __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { long long temp; @@ -626,7 +604,7 @@ __esimd_smulh(__SEIEED::vector_type_t src0, retv[i] = temp >> 32; } return retv; -}; +} template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -637,15 +615,16 @@ __esimd_frc(__SEIEED::vector_type_t src0) { retv[i] = src0[i] - floor(src0[i]); } return retv; -}; +} /// 3 kinds of max template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fmax(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_fmax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -657,13 +636,15 @@ __esimd_fmax(__SEIEED::vector_type_t src0, } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_umax(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_umax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -675,13 +656,15 @@ __esimd_umax(__SEIEED::vector_type_t src0, } return retv; -}; +} + template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_smax(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_smax(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -693,14 +676,16 @@ __esimd_smax(__SEIEED::vector_type_t src0, } return retv; -}; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_lzd(__SEIEED::vector_type_t src0) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_lzd(__ESIMD_raw_vec_t(T, SZ) src0) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; T ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -714,15 +699,16 @@ __esimd_lzd(__SEIEED::vector_type_t src0) { } return retv; -}; +} /// 3 kinds of min template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fmin(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_fmin(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -737,11 +723,12 @@ __esimd_fmin(__SEIEED::vector_type_t src0, }; template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_umin(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_umin(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -753,14 +740,15 @@ __esimd_umin(__SEIEED::vector_type_t src0, } return retv; -}; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_smin(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_smin(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -772,13 +760,15 @@ __esimd_smin(__SEIEED::vector_type_t src0, } return retv; -}; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_bfrev(__SEIEED::vector_type_t src0) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ) + __esimd_bfrev(__ESIMD_raw_vec_t(T1, SZ) src0) { int i, j; - __SEIEED::vector_type_t retv; + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; + __ESIMD_raw_vec_t(T0, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -797,14 +787,16 @@ __esimd_bfrev(__SEIEED::vector_type_t src0) { } return retv; -}; +} template __ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_cbit(__SEIEED::vector_type_t src0) { +__esimd_cbit(__ESIMD_raw_vec_t(T, SZ) src0) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; uint32_t ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(uint32_t, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -820,17 +812,17 @@ __esimd_cbit(__SEIEED::vector_type_t src0) { } return retv; -}; +} -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_bfi(__SEIEED::vector_type_t width, - __SEIEED::vector_type_t offset, - __SEIEED::vector_type_t val, - __SEIEED::vector_type_t src) { +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_bfi(__ESIMD_raw_vec_t(T, SZ) width, __ESIMD_raw_vec_t(T, SZ) offset, + __ESIMD_raw_vec_t(T, SZ) val, __ESIMD_raw_vec_t(T, SZ) src) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + typename __SEIEEED::maxtype::type ret; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -838,7 +830,7 @@ __esimd_bfi(__SEIEED::vector_type_t width, const uint32_t imask = ~mask; ret = (src[i] & imask) | ((val[i] << offset[i] & mask)); // Sign extend if signed type - if constexpr (std::is_signed::value) { + if constexpr (std::is_signed::value) { int m = 1U << (width[i] - 1); ret = (ret ^ m) - m; } @@ -846,16 +838,18 @@ __esimd_bfi(__SEIEED::vector_type_t width, } return retv; -}; +} -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sbfe(__SEIEED::vector_type_t width, - __SEIEED::vector_type_t offset, - __SEIEED::vector_type_t src) { +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_sbfe(__ESIMD_raw_vec_t(T, SZ) width, + __ESIMD_raw_vec_t(T, SZ) offset, + __ESIMD_raw_vec_t(T, SZ) src) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - typename __SEIEEED::maxtype::type ret; - __SEIEED::vector_type_t retv; + typename __SEIEEED::maxtype::type ret; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -865,14 +859,16 @@ __esimd_sbfe(__SEIEED::vector_type_t width, } return retv; -}; +} -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_fbl(__SEIEED::vector_type_t src0) { +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_fbl(__ESIMD_raw_vec_t(T, SZ) src0) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i; - T0 ret; - __SEIEED::vector_type_t retv; + T ret; + __ESIMD_raw_vec_t(T, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -890,15 +886,16 @@ __esimd_fbl(__SEIEED::vector_type_t src0) { } return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sfbh(__SEIEED::vector_type_t src0) { +} +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(int, SZ) + __esimd_sfbh(__ESIMD_raw_vec_t(T, SZ) src0) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; int i, cval; int ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(int, SZ) retv; for (i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -922,13 +919,15 @@ __esimd_sfbh(__SEIEED::vector_type_t src0) { } return retv; -}; +} -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ufbh(__SEIEED::vector_type_t src0) { +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(uint32_t, SZ) + __esimd_ufbh(__ESIMD_raw_vec_t(T, SZ) src0) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; uint32_t ret; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(uint32_t, SZ) retv; for (int i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); @@ -946,127 +945,57 @@ __esimd_ufbh(__SEIEED::vector_type_t src0) { } return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_inv(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; - - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = 1.f / src0[i]; - } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_log(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; +} - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = logf(src0[i]) / logf(2.); +// Host intrinsics are implemented via converting elements to enclosing Cpp +// type (always 'float' except ieee_sqrt, which can be 'double'), applying +// standard C++ library math function and converting back to the element type. +// +#define __ESIMD_UNARY_EXT_MATH_HOST_INTRIN(name, formula) \ + template \ + __ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) \ + __esimd_##name(__ESIMD_raw_vec_t(T, SZ) src) { \ + using CppT = __SEIEED::__cpp_t; \ + using CppVecT = __ESIMD_cpp_vec_t(T, SZ); \ + CppVecT ret_cpp{0}; \ + CppVecT src_cpp = __SEIEED::convert_vector(src); \ + \ + for (int i = 0; i < SZ; i++) { \ + SIMDCF_ELEMENT_SKIP(i); \ + ret_cpp[i] = formula; \ + } \ + __ESIMD_raw_vec_t(T, SZ) ret = \ + __SEIEED::convert_vector(ret_cpp); \ + return ret; \ } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_exp(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = powf(2.f, src0[i]); - } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sqrt(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(inv, 1.f / src_cpp[i]) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(log, logf(src_cpp[i]) / logf(2.f)) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(exp, powf(2.f, src_cpp[i])) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(sqrt, sqrt(src_cpp[i])) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(ieee_sqrt, sqrt(src_cpp[i])) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(rsqrt, 1.f / sqrt(src_cpp[i])) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(sin, sin(src_cpp[i])) +__ESIMD_UNARY_EXT_MATH_HOST_INTRIN(cos, cos(src_cpp[i])) - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = sqrt(src0[i]); - } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_sqrt(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; +#undef __ESIMD_UNARY_EXT_MATH_HOST_INTRIN - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = sqrt(src0[i]); - } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_rsqrt(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_pow(__ESIMD_raw_vec_t(T, SZ) src0, __ESIMD_raw_vec_t(T, SZ) src1) { + using CppT = __SEIEED::__cpp_t; + using CppVecT = __ESIMD_cpp_vec_t(T, SZ); - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = 1.f / sqrt(src0[i]); - } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_sin(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = sin(src[i]); - } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_cos(__SEIEED::vector_type_t src) { - __SEIEED::vector_type_t retv; - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = cos(src[i]); - } - return retv; -}; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_pow(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { - __SEIEED::vector_type_t retv; + CppVecT cpp_src0 = __SEIEED::convert_vector(src0); + CppVecT cpp_src1 = __SEIEED::convert_vector(src1); + CppVecT cpp_res; for (int i = 0; i < SZ; i++) { SIMDCF_ELEMENT_SKIP(i); - retv[i] = powf(fabs(src0[i]), src1[i]); + cpp_res[i] = std::pow(std::fabs(cpp_src0[i]), cpp_src1[i]); } - return retv; -}; - -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_div(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { - __SEIEED::vector_type_t divinv; - __SEIEED::vector_type_t retv; - - for (int idx = 0; idx < SZ; idx += 1) { - SIMDCF_ELEMENT_SKIP(idx); - if (src1[idx] == 0.0f) { - /// Handle Divide-by-zero - retv[idx] = (src0[idx] < 0) ? (-INFINITY) : INFINITY; - } else { - retv[idx] = src0[idx] / src1[idx]; - } - } - - return retv; -}; + return __SEIEED::convert_vector(cpp_res); +} template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -1078,7 +1007,7 @@ __esimd_rndd(__SEIEED::vector_type_t src0) { retv[i] = floor(src0[i]); } return retv; -}; +} template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -1098,7 +1027,7 @@ __esimd_rndu(__SEIEED::vector_type_t src0) { } return retv; -}; +} template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -1120,7 +1049,7 @@ __esimd_rnde(__SEIEED::vector_type_t src0) { } return retv; -}; +} template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -1139,39 +1068,30 @@ __esimd_rndz(__SEIEED::vector_type_t src0) { } return retv; -}; +} -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_sqrt(__SEIEED::vector_type_t src0) { - __SEIEED::vector_type_t retv; +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, SZ) + __esimd_ieee_div(__ESIMD_raw_vec_t(T, SZ) src0, + __ESIMD_raw_vec_t(T, SZ) src1) { + using CppT = __SEIEED::__cpp_t; + using CppVecT = __ESIMD_cpp_vec_t(T, SZ); - for (int i = 0; i < SZ; i++) { - SIMDCF_ELEMENT_SKIP(i); - retv[i] = sqrt(src0[i]); - } - return retv; -}; + CppVecT cpp_src0 = __SEIEED::convert_vector(src0); + CppVecT cpp_src1 = __SEIEED::convert_vector(src1); + CppVecT cpp_res; -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_ieee_div(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1) { - __SEIEED::vector_type_t divinv; - __SEIEED::vector_type_t retv; - - for (int idx = 0; idx < SZ; idx += 1) { - SIMDCF_ELEMENT_SKIP(idx); - if (src1[idx] == 0.0f) { + for (int i = 0; i < SZ; i += 1) { + SIMDCF_ELEMENT_SKIP(i); + if (cpp_src1[i] == 0) { /// Handle Divide-by-zero - retv[idx] = (src0[idx] < 0) ? (-INFINITY) : INFINITY; + cpp_res[i] = (cpp_src0[i] < 0) ? (-INFINITY) : INFINITY; } else { - retv[idx] = src0[idx] / src1[idx]; + cpp_res[i] = cpp_src0[i] / cpp_src1[i]; } } - - return retv; -}; + return __SEIEED::convert_vector(cpp_res); +} template __ESIMD_INTRIN uint32_t @@ -1186,7 +1106,7 @@ __esimd_pack_mask(__SEIEED::vector_type_t src0) { } return retv; -}; +} template __ESIMD_INTRIN __SEIEED::vector_type_t @@ -1200,16 +1120,19 @@ __esimd_unpack_mask(uint32_t src0) { } } return retv; -}; +} template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_dp4a(__SEIEED::vector_type_t src0, - __SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { +__ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N) + __esimd_dp4a(__ESIMD_raw_vec_t(T2, N) src0, __ESIMD_raw_vec_t(T3, N) src1, + __ESIMD_raw_vec_t(T4, N) src2) { +#define __ESIMD_WR(T) __SEIEED::is_wrapper_elem_type_v + if (__ESIMD_WR(T1) || __ESIMD_WR(T2) || __ESIMD_WR(T3) || __ESIMD_WR(T4)) + __ESIMD_UNSUPPORTED_ON_HOST; +#undef __ESIMD_IS_WR using __SEIEEED::restype_ex; typename restype_ex::type>::type reta; - __SEIEED::vector_type_t retv; + __ESIMD_raw_vec_t(T1, N) retv; int src1_a, src1_b, src1_c, src1_d, src2_a, src2_b, src2_c, src2_d, ret; @@ -1233,17 +1156,19 @@ __esimd_dp4a(__SEIEED::vector_type_t src0, ret = src1_a * src2_a + src1_b * src2_b + src1_c * src2_c + src1_d * src2_d; reta = ret + src0[i]; - retv[i] = __SEIEEED::satur::saturate(reta, sat1); + retv[i] = __SEIEEED::satur::template saturate(reta, sat1); } return retv; -}; +} -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_reduced_max(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - __SEIEED::vector_type_t retv; +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __esimd_reduced_max(__ESIMD_raw_vec_t(T, N) src1, + __ESIMD_raw_vec_t(T, N) src2) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; + __ESIMD_raw_vec_t(T, N) retv; for (int I = 0; I < N; I++) { if (src1[I] >= src2[I]) { retv[I] = src1[I]; @@ -1254,11 +1179,13 @@ __esimd_reduced_max(__SEIEED::vector_type_t src1, return retv; } -template -__ESIMD_INTRIN __SEIEED::vector_type_t -__esimd_reduced_min(__SEIEED::vector_type_t src1, - __SEIEED::vector_type_t src2) { - __SEIEED::vector_type_t retv; +template +__ESIMD_INTRIN __ESIMD_raw_vec_t(T, N) + __esimd_reduced_min(__ESIMD_raw_vec_t(T, N) src1, + __ESIMD_raw_vec_t(T, N) src2) { + if (__SEIEED::is_wrapper_elem_type_v) + __ESIMD_UNSUPPORTED_ON_HOST; + __ESIMD_raw_vec_t(T, N) retv; for (int I = 0; I < N; I++) { if (src1[I] <= src2[I]) { retv[I] = src1[I]; @@ -1270,3 +1197,5 @@ __esimd_reduced_min(__SEIEED::vector_type_t src1, } #endif // #ifdef __SYCL_DEVICE_ONLY__ + +#undef __ESIMD_raw_vec_t diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 35a3c8fa91657..8ef2c543805e9 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -102,7 +102,7 @@ class simd_obj_impl { template friend class simd; template friend class simd_mask_impl; - using element_type = simd_like_obj_element_type_t; + using element_type = get_vector_element_type; using Ty = element_type; public: diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp index 2bd5a4defb676..c544886944696 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/types.hpp @@ -151,6 +151,8 @@ struct is_simd_obj_impl_derivative> template struct element_type_traits; template using __raw_t = typename __SEIEED::element_type_traits::RawT; +template +using __cpp_t = typename __SEIEED::element_type_traits::EnclosingCppT; // Specialization for all other types. template class Derived> @@ -314,21 +316,33 @@ template using element_type_t = typename element_type::type; // Determine element type of simd_obj_impl's Derived type w/o having to have // complete instantiation of the Derived type (is required by element_type_t, // hence can't be used here). -template struct simd_like_obj_info; +template struct simd_like_obj_info { + using element_type = T; + static inline constexpr int vector_length = 0; +}; + template struct simd_like_obj_info> { - using type = T; - static inline constexpr int length = N; + using element_type = T; + static inline constexpr int vector_length = N; }; + template struct simd_like_obj_info> { - using type = simd_mask_elem_type; // equals T - static inline constexpr int length = N; + using element_type = simd_mask_elem_type; // equals T + static inline constexpr int vector_length = N; +}; + +template +struct simd_like_obj_info> { + using element_type = typename RegionT::element_type; + static inline constexpr int vector_length = RegionT::length; }; template -using simd_like_obj_element_type_t = typename simd_like_obj_info::type; +using get_vector_element_type = typename simd_like_obj_info::element_type; + template -static inline constexpr int simd_like_obj_length = - simd_like_obj_info::length; +static inline constexpr int get_vector_length = + simd_like_obj_info::vector_length; // @} diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index f4946e1d9849e..35bcddfe51d14 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -37,9 +37,9 @@ namespace esimd { /// @return vector of elements converted to \p T0 with saturation. template __ESIMD_API simd saturate(simd src) { - if constexpr (std::is_floating_point::value) + if constexpr (detail::is_generic_floating_point_v) return __esimd_sat(src.data()); - else if constexpr (std::is_floating_point::value) { + else if constexpr (detail::is_generic_floating_point_v) { if constexpr (std::is_unsigned::value) return __esimd_fptoui_sat(src.data()); else @@ -1381,93 +1381,116 @@ ESIMD_NODEBUG // // inv, log2, exp2, sqrt, rsqrt, sin, cos // -// share the same requirements. -// -// template -// simd -// ESIMD_INLINE inv(simd src0, int flag = saturation_off) { -// simd Result = __esimd_inv(src0); -// if (flag != saturation_on) -// return Result; -// return __esimd_sat(Result); -// } -// -// template -// __ESIMD_API -// simd -// inv(matrix src0, int flag = saturation_off) { -// simd Src0 = src0; -// return esimd::inv(Src0, flag); -// } -// -// ESIMD_INLINE float inv(float src0, int flag = saturation_off) { -// simd Src0 = src0; -// simd Result = esimd::inv(Src0, flag); -// return Result[0]; -// } -// -// we also make the scalar version template-based by adding -// a "typename T". Since the type can only be float, we hack it -// by defining T=void without instantiating it to be float. - -#define ESIMD_INTRINSIC_DEF(type, name, iname) \ - template \ - __ESIMD_API simd name(simd src0, \ - int flag = saturation_off) { \ - simd Result = __esimd_##iname(src0.data()); \ + +#define ESIMD_UNARY_INTRINSIC_DEF(COND, name, iname) \ + /* Faster vector implementation w/o dynamic branch when saturation */ \ + /* parameter is known at compile-time. */ \ + template > \ + __ESIMD_API simd name(simd src) { \ + __SEIEED::vector_type_t<__SEIEED::__raw_t, N> res = \ + __esimd_##iname(src.data()); \ + if constexpr (F != saturation_on) \ + return res; \ + else \ + return esimd::saturate(res); \ + } \ + \ + /* Slower vector implementation with dynamic branch on saturation \ + * parameter.*/ \ + template > \ + __ESIMD_API simd name(simd src, int flag) { \ + simd res = name(src); \ if (flag != saturation_on) \ - return Result; \ - return esimd::saturate(Result); \ + return res; \ + return esimd::saturate(res); \ } \ - template \ - __ESIMD_API type name(type src0, int flag = saturation_off) { \ - simd Src0 = src0; \ - simd Result = name(Src0, flag); \ - return Result[0]; \ + \ + /* Faster scalar implementation */ \ + template > \ + __ESIMD_API T name(T src) { \ + simd src_vec = src; \ + simd res = name(src_vec); \ + return res[0]; \ + } \ + \ + /* Slower scalar implementation */ \ + template > \ + __ESIMD_API T name(T src, int flag) { \ + simd src_vec = src; \ + simd res = name(src_vec, flag); \ + return res[0]; \ } -ESIMD_INTRINSIC_DEF(float, inv, inv) -ESIMD_INTRINSIC_DEF(float, log2, log) -ESIMD_INTRINSIC_DEF(float, exp2, exp) -ESIMD_INTRINSIC_DEF(float, sqrt, sqrt) -ESIMD_INTRINSIC_DEF(float, ieee_sqrt, sqrt_ieee) -ESIMD_INTRINSIC_DEF(float, rsqrt, rsqrt) -ESIMD_INTRINSIC_DEF(float, sin, sin) -ESIMD_INTRINSIC_DEF(float, cos, cos) - -ESIMD_INTRINSIC_DEF(double, ieee_sqrt, sqrt_ieee) - -#undef ESIMD_INTRINSIC_DEF - -#define ESIMD_INTRINSIC_DEF(ftype, name, iname) \ - template \ - __ESIMD_API simd name(simd src0, U src1, \ - int flag = saturation_off) { \ - simd Src1 = src1; \ - simd Result = __esimd_##iname(src0.data(), Src1.data()); \ - if (flag != saturation_on) \ - return Result; \ +#define __ESIMD_EMATH_COND \ + detail::is_generic_floating_point_v && (sizeof(T) <= 4) + +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, inv, inv) +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, log2, log) +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, exp2, exp) +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, sqrt, sqrt) +// This also includes double (in addition to half and float): +ESIMD_UNARY_INTRINSIC_DEF(detail::is_generic_floating_point_v && + (sizeof(T) >= 4), + sqrt_ieee, ieee_sqrt) +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, rsqrt, rsqrt) +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, sin, sin) +ESIMD_UNARY_INTRINSIC_DEF(__ESIMD_EMATH_COND, cos, cos) + +#undef __ESIMD_EMATH_COND +#undef ESIMD_UNARY_INTRINSIC_DEF + +#define ESIMD_BINARY_INTRINSIC_DEF(COND, name, iname) \ + template > /* Faster vector implementation \ + with compile-time constant \ + saturation */ \ + __ESIMD_API simd name(simd src0, simd src1) { \ + using RawVecT = __SEIEED::vector_type_t<__SEIEED::__raw_t, N>; \ + RawVecT src1_raw_conv = detail::convert_vector(src1.data()); \ + RawVecT res_raw = __esimd_##iname(src0.data(), src1_raw_conv); \ + if constexpr (F != saturation_on) \ + return res_raw; \ + else \ + return esimd::saturate(simd(res_raw)); \ + } \ \ - return esimd::saturate(Result); \ + /* Slower vector implementation with dynamic branch on saturation \ + * parameter.*/ \ + template > \ + __ESIMD_API simd name(simd src0, simd src1, int flag) { \ + simd res = name(src0, src1); \ + if (flag != saturation_on) \ + return res; \ + return esimd::saturate(res); \ } \ - template \ - __ESIMD_API \ - std::enable_if_t::value, simd> \ - name(U src0, simd src1, int flag = saturation_off) { \ - simd Src0 = src0; \ - return name(Src0, src1, flag); \ + \ + /* Faster scalar implementation */ \ + template > \ + __ESIMD_API T name(T src0, U src1) { \ + simd src0_vec = src0; \ + simd src1_vec = src1; \ + simd res = name(src0_vec, src1_vec); \ + return res[0]; \ } \ - __ESIMD_API ftype name(ftype src0, ftype src1, int flag = saturation_off) { \ - simd Src0 = src0; \ - simd Src1 = src1; \ - simd Result = name(Src0, Src1, flag); \ - return Result[0]; \ + \ + /* Slower scalar implementation */ \ + template > \ + __ESIMD_API T name(T src0, U src1, int flag) { \ + simd src0_vec = src0; \ + simd src1_vec = src1; \ + simd res = name(src0_vec, src1_vec, flag); \ + return res[0]; \ } -ESIMD_INTRINSIC_DEF(float, pow, pow) - -ESIMD_INTRINSIC_DEF(float, div_ieee, ieee_div) -ESIMD_INTRINSIC_DEF(double, div_ieee, ieee_div) +ESIMD_BINARY_INTRINSIC_DEF(detail::is_generic_floating_point_v && + sizeof(T) <= 4, + pow, pow) +ESIMD_BINARY_INTRINSIC_DEF(detail::is_generic_floating_point_v && + sizeof(T) >= 4, + div_ieee, ieee_div) #undef ESIMD_INTRINSIC_DEF @@ -1592,39 +1615,69 @@ asin(T src0, int flag = saturation_off) { return Result[0]; } +namespace detail { +// std::numbers::ln2_v in c++20 +constexpr float ln2 = 0.69314718f; +// std::numbers::log2e_v in c++20 +constexpr float log2e = 1.442695f; +} // namespace detail + /// Computes the natural logarithm of the given argument. This is an /// emulated version based on the H/W supported log2. /// @param the source operand to compute base-e logarithm of. /// @return the base-e logarithm of \p src0. -template -ESIMD_NODEBUG ESIMD_INLINE simd log(simd src0, - int flag = saturation_off) { - constexpr float ln2 = 0.69314718f; // std::numbers::ln2_v in c++20 - simd Result = esimd::log2(src0) * ln2; +template +ESIMD_NODEBUG ESIMD_INLINE simd log(simd src0, int flag) { + using CppT = __SEIEED::__cpp_t; + simd Result = esimd::log2(src0) * detail::ln2; if (flag != saturation_on) return Result; - return esimd::saturate(Result); + return esimd::saturate(Result); } -ESIMD_NODEBUG ESIMD_INLINE float log(float src0, int flag = saturation_off) { - return esimd::log<1>(src0, flag)[0]; +template +ESIMD_NODEBUG ESIMD_INLINE simd log(simd src0) { + simd Result = esimd::log2(src0) * detail::ln2; + + if constexpr (Flag != saturation_on) + return Result; + else + return esimd::saturate(Result); +} + +template ESIMD_NODEBUG ESIMD_INLINE T log(T src0, int flag) { + return esimd::log(src0, flag)[0]; +} + +template +ESIMD_NODEBUG ESIMD_INLINE T log(T src0) { + return esimd::log(src0)[0]; } /// Computes e raised to the power of the given argument. This is an /// emulated version based on the H/W supported exp2. /// @param the source operand to compute base-e exponential of. /// @return e raised to the power of \p src0. -template -ESIMD_NODEBUG ESIMD_INLINE simd exp(simd src0, - int flag = saturation_off) { - constexpr float log2e = 1.442695f; // std::numbers::log2e_v in c++20 - return esimd::exp2(src0 * log2e, flag); +template +ESIMD_NODEBUG ESIMD_INLINE simd exp(simd src0, int flag) { + using CppT = __SEIEED::__cpp_t; + return esimd::exp2(src0 * detail::log2e, flag); +} + +template +ESIMD_NODEBUG ESIMD_INLINE simd exp(simd src0) { + return esimd::exp2(src0 * detail::log2e); +} + +template ESIMD_NODEBUG ESIMD_INLINE T exp(T src0, int flag) { + return esimd::exp(src0, flag)[0]; } -ESIMD_NODEBUG ESIMD_INLINE float exp(float src0, int flag = saturation_off) { - return esimd::exp<1>(src0, flag)[0]; +template +ESIMD_NODEBUG ESIMD_INLINE T exp(T src0) { + return esimd::exp(src0)[0]; } //////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/esimd/intrins_trans.cpp index 4663bf44c4bc0..d93cb499ee0a1 100644 --- a/sycl/test/esimd/intrins_trans.cpp +++ b/sycl/test/esimd/intrins_trans.cpp @@ -75,7 +75,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd foo() { simd diva(2.f); simd divb(1.f); - diva = __esimd_ieee_div<1>(diva.data(), divb.data()); + diva = __esimd_ieee_div(diva.data(), divb.data()); // CHECK: %{{[0-9a-zA-Z_.]+}} = call <1 x float> @llvm.genx.ieee.div.v1f32(<1 x float> %{{[0-9a-zA-Z_.]+}}, <1 x float> %{{[0-9a-zA-Z_.]+}}) simd a(0.1f); @@ -276,13 +276,13 @@ SYCL_EXTERNAL void test_math_intrins() SYCL_ESIMD_FUNCTION { { vec x0 = get8f(); vec x1 = get8f(); - auto y = __esimd_ieee_div<8>(x0, x1); + auto y = __esimd_ieee_div(x0, x1); // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.ieee.div.v8f32(<8 x float> %{{[a-zA-Z0-9.]+}}, <8 x float> %{{[a-zA-Z0-9.]+}}) use(y); } { vec x = get8f(); - auto y = __esimd_ieee_sqrt<8>(x); + auto y = __esimd_ieee_sqrt(x); // CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x float> @llvm.genx.ieee.sqrt.v8f32(<8 x float> %{{[a-zA-Z0-9.]+}}) use(y); } diff --git a/sycl/test/esimd/sycl_half_math_ops.cpp b/sycl/test/esimd/sycl_half_math_ops.cpp new file mode 100644 index 0000000000000..0ed93040e8d4b --- /dev/null +++ b/sycl/test/esimd/sycl_half_math_ops.cpp @@ -0,0 +1,28 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S %s -o %t.ll +// RUN: sycl-post-link -split-esimd -lower-esimd -S %t.ll -o %t.table +// RUN: FileCheck %s -input-file=%t_esimd_0.ll + +// The test checks that there are no unexpected extra conversions or intrinsic +// calls added by the API headers or compiler when generating code +// for math operations on simd values. + +#include + +using namespace sycl::ext::intel::experimental::esimd; +using namespace sycl::ext::intel::experimental; +using namespace sycl; + +// clang-format off +SYCL_EXTERNAL auto test_ext_math_op(simd val) SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z16test_ext_math_op{{[^\(]*}}( +// CHECK: <8 x half>{{[^,]*}}* %[[RET_VEC_ADDR:[a-zA-Z0-9_\.]+]], +// CHECK: <8 x half>* %[[VAL_PTR:[a-zA-Z0-9_\.]+]]){{.*}} { + return esimd::cos(val); +// CHECK: %[[VAL_VEC_ADDR:[a-zA-Z0-9_\.]+]] = addrspacecast {{.*}} %[[VAL_PTR]] +// CHECK-NEXT: %[[VAL_VEC:[a-zA-Z0-9_\.]+]] = load <8 x half>{{.*}} %[[VAL_VEC_ADDR]] +// CHECK-NEXT: %[[RES:[a-zA-Z0-9_\.]+]] = call <8 x half> @llvm.genx.cos.v8f16(<8 x half> %[[VAL_VEC]]) +// CHECK-NEXT: store <8 x half>{{.*}}%[[RES]], {{.*}}%[[RET_VEC_ADDR]] +// CHECK-NEXT: ret void +// CHECK-LABEL: } +} +// clang-format on