diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 0e4e8aaa8d7..6710c47437b 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -1533,6 +1533,11 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()))) { } + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( __m128i const& value_in) : m_value(value_in) {} @@ -1736,6 +1741,11 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( __m256i const& value_in) : m_value(value_in) {} diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index 82cd978c981..7d1acbc35ab 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -873,6 +873,11 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -1364,6 +1369,11 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -1789,6 +1799,11 @@ class simd> { gen(std::integral_constant()), gen(std::integral_constant()), gen(std::integral_constant()))) {} + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION reference operator[](std::size_t i) { return reinterpret_cast(&m_value)[i]; } @@ -3216,7 +3231,17 @@ class where_expression>, } }; -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t hmax( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::int32_t + hmax(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_max_epi32( + static_cast<__mmask8>(x.impl_get_mask()), + _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_max( const_where_expression< simd_mask>, simd>> const& x) { @@ -3225,7 +3250,17 @@ class where_expression>, _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t hmin( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::int32_t + hmin(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_min_epi32( + static_cast<__mmask8>(x.impl_get_mask()), + _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_min( const_where_expression< simd_mask>, simd>> const& x) { @@ -3234,7 +3269,33 @@ class where_expression>, _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t hmax( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_max( + const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_max_epi32(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_min( + const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_min_epi32(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::uint32_t + hmax(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_max_epu32( + static_cast<__mmask8>(x.impl_get_mask()), + _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_max( const_where_expression< simd_mask>, simd>> const& x) { @@ -3243,7 +3304,17 @@ class where_expression>, _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t hmin( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::uint32_t + hmin(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_min_epu32( + static_cast<__mmask8>(x.impl_get_mask()), + _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_min( const_where_expression< simd_mask>, simd>> const& x) { @@ -3252,7 +3323,32 @@ class where_expression>, _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t hmax( +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_max( + const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_max_epu32(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_min( + const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_min_epu32(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::int64_t + hmax(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_max_epi64(static_cast<__mmask8>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t reduce_max( const_where_expression< simd_mask>, simd>> const& x) { @@ -3260,7 +3356,16 @@ class where_expression>, static_cast<__m512i>(x.impl_get_value())); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t hmin( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::int64_t + hmin(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_min_epi64(static_cast<__mmask8>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t reduce_min( const_where_expression< simd_mask>, simd>> const& x) { @@ -3268,7 +3373,16 @@ class where_expression>, static_cast<__m512i>(x.impl_get_value())); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint64_t hmax( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::uint64_t + hmax(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_max_epu64(static_cast<__mmask8>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint64_t reduce_max( const_where_expression< simd_mask>, simd>> const& x) { @@ -3276,7 +3390,16 @@ class where_expression>, static_cast<__m512i>(x.impl_get_value())); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint64_t hmin( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + std::uint64_t + hmin(const_where_expression< + simd_mask>, + simd>> const& x) { + return _mm512_mask_reduce_min_epu64(static_cast<__mmask8>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint64_t reduce_min( const_where_expression< simd_mask>, simd>> const& x) { @@ -3284,7 +3407,15 @@ class where_expression>, static_cast<__m512i>(x.impl_get_value())); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double hmax( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double +hmax(const_where_expression>, + simd>> const& + x) { + return _mm512_mask_reduce_max_pd(static_cast<__mmask8>(x.impl_get_mask()), + static_cast<__m512d>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce_max( const_where_expression>, simd>> const& x) { @@ -3292,7 +3423,15 @@ class where_expression>, static_cast<__m512d>(x.impl_get_value())); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double hmin( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double +hmin(const_where_expression>, + simd>> const& + x) { + return _mm512_mask_reduce_min_pd(static_cast<__mmask8>(x.impl_get_mask()), + static_cast<__m512d>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce_min( const_where_expression>, simd>> const& x) { @@ -3300,7 +3439,16 @@ class where_expression>, static_cast<__m512d>(x.impl_get_value())); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float hmax( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float +hmax(const_where_expression>, + simd>> const& + x) { + return _mm512_mask_reduce_max_ps( + static_cast<__mmask8>(x.impl_get_mask()), + _mm512_castps256_ps512(static_cast<__m256>(x.impl_get_value()))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_max( const_where_expression>, simd>> const& x) { @@ -3309,7 +3457,16 @@ class where_expression>, _mm512_castps256_ps512(static_cast<__m256>(x.impl_get_value()))); } -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float hmin( +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float +hmin(const_where_expression>, + simd>> const& + x) { + return _mm512_mask_reduce_min_ps( + static_cast<__mmask8>(x.impl_get_mask()), + _mm512_castps256_ps512(static_cast<__m256>(x.impl_get_value()))); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_min( const_where_expression>, simd>> const& x) { @@ -3318,6 +3475,22 @@ class where_expression>, _mm512_castps256_ps512(static_cast<__m256>(x.impl_get_value()))); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_max( + const_where_expression>, + simd>> const& + x) { + return _mm512_mask_reduce_max_ps(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512>(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_min( + const_where_expression>, + simd>> const& + x) { + return _mm512_mask_reduce_min_ps(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512>(x.impl_get_value())); +} + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce( const_where_expression< simd_mask>, @@ -3328,6 +3501,15 @@ class where_expression>, _mm512_castsi256_si512(static_cast<__m256i>(x.impl_get_value()))); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::int32_t, std::plus<>) { + return _mm512_mask_reduce_add_epi32(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512i>(x.impl_get_value())); +} + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t reduce( const_where_expression< simd_mask>, @@ -3356,6 +3538,15 @@ class where_expression>, _mm512_castps256_ps512(static_cast<__m256>(x.impl_get_value()))); } +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce( + const_where_expression>, + simd>> const& + x, + float, std::plus<>) { + return _mm512_mask_reduce_add_ps(static_cast<__mmask16>(x.impl_get_mask()), + static_cast<__m512>(x.impl_get_value())); +} + } // namespace Experimental } // namespace Kokkos diff --git a/simd/src/Kokkos_SIMD_Common.hpp b/simd/src/Kokkos_SIMD_Common.hpp index c7a7f0f45a6..15245501e54 100644 --- a/simd/src/Kokkos_SIMD_Common.hpp +++ b/simd/src/Kokkos_SIMD_Common.hpp @@ -389,7 +389,7 @@ template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr T reduce_min( const simd& x) noexcept { auto v = where(true, x); - return hmin(v); + return reduce_min(v); } template @@ -397,14 +397,14 @@ template const simd& x, const typename simd::mask_type& mask) noexcept { auto v = where(mask, x); - return hmin(v); + return reduce_min(v); } template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION constexpr T reduce_max( const simd& x) noexcept { auto v = where(true, x); - return hmax(v); + return reduce_max(v); } template @@ -412,7 +412,7 @@ template const simd& x, const typename simd::mask_type& mask) noexcept { auto v = where(mask, x); - return hmax(v); + return reduce_max(v); } } // namespace Experimental diff --git a/simd/src/Kokkos_SIMD_Common_Math.hpp b/simd/src/Kokkos_SIMD_Common_Math.hpp index 9e5991e6912..2e78e2700fb 100644 --- a/simd/src/Kokkos_SIMD_Common_Math.hpp +++ b/simd/src/Kokkos_SIMD_Common_Math.hpp @@ -33,7 +33,7 @@ template class const_where_expression; template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T hmin(const_where_expression, simd> const& x) { auto const& v = x.impl_get_value(); auto const& m = x.impl_get_mask(); @@ -45,7 +45,7 @@ hmin(const_where_expression, simd> const& x) { } template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T hmax(const_where_expression, simd> const& x) { auto const& v = x.impl_get_value(); auto const& m = x.impl_get_mask(); @@ -56,6 +56,30 @@ hmax(const_where_expression, simd> const& x) { return result; } +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +reduce_min(const_where_expression, simd> const& x) { + auto const& v = x.impl_get_value(); + auto const& m = x.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; +} + +template +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T +reduce_max(const_where_expression, simd> const& x) { + auto const& v = x.impl_get_value(); + auto const& m = x.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; +} + template > [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION T reduce(const_where_expression, simd> const& x, diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 182d1ec1511..f35bd04bb8d 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -1004,6 +1004,11 @@ class simd> { m_value = vsetq_lane_f32(gen(std::integral_constant()), m_value, 3); } + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( float32x4_t const& value_in) : m_value(value_in) {} @@ -1472,6 +1477,11 @@ class simd> { m_value = vsetq_lane_s32(gen(std::integral_constant()), m_value, 3); } + template + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( + value_type const* ptr, FlagType flag) { + copy_from(ptr, flag); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit simd( int32x4_t const& value_in) : m_value(value_in) {} @@ -2712,6 +2722,173 @@ class where_expression>, } }; +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_min( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vminv_s32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_max( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vmaxv_s32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::int32_t, std::plus<>) { + return vaddv_s32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_min( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vminvq_s32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce_max( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vmaxvq_s32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int32_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::int32_t, std::plus<>) { + return vaddvq_s32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_min( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vminv_u32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_max( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vmaxv_u32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::uint32_t, std::plus<>) { + return vaddv_u32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_min( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vminvq_u32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce_max( + const_where_expression< + simd_mask>, + simd>> const& x) { + return vmaxvq_u32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint32_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::uint32_t, std::plus<>) { + return vaddvq_u32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::int64_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::int64_t, std::plus<>) { + return vaddvq_s64(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION std::uint64_t reduce( + const_where_expression< + simd_mask>, + simd>> const& x, + std::uint64_t, std::plus<>) { + return vaddvq_u64(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce_min( + const_where_expression>, + simd>> const& + x) { + return vminvq_f64(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce_max( + const_where_expression>, + simd>> const& + x) { + return vmaxvq_f64(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION double reduce( + const_where_expression>, + simd>> const& x, + double, std::plus<>) { + return vaddvq_f64(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_min( + const_where_expression>, + simd>> const& + x) { + return vminv_f32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_max( + const_where_expression>, + simd>> const& + x) { + return vmaxv_f32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce( + const_where_expression>, + simd>> const& x, + float, std::plus<>) { + return vaddv_f32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_min( + const_where_expression>, + simd>> const& + x) { + return vminvq_f32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce_max( + const_where_expression>, + simd>> const& + x) { + return vmaxvq_f32(static_cast(x.impl_get_value())); +} + +[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION float reduce( + const_where_expression>, + simd>> const& x, + float, std::plus<>) { + return vaddvq_f32(static_cast(x.impl_get_value())); +} + } // namespace Experimental } // namespace Kokkos diff --git a/simd/src/Kokkos_SIMD_Scalar.hpp b/simd/src/Kokkos_SIMD_Scalar.hpp index a17cfeee5d3..62b2e14e994 100644 --- a/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/simd/src/Kokkos_SIMD_Scalar.hpp @@ -427,7 +427,7 @@ reduce(const_where_expression, } template -[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION T hmax(const_where_expression, simd> const& x) { return static_cast(x.impl_get_mask()) @@ -437,6 +437,15 @@ hmax(const_where_expression, template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T +reduce_max(const_where_expression, + simd> const& x) { + return static_cast(x.impl_get_mask()) + ? static_cast(x.impl_get_value()) + : Kokkos::reduction_identity::max(); +} + +template +[[nodiscard]] KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION T hmin(const_where_expression, simd> const& x) { return static_cast(x.impl_get_mask()) @@ -444,6 +453,15 @@ hmin(const_where_expression, : Kokkos::reduction_identity::min(); } +template +[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION T +reduce_min(const_where_expression, + simd> const& x) { + return static_cast(x.impl_get_mask()) + ? static_cast(x.impl_get_value()) + : Kokkos::reduction_identity::min(); +} + } // namespace Experimental } // namespace Kokkos diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 094a3219893..6103efd90c2 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -331,86 +331,6 @@ class log_op { } }; -class hmin { - public: - template - KOKKOS_INLINE_FUNCTION auto on_host(T const& a, MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - return Kokkos::Experimental::hmin(w); - } - template - KOKKOS_INLINE_FUNCTION auto on_host_serial(T const& a, - MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - auto const& v = w.impl_get_value(); - auto const& m = w.impl_get_mask(); - auto result = v[0]; - for (std::size_t i = 1; i < v.size(); ++i) { - if (m[i]) result = Kokkos::min(result, v[i]); - } - return result; - } - - template - KOKKOS_INLINE_FUNCTION auto on_device(T const& a, - MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - return Kokkos::Experimental::hmin(w); - } - template - KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a, - MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - auto const& v = w.impl_get_value(); - auto const& m = w.impl_get_mask(); - auto result = v[0]; - for (std::size_t i = 1; i < v.size(); ++i) { - if (m[i]) result = Kokkos::min(result, v[i]); - } - return result; - } -}; - -class hmax { - public: - template - KOKKOS_INLINE_FUNCTION auto on_host(T const& a, MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - return Kokkos::Experimental::hmax(w); - } - template - KOKKOS_INLINE_FUNCTION auto on_host_serial(T const& a, - MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - auto const& v = w.impl_get_value(); - auto const& m = w.impl_get_mask(); - auto result = v[0]; - for (std::size_t i = 1; i < v.size(); ++i) { - if (m[i]) result = Kokkos::max(result, v[i]); - } - return result; - } - - template - KOKKOS_INLINE_FUNCTION auto on_device(T const& a, - MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - return Kokkos::Experimental::hmax(w); - } - template - KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a, - MaskType mask = true) const { - auto w = Kokkos::Experimental::where(mask, a); - auto const& v = w.impl_get_value(); - auto const& m = w.impl_get_mask(); - auto result = v[0]; - for (std::size_t i = 1; i < v.size(); ++i) { - if (m[i]) result = Kokkos::max(result, v[i]); - } - return result; - } -}; - template > class reduce_where_expr { public: @@ -458,7 +378,14 @@ class reduce_min { } template KOKKOS_INLINE_FUNCTION auto on_host_serial(T const& a, MaskType mask) const { - return hmin().on_host_serial(a, mask); + auto w = Kokkos::Experimental::where(mask, a); + auto const& v = w.impl_get_value(); + auto const& m = w.impl_get_mask(); + auto result = v[0]; + for (std::size_t i = 1; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; } template @@ -468,7 +395,14 @@ class reduce_min { template KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a, MaskType mask) const { - return hmin().on_device_serial(a, mask); + auto w = Kokkos::Experimental::where(mask, a); + auto const& v = w.impl_get_value(); + auto const& m = w.impl_get_mask(); + auto result = v[0]; + for (std::size_t i = 1; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; } }; @@ -480,7 +414,14 @@ class reduce_max { } template KOKKOS_INLINE_FUNCTION auto on_host_serial(T const& a, MaskType mask) const { - return hmax().on_host_serial(a, mask); + auto w = Kokkos::Experimental::where(mask, a); + auto const& v = w.impl_get_value(); + auto const& m = w.impl_get_mask(); + auto result = v[0]; + for (std::size_t i = 1; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; } template @@ -490,7 +431,14 @@ class reduce_max { template KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a, MaskType mask) const { - return hmax().on_device_serial(a, mask); + auto w = Kokkos::Experimental::where(mask, a); + auto const& v = w.impl_get_value(); + auto const& m = w.impl_get_mask(); + auto result = v[0]; + for (std::size_t i = 1; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; } }; diff --git a/simd/unit_tests/include/TestSIMD_Reductions.hpp b/simd/unit_tests/include/TestSIMD_Reductions.hpp index f9c10c0e235..baf1e7c88cc 100644 --- a/simd/unit_tests/include/TestSIMD_Reductions.hpp +++ b/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -57,8 +57,6 @@ inline void host_check_reduction_all_loaders(ReductionOp reduce_op, template inline void host_check_all_reductions(const DataType (&args)[n]) { - host_check_reduction_all_loaders(hmin(), n, args); - host_check_reduction_all_loaders(hmax(), n, args); host_check_reduction_all_loaders(reduce_where_expr>(), n, args); host_check_reduction_all_loaders(reduce_where_expr>(), @@ -138,8 +136,6 @@ KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( template KOKKOS_INLINE_FUNCTION void device_check_all_reductions( const DataType (&args)[n]) { - device_check_reduction_all_loaders(hmin(), n, args); - device_check_reduction_all_loaders(hmax(), n, args); device_check_reduction_all_loaders(reduce_where_expr>(), n, args); device_check_reduction_all_loaders(