From 47d4df9a452371ca815e960feb1e236e0b41e216 Mon Sep 17 00:00:00 2001 From: Ryo Date: Thu, 29 Feb 2024 10:05:15 +0000 Subject: [PATCH] Fix qdmlal instructions qdmlal instructions were implemented without saturation. This has been fixed by utilising existing SIMDe saturating mult and add instructions. Unit tests have been updated to test for all possible saturation cases. - Fix qdmlal, qdmlal_n, qdmlal_lane, qdmlal_high, qdmlal_high_n and qdmlal_high_lane - Update unit tests for qdmlal, qdmlal_n, qdmlal_lane, qdmlal_high, qdmlal_high_n, qdmala_high_lane Change-Id: I8d0d8cfba3f8d5203f2028efbe74b00c51485c61 --- simde/arm/neon/qdmlal.h | 19 ++-- simde/arm/neon/qdmlal_high.h | 22 +--- simde/arm/neon/qdmlal_high_lane.h | 67 +++---------- simde/arm/neon/qdmlal_high_n.h | 23 +---- test/arm/neon/qdmlal.c | 36 +++++++ test/arm/neon/qdmlal_high.c | 14 +++ test/arm/neon/qdmlal_high_lane.c | 160 ++++++++++++++++++------------ test/arm/neon/qdmlal_high_n.c | 13 +++ test/arm/neon/qdmlal_lane.c | 94 ++++++++++++++++++ test/arm/neon/qdmlal_n.c | 12 +++ 10 files changed, 290 insertions(+), 170 deletions(-) diff --git a/simde/arm/neon/qdmlal.h b/simde/arm/neon/qdmlal.h index fe96b0fc8..121d7eed8 100644 --- a/simde/arm/neon/qdmlal.h +++ b/simde/arm/neon/qdmlal.h @@ -27,12 +27,9 @@ #if !defined(SIMDE_ARM_NEON_QDMLAL_H) #define SIMDE_ARM_NEON_QDMLAL_H -#include "add.h" -#include "mul.h" -#include "mul_n.h" -#include "movl.h" -#include "qadd.h" #include "types.h" +#include "qadd.h" +#include "qdmull.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -44,7 +41,7 @@ simde_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqdmlalh_s16(a, b, c); #else - return HEDLEY_STATIC_CAST(int32_t, b) * HEDLEY_STATIC_CAST(int32_t, c) * 2 + a; + return simde_vqadds_s32(a, simde_vqdmullh_s16(b, c)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -58,7 +55,7 @@ simde_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqdmlals_s32(a, b, c); #else - return HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c) * 2 + a; + return simde_vqaddd_s64(a, simde_vqdmulls_s32(b, c)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -72,8 +69,7 @@ simde_vqdmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqdmlal_s16(a, b, c); #else - simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)); - return simde_vqaddq_s32(simde_vqaddq_s32(temp, temp), a); + return simde_vqaddq_s32(simde_vqdmull_s16(b, c), a); #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -87,10 +83,7 @@ simde_vqdmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqdmlal_s32(a, b, c); #else - simde_int64x2_t r = simde_x_vmulq_s64( - simde_vmovl_s32(b), - simde_vmovl_s32(c)); - return simde_vqaddq_s64(a, simde_vqaddq_s64(r, r)); + return simde_vqaddq_s64(simde_vqdmull_s32(b, c), a); #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/qdmlal_high.h b/simde/arm/neon/qdmlal_high.h index 016deb011..44edc9d06 100644 --- a/simde/arm/neon/qdmlal_high.h +++ b/simde/arm/neon/qdmlal_high.h @@ -27,10 +27,9 @@ #if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H) #define SIMDE_ARM_NEON_QDMLAL_HIGH_H -#include "movl_high.h" -#include "mla.h" -#include "mul_n.h" #include "types.h" +#include "qadd.h" +#include "qdmull_high.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -42,10 +41,7 @@ simde_vqdmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqdmlal_high_s16(a, b, c); #else - return simde_vaddq_s32( - simde_vmulq_n_s32( - simde_vmulq_s32( - simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)), 2), a); + return simde_vqaddq_s32(simde_vqdmull_high_s16(b, c), a); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -59,17 +55,7 @@ simde_vqdmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqdmlal_high_s32(a, b, c); #else - simde_int64x2_private r_ = simde_int64x2_to_private( - simde_x_vmulq_s64( - simde_vmovl_high_s32(b), - simde_vmovl_high_s32(c))); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); - } - - return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); + return simde_vqaddq_s64(simde_vqdmull_high_s32(b, c), a); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/simde/arm/neon/qdmlal_high_lane.h b/simde/arm/neon/qdmlal_high_lane.h index b2d6a8b42..279cf5880 100644 --- a/simde/arm/neon/qdmlal_high_lane.h +++ b/simde/arm/neon/qdmlal_high_lane.h @@ -27,92 +27,49 @@ #if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H) #define SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H -#include "movl_high.h" -#include "add.h" -#include "mul.h" -#include "mul_n.h" -#include "dup_n.h" -#include "mla.h" +#include "dup_lane.h" +#include "get_high.h" #include "types.h" +#include "qdmlal.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ -SIMDE_FUNCTION_ATTRIBUTES -simde_int32x4_t -simde_vqdmlal_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { - return simde_vaddq_s32( - simde_vmulq_n_s32( - simde_vmulq_s32( - simde_vmovl_high_s16(b), - simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]))), 2), a); -} #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqdmlal_high_lane_s16(a, b, v, lane) vqdmlal_high_lane_s16(a, b, v, lane) +#else + #define simde_vqdmlal_high_lane_s16(a, b, v, lane) simde_vqdmlal_s16((a), simde_vget_high_s16((b)), simde_vdup_lane_s16((v), (lane))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqdmlal_high_lane_s16 - #define vqdmlal_high_lane_s16(a, b, v, lane) simde_vqdmlal_high_lane_s16((a), (b), (v), (lane)) + #define vqdmlal_high_lane_s16(a, b, c, lane) simde_vqdmlal_high_lane_s16((a), (b), (c), (lane)) #endif -SIMDE_FUNCTION_ATTRIBUTES -simde_int32x4_t -simde_vqdmlal_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { - return simde_vaddq_s32( - simde_vmulq_n_s32( - simde_vmulq_s32( - simde_vmovl_high_s16(b), - simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]))), 2), a); -} #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqdmlal_high_laneq_s16(a, b, v, lane) vqdmlal_high_laneq_s16(a, b, v, lane) +#else + #define simde_vqdmlal_high_laneq_s16(a, b, v, lane) simde_vqdmlal_s16((a), simde_vget_high_s16((b)), simde_vdup_laneq_s16((v), (lane))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqdmlal_high_laneq_s16 #define vqdmlal_high_laneq_s16(a, b, v, lane) simde_vqdmlal_high_laneq_s16((a), (b), (v), (lane)) #endif -SIMDE_FUNCTION_ATTRIBUTES -simde_int64x2_t -simde_vqdmlal_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { - simde_int64x2_private r_ = simde_int64x2_to_private( - simde_x_vmulq_s64( - simde_vmovl_high_s32(b), - simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])))); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); - } - - return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); -} #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqdmlal_high_lane_s32(a, b, v, lane) vqdmlal_high_lane_s32(a, b, v, lane) +#else + #define simde_vqdmlal_high_lane_s32(a, b, v, lane) simde_vqdmlal_s32((a), simde_vget_high_s32((b)), simde_vdup_lane_s32((v), (lane))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqdmlal_high_lane_s32 #define vqdmlal_high_lane_s32(a, b, v, lane) simde_vqdmlal_high_lane_s32((a), (b), (v), (lane)) #endif -SIMDE_FUNCTION_ATTRIBUTES -simde_int64x2_t -simde_vqdmlal_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { - simde_int64x2_private r_ = simde_int64x2_to_private( - simde_x_vmulq_s64( - simde_vmovl_high_s32(b), - simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])))); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); - } - - return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); -} #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqdmlal_high_laneq_s32(a, b, v, lane) vqdmlal_high_laneq_s32(a, b, v, lane) +#else + #define simde_vqdmlal_high_laneq_s32(a, b, v, lane) simde_vqdmlal_s32((a), simde_vget_high_s32((b)), simde_vdup_laneq_s32((v), (lane))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqdmlal_high_laneq_s32 diff --git a/simde/arm/neon/qdmlal_high_n.h b/simde/arm/neon/qdmlal_high_n.h index 205cafbcc..4e9c32203 100644 --- a/simde/arm/neon/qdmlal_high_n.h +++ b/simde/arm/neon/qdmlal_high_n.h @@ -27,12 +27,9 @@ #if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H) #define SIMDE_ARM_NEON_QDMLAL_HIGH_N_H -#include "movl_high.h" #include "dup_n.h" -#include "add.h" -#include "mul.h" -#include "mul_n.h" #include "types.h" +#include "qdmlal_high.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -44,11 +41,7 @@ simde_vqdmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqdmlal_high_n_s16(a, b, c); #else - return simde_vaddq_s32( - simde_vmulq_n_s32( - simde_vmulq_s32( - simde_vmovl_high_s16(b), - simde_vmovl_high_s16(simde_vdupq_n_s16(c))), 2), a); + return simde_vqdmlal_high_s16(a, b, simde_vdupq_n_s16(c)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -62,17 +55,7 @@ simde_vqdmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqdmlal_high_n_s32(a, b, c); #else - simde_int64x2_private r_ = simde_int64x2_to_private( - simde_x_vmulq_s64( - simde_vmovl_high_s32(b), - simde_vmovl_high_s32(simde_vdupq_n_s32(c)))); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); - } - - return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); + return simde_vqdmlal_high_s32(a, b, simde_vdupq_n_s32(c)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/test/arm/neon/qdmlal.c b/test/arm/neon/qdmlal.c index f1c629abe..945e346b7 100644 --- a/test/arm/neon/qdmlal.c +++ b/test/arm/neon/qdmlal.c @@ -43,6 +43,18 @@ test_simde_vqdmlalh_s16 (SIMDE_MUNIT_TEST_ARGS) { { INT16_C( 6764) }, { -INT16_C( 707) }, { -INT32_C( 6880798) } }, + { { INT32_C( INT32_MAX) }, + { INT16_C( 1) }, + { INT16_C( 1) }, + { INT32_C( INT32_MAX) } }, + { { INT32_C( INT32_MIN) }, + { INT16_C( 1) }, + { -INT16_C( 1) }, + { INT32_C( INT32_MIN) } }, + { { INT32_C( 0) }, + { INT16_C( INT16_MIN) }, + { INT16_C( INT16_MIN) }, + { INT32_C( INT32_MAX) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -94,6 +106,18 @@ test_simde_vqdmlals_s32 (SIMDE_MUNIT_TEST_ARGS) { { INT32_C( 2995714) }, { -INT32_C( 3814223) }, { -INT64_C( 22853477950349) } }, + { { INT64_MAX }, + { INT32_C( 1) }, + { INT32_C( 1) }, + { INT64_MAX } }, + { { INT64_MIN }, + { INT32_C( 1) }, + { -INT32_C( 1) }, + { INT64_MIN } }, + { { INT64_C( 0) }, + { INT32_C( INT32_MIN) }, + { INT32_C( INT32_MIN) }, + { INT64_MAX } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -149,6 +173,10 @@ test_simde_vqdmlal_s16 (SIMDE_MUNIT_TEST_ARGS) { { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }, { INT32_C(2147483631), INT32_C(2147483632), INT32_C(2147483633), INT32_C(2147483634) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( 0), -INT32_C( 68184) }, + { INT16_C( 1), -INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 9252) }, + { INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 5749) }, + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 106311312) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -207,6 +235,14 @@ test_simde_vqdmlal_s32 (SIMDE_MUNIT_TEST_ARGS) { { INT32_MIN, INT32_MIN }, { INT32_MIN, INT32_MIN }, {INT64_C(9223372036854775791), INT64_C(9223372036854775792) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1), INT32_C( 1) }, + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), -INT64_C( 68184) }, + { INT32_C( INT32_MIN), INT32_C( 9252) }, + { INT32_C( INT32_MIN), INT32_C( 5749) }, + { INT64_MAX, INT64_C( 106311312) } } }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/qdmlal_high.c b/test/arm/neon/qdmlal_high.c index c928e05c6..75c3b9b16 100644 --- a/test/arm/neon/qdmlal_high.c +++ b/test/arm/neon/qdmlal_high.c @@ -59,6 +59,12 @@ test_simde_vqdmlal_high_s16 (SIMDE_MUNIT_TEST_ARGS) { { -INT16_C( 9269), -INT16_C( 5310), INT16_C( 5746), INT16_C( 4013), INT16_C( 5760), INT16_C( 4110), INT16_C( 8914), -INT16_C( 764) }, { -INT32_C( 74794532), INT32_C( 36362128), INT32_C( 94724016), INT32_C( 13825770) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( 0), INT32_C( 5368290) }, + { -INT16_C( 9903), -INT16_C( 7336), INT16_C( 1785), INT16_C( 5751), + INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), -INT16_C( 5535) }, + { -INT16_C( 9269), -INT16_C( 5310), INT16_C( 5746), INT16_C( 4013), + INT16_C( 1), -INT16_C( 1), INT16_C( INT16_MIN), -INT16_C( 764) }, + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 13825770) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -113,6 +119,14 @@ test_simde_vqdmlal_high_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 759050), -INT32_C( 437291), INT32_C( 207575), -INT32_C( 177006) }, { -INT32_C( 262650), INT32_C( 912777), INT32_C( 556302), -INT32_C( 41245) }, { INT64_C( 231133969127), INT64_C( 14599655586) } }, + { { INT64_MAX, INT64_MIN }, + { -INT32_C( 759050), -INT32_C( 437291), INT32_C( 1), INT32_C( 1) }, + { -INT32_C( 262650), INT32_C( 912777), INT32_C( 1), -INT32_C( 1) }, + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), -INT64_C( 1569354) }, + { -INT32_C( 759050), -INT32_C( 437291), INT32_C( INT32_MIN), -INT32_C( 177006) }, + { -INT32_C( 262650), INT32_C( 912777), INT32_C( INT32_MIN), -INT32_C( 41245) }, + { INT64_MAX, INT64_C( 14599655586) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/qdmlal_high_lane.c b/test/arm/neon/qdmlal_high_lane.c index 76d93bb86..d19dfb9cc 100644 --- a/test/arm/neon/qdmlal_high_lane.c +++ b/test/arm/neon/qdmlal_high_lane.c @@ -60,6 +60,12 @@ test_simde_vqdmlal_high_lane_s16 (SIMDE_MUNIT_TEST_ARGS) { { INT16_C( 8706), INT16_C( 241), -INT16_C( 8993), -INT16_C( 4041) }, INT8_C( 2), { -INT32_C(136949241), -INT32_C( 46416569), -INT32_C( 54308817), -INT32_C( 69372172) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), -INT32_C( 0), INT32_C( 0) }, + { -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 0), + -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 0) }, + { INT16_C( INT16_MIN), INT16_C( 0), INT16_C( 0), INT16_C( 0) }, + INT8_C( 0), + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -140,6 +146,13 @@ test_simde_vqdmlal_high_laneq_s16 (SIMDE_MUNIT_TEST_ARGS) { -INT16_C( 6431), INT16_C( 9768), -INT16_C( 6471), INT16_C( 2563) }, INT8_C( 0), { INT32_C( 1788558), INT32_C( 63364), -INT32_C( 132192), -INT32_C( 2102687) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), -INT32_C( 0), INT32_C( 0) }, + { -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 0), + -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 0) }, + { INT16_C( INT16_MIN), INT16_C( 0), INT16_C( 0), INT16_C( 0), + INT16_C( INT16_MIN), INT16_C( 0), INT16_C( 0), INT16_C( 0) }, + INT8_C( 0), + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -155,70 +168,6 @@ test_simde_vqdmlal_high_laneq_s16 (SIMDE_MUNIT_TEST_ARGS) { return 0; } -static int -test_simde_vqdmlal_high_laneq_s32 (SIMDE_MUNIT_TEST_ARGS) { - static const struct { - int64_t a[2]; - int32_t b[4]; - int32_t v[4]; - int8_t lane; - int64_t r[2]; - } test_vec[] = { - { { INT64_C( 13898671), -INT64_C( 28702086) }, - { -INT32_C( 263510), -INT32_C( 347579), INT32_C( 942651), -INT32_C( 19214) }, - { -INT32_C( 985912), INT32_C( 863503), -INT32_C( 864486), -INT32_C( 67563) }, - INT8_C( 3), - { -INT64_C( 127362760355), INT64_C( 2567608878) } }, - { { -INT64_C( 49550599), INT64_C( 12924804) }, - { INT32_C( 569339), INT32_C( 916303), INT32_C( 322651), -INT32_C( 403309) }, - { INT32_C( 68174), -INT32_C( 513605), INT32_C( 1763), -INT32_C( 583573) }, - INT8_C( 1), - { -INT64_C( 331479884309), INT64_C( 414295962694) } }, - { { INT64_C( 33224957), -INT64_C( 60263322) }, - { -INT32_C( 733914), INT32_C( 345428), -INT32_C( 188026), -INT32_C( 361972) }, - { -INT32_C( 436531), INT32_C( 676538), INT32_C( 34787), -INT32_C( 755451) }, - INT8_C( 2), - { -INT64_C( 13048495967), -INT64_C( 25244103250) } }, - { { -INT64_C( 52374386), -INT64_C( 63935054) }, - { INT32_C( 407419), -INT32_C( 463334), -INT32_C( 37967), INT32_C( 535562) }, - { -INT32_C( 286385), -INT32_C( 865597), INT32_C( 573606), INT32_C( 589682) }, - INT8_C( 0), - { INT64_C( 21693984204), -INT64_C( 306817781794) } }, - { { -INT64_C( 6598636), INT64_C( 73031330) }, - { INT32_C( 493558), INT32_C( 443848), -INT32_C( 419461), INT32_C( 376534) }, - { -INT32_C( 860362), -INT32_C( 176951), INT32_C( 253114), INT32_C( 41359) }, - INT8_C( 1), - { INT64_C( 148441488186), -INT64_C( 133183104338) } }, - { { -INT64_C( 71693657), INT64_C( 31801833) }, - { -INT32_C( 640025), INT32_C( 582287), INT32_C( 257565), INT32_C( 667728) }, - { INT32_C( 984425), INT32_C( 972052), INT32_C( 466460), INT32_C( 325387) }, - INT8_C( 0), - { INT64_C( 507035156593), INT64_C(1314688074633) } }, - { { INT64_C( 80798879), -INT64_C( 5132023) }, - { INT32_C( 40841), INT32_C( 230578), -INT32_C( 63996), INT32_C( 386471) }, - { -INT32_C( 872708), -INT32_C( 69206), INT32_C( 475254), -INT32_C( 518991) }, - INT8_C( 3), - { INT64_C( 66507494951), -INT64_C( 401155073545) } }, - { { -INT64_C( 7962780), INT64_C( 26698152) }, - { -INT32_C( 245699), -INT32_C( 779577), INT32_C( 614398), INT32_C( 990465) }, - { -INT32_C( 682597), INT32_C( 354148), INT32_C( 119693), -INT32_C( 196681) }, - INT8_C( 0), - { -INT64_C( 838780425992), -INT64_C(1352150177058) } }, - }; - - for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { - simde_int64x2_t a = simde_vld1q_s64(test_vec[i].a); - simde_int32x4_t b = simde_vld1q_s32(test_vec[i].b); - simde_int32x4_t v = simde_vld1q_s32(test_vec[i].v); - simde_int64x2_t r; - SIMDE_CONSTIFY_4_(simde_vqdmlal_high_laneq_s32, r, (HEDLEY_UNREACHABLE(), r), test_vec[i].lane, a, b, v); - - simde_test_arm_neon_assert_equal_i64x2(r, simde_vld1q_s64(test_vec[i].r)); - } - - return 0; -} - static int test_simde_vqdmlal_high_lane_s32 (SIMDE_MUNIT_TEST_ARGS) { static const struct { @@ -268,6 +217,16 @@ test_simde_vqdmlal_high_lane_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 562898), -INT32_C( 405129) }, INT8_C( 0), { INT64_C( 189650942394), INT64_C( 206619199558) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 1), -INT32_C( 1), INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1), -INT32_C( 49136) }, + INT8_C( 0), + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), INT64_C( 0) }, + { INT32_C( INT32_MIN), -INT32_C( 0), INT32_C( INT32_MIN), -INT32_C( 0) }, + { INT32_C( INT32_MIN), -INT32_C( 0) }, + INT8_C( 0), + { INT64_MAX, INT64_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -283,6 +242,79 @@ test_simde_vqdmlal_high_lane_s32 (SIMDE_MUNIT_TEST_ARGS) { return 0; } +static int +test_simde_vqdmlal_high_laneq_s32 (SIMDE_MUNIT_TEST_ARGS) { + static const struct { + int64_t a[2]; + int32_t b[4]; + int32_t v[4]; + int8_t lane; + int64_t r[2]; + } test_vec[] = { + { { INT64_C( 13898671), -INT64_C( 28702086) }, + { -INT32_C( 263510), -INT32_C( 347579), INT32_C( 942651), -INT32_C( 19214) }, + { -INT32_C( 985912), INT32_C( 863503), -INT32_C( 864486), -INT32_C( 67563) }, + INT8_C( 3), + { -INT64_C( 127362760355), INT64_C( 2567608878) } }, + { { -INT64_C( 49550599), INT64_C( 12924804) }, + { INT32_C( 569339), INT32_C( 916303), INT32_C( 322651), -INT32_C( 403309) }, + { INT32_C( 68174), -INT32_C( 513605), INT32_C( 1763), -INT32_C( 583573) }, + INT8_C( 1), + { -INT64_C( 331479884309), INT64_C( 414295962694) } }, + { { INT64_C( 33224957), -INT64_C( 60263322) }, + { -INT32_C( 733914), INT32_C( 345428), -INT32_C( 188026), -INT32_C( 361972) }, + { -INT32_C( 436531), INT32_C( 676538), INT32_C( 34787), -INT32_C( 755451) }, + INT8_C( 2), + { -INT64_C( 13048495967), -INT64_C( 25244103250) } }, + { { -INT64_C( 52374386), -INT64_C( 63935054) }, + { INT32_C( 407419), -INT32_C( 463334), -INT32_C( 37967), INT32_C( 535562) }, + { -INT32_C( 286385), -INT32_C( 865597), INT32_C( 573606), INT32_C( 589682) }, + INT8_C( 0), + { INT64_C( 21693984204), -INT64_C( 306817781794) } }, + { { -INT64_C( 6598636), INT64_C( 73031330) }, + { INT32_C( 493558), INT32_C( 443848), -INT32_C( 419461), INT32_C( 376534) }, + { -INT32_C( 860362), -INT32_C( 176951), INT32_C( 253114), INT32_C( 41359) }, + INT8_C( 1), + { INT64_C( 148441488186), -INT64_C( 133183104338) } }, + { { -INT64_C( 71693657), INT64_C( 31801833) }, + { -INT32_C( 640025), INT32_C( 582287), INT32_C( 257565), INT32_C( 667728) }, + { INT32_C( 984425), INT32_C( 972052), INT32_C( 466460), INT32_C( 325387) }, + INT8_C( 0), + { INT64_C( 507035156593), INT64_C(1314688074633) } }, + { { INT64_C( 80798879), -INT64_C( 5132023) }, + { INT32_C( 40841), INT32_C( 230578), -INT32_C( 63996), INT32_C( 386471) }, + { -INT32_C( 872708), -INT32_C( 69206), INT32_C( 475254), -INT32_C( 518991) }, + INT8_C( 3), + { INT64_C( 66507494951), -INT64_C( 401155073545) } }, + { { -INT64_C( 7962780), INT64_C( 26698152) }, + { -INT32_C( 245699), -INT32_C( 779577), INT32_C( 614398), INT32_C( 990465) }, + { -INT32_C( 682597), INT32_C( 354148), INT32_C( 119693), -INT32_C( 196681) }, + INT8_C( 0), + { -INT64_C( 838780425992), -INT64_C(1352150177058) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 1), -INT32_C( 1), INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1), -INT32_C( 49136), INT32_C( 1), -INT32_C( 1) }, + INT8_C( 0), + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), INT64_C( 0) }, + { INT32_C( INT32_MIN), -INT32_C( 0), INT32_C( INT32_MIN), -INT32_C( 0) }, + { INT32_C( INT32_MIN), -INT32_C( 0), INT32_C( INT32_MIN), -INT32_C( 0) }, + INT8_C( 0), + { INT64_MAX, INT64_C( 0) } }, + }; + + for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { + simde_int64x2_t a = simde_vld1q_s64(test_vec[i].a); + simde_int32x4_t b = simde_vld1q_s32(test_vec[i].b); + simde_int32x4_t v = simde_vld1q_s32(test_vec[i].v); + simde_int64x2_t r; + SIMDE_CONSTIFY_4_(simde_vqdmlal_high_laneq_s32, r, (HEDLEY_UNREACHABLE(), r), test_vec[i].lane, a, b, v); + + simde_test_arm_neon_assert_equal_i64x2(r, simde_vld1q_s64(test_vec[i].r)); + } + + return 0; +} SIMDE_TEST_FUNC_LIST_BEGIN SIMDE_TEST_FUNC_LIST_ENTRY(vqdmlal_high_lane_s16) diff --git a/test/arm/neon/qdmlal_high_n.c b/test/arm/neon/qdmlal_high_n.c index 67cdaf411..b1d3d5e55 100644 --- a/test/arm/neon/qdmlal_high_n.c +++ b/test/arm/neon/qdmlal_high_n.c @@ -51,6 +51,11 @@ test_simde_vqdmlal_high_n_s16 (SIMDE_MUNIT_TEST_ARGS) { INT16_C( 485), INT16_C( 5900), -INT16_C( 5869), -INT16_C( 1878) }, INT16_C( 7731), { INT32_C( 7413265), INT32_C( 90658449), -INT32_C( 90521272), -INT32_C( 29754396) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( 0), -INT32_C( 0) }, + { -INT16_C( 6426), -INT16_C( 6271), -INT16_C( 104), -INT16_C( 6015), + -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), -INT16_C( 0) }, + INT16_C( INT16_MIN), + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -105,6 +110,14 @@ test_simde_vqdmlal_high_n_s32 (SIMDE_MUNIT_TEST_ARGS) { { INT32_C( 535706), INT32_C( 97715), -INT32_C( 835258), -INT32_C( 994676) }, -INT32_C( 559353), { INT64_C( 934440220790), INT64_C(1112700840082) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 535706), INT32_C( 97715), INT32_C( 1), -INT32_C( 1) }, + INT32_C( 1), + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), INT64_C( 0) }, + { INT32_C( 535706), INT32_C( 97715), INT32_C( INT32_MIN), INT32_C( 0) }, + INT32_C( INT32_MIN), + { INT64_MAX, INT64_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/qdmlal_lane.c b/test/arm/neon/qdmlal_lane.c index 5f12ff146..224cf098c 100644 --- a/test/arm/neon/qdmlal_lane.c +++ b/test/arm/neon/qdmlal_lane.c @@ -54,6 +54,21 @@ test_simde_vqdmlalh_lane_s16 (SIMDE_MUNIT_TEST_ARGS) { { -INT16_C( 4899), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, INT8_C( 0), { -INT32_C( 364754531) } }, + { { INT32_C( INT32_MAX) }, + { INT16_C( 1) }, + { INT16_C( 1), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, + INT8_C( 0), + { INT32_C( INT32_MAX) } }, + { { INT32_C( INT32_MIN) }, + { INT16_C( 1) }, + { -INT16_C( 1), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, + INT8_C( 0), + { INT32_C( INT32_MIN) } }, + { { INT32_C( 0) }, + { INT16_C( INT16_MIN) }, + { INT16_C( INT16_MIN), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, + INT8_C( 0), + { INT32_C( INT32_MAX) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -122,6 +137,21 @@ test_simde_vqdmlals_lane_s32 (SIMDE_MUNIT_TEST_ARGS) { { INT32_C( 310553), -INT32_C( 927821)}, INT8_C( 0), { -INT64_C( 428242251620) } }, + { { INT64_MAX }, + { INT32_C( 1) }, + { INT32_C( 1), INT32_C( 1026)}, + INT8_C( 0), + { INT64_MAX } }, + { { INT64_MIN }, + { INT32_C( 1) }, + { -INT32_C( 1), INT32_C( 1026)}, + INT8_C( 0), + { INT64_MIN } }, + { { 0 }, + { INT32_C( INT32_MIN) }, + { INT32_C( INT32_MIN), INT32_C( 1026)}, + INT8_C( 0), + { INT64_MAX } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -196,6 +226,24 @@ test_simde_vqdmlalh_laneq_s16 (SIMDE_MUNIT_TEST_ARGS) { INT16_C( 2898), -INT16_C( 6022), INT16_C( 9230), -INT16_C( 3066)}, INT8_C( 0), { -INT32_C( 791647416) } }, + { { INT32_C( INT32_MAX) }, + { INT16_C( 1) }, + { INT16_C( 1), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523), + INT16_C( 1), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, + INT8_C( 0), + { INT32_C( INT32_MAX) } }, + { { INT32_C( INT32_MIN) }, + { INT16_C( 1) }, + { -INT16_C( 1), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523), + -INT16_C( 1), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, + INT8_C( 0), + { INT32_C( INT32_MIN) } }, + { { INT32_C( 0) }, + { INT16_C( INT16_MIN) }, + { INT16_C( INT16_MIN), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523), + INT16_C( INT16_MIN), INT16_C( 1026), -INT16_C( 4249), -INT16_C( 6523)}, + INT8_C( 0), + { INT32_C( INT32_MAX) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -268,6 +316,21 @@ test_simde_vqdmlals_laneq_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 217936), -INT32_C( 918247), -INT32_C( 237518), -INT32_C( 18738)}, INT8_C( 0), { INT64_C( 274066566040) } }, + { { INT64_MAX }, + { INT32_C( 1) }, + { INT32_C( 1), INT32_C( 1026), -INT32_C( 4249), -INT32_C( 6523)}, + INT8_C( 0), + { INT64_MAX } }, + { { INT64_MIN }, + { INT32_C( 1) }, + { -INT32_C( 1), INT32_C( 1026), -INT32_C( 4249), -INT32_C( 6523)}, + INT8_C( 0), + { INT64_MIN } }, + { { INT64_C( 0) }, + { INT32_C( INT32_MIN) }, + { INT32_C( INT32_MIN), INT32_C( 1026), -INT32_C( 4249), -INT32_C( 6523)}, + INT8_C( 0), + { INT64_MAX } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -336,6 +399,11 @@ test_simde_vqdmlal_lane_s16 (SIMDE_MUNIT_TEST_ARGS) { { -INT16_C( 7119), -INT16_C( 6426), INT16_C( 5367), -INT16_C( 6430)}, INT8_C( 3), { -INT32_C( 695830361), INT32_C( 482871752), -INT32_C( 400665364), -INT32_C( 795825651) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), -INT32_C( 0), INT32_C( 0) }, + { -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 0) }, + { INT16_C( INT16_MIN), INT16_C( 0), INT16_C( 0), INT16_C( 0) }, + INT8_C( 0), + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -406,6 +474,16 @@ test_simde_vqdmlal_lane_s32 (SIMDE_MUNIT_TEST_ARGS) { { INT32_C( 961008), -INT32_C( 49136)}, INT8_C( 1), { INT64_C( 561897490048), INT64_C( 713086479032) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1), -INT32_C( 49136) }, + INT8_C( 0), + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), INT64_C( 0) }, + { INT32_C( INT32_MIN), -INT32_C( 0) }, + { INT32_C( INT32_MIN), -INT32_C( 0)}, + INT8_C( 0), + { INT64_MAX, INT64_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -482,6 +560,12 @@ test_simde_vqdmlal_laneq_s16 (SIMDE_MUNIT_TEST_ARGS) { -INT16_C( 7423), INT16_C( 3231), -INT16_C( 6303), INT16_C( 9659)}, INT8_C( 6), { INT32_C( 910982024), -INT32_C(1011414068), -INT32_C( 502332473), INT32_C( 790086285) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( 0), INT32_C( 0) }, + { -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), -INT16_C( 0) }, + { INT16_C( INT16_MIN), INT16_C( 0), INT16_C( 0), INT16_C( 0), + -INT16_C( 7423), INT16_C( 3231), -INT16_C( 6303), INT16_C( 0)}, + INT8_C( 0), + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -556,6 +640,16 @@ test_simde_vqdmlal_laneq_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 57677), -INT32_C( 975727), INT32_C( 913570), INT32_C( 949988)}, INT8_C( 0), { INT64_C( 99902036748), -INT64_C( 189027155595) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 1), -INT32_C( 1) }, + { INT32_C( 1), -INT32_C( 975727), INT32_C( 913570), INT32_C( 949988)}, + INT8_C( 0), + { INT64_MAX, INT64_MIN } }, + { { 0, 0 }, + { INT32_MIN, 0 }, + { INT32_C( INT32_MIN), -INT32_C( 975727), INT32_C( 913570), INT32_C( 949988)}, + INT8_C( 0), + { INT64_MAX, 0 } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { diff --git a/test/arm/neon/qdmlal_n.c b/test/arm/neon/qdmlal_n.c index 24cada709..6af14065f 100644 --- a/test/arm/neon/qdmlal_n.c +++ b/test/arm/neon/qdmlal_n.c @@ -43,6 +43,10 @@ test_simde_vqdmlal_n_s16 (SIMDE_MUNIT_TEST_ARGS) { { -INT16_C( 7412), -INT16_C( 248), -INT16_C( 2162), INT16_C( 8922) }, INT16_C( 3247), { INT32_C( 930198035), INT32_C( 477577394), -INT32_C( 953519945), -INT32_C( 268739455) } }, + { { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( 0), INT32_C( 0) }, + { -INT16_C( 1), INT16_C( 1), INT16_C( INT16_MIN), INT16_C( 0) }, + INT16_C( INT16_MIN), + { INT32_C( INT32_MAX), INT32_C( INT32_MIN), INT32_C( INT32_MAX), INT32_C( 0) } }, }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) { @@ -97,6 +101,14 @@ test_simde_vqdmlal_n_s32 (SIMDE_MUNIT_TEST_ARGS) { { -INT32_C( 403731), -INT32_C( 674292) }, INT32_C( 504404), { -INT64_C( 393406359481), -INT64_C( 533824867094) } }, + { { INT64_MAX, INT64_MIN }, + { INT32_C( 1), -INT32_C( 1) }, + INT32_C( 1), + { INT64_MAX, INT64_MIN } }, + { { INT64_C( 0), INT64_C( 0) }, + { INT32_C( INT32_MIN), INT32_C( 0) }, + INT32_C( INT32_MIN), + { INT64_MAX, INT64_C( 0) } } }; for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {