diff --git a/source/Lib/CommonLib/arm/InterPredARM.h b/source/Lib/CommonLib/arm/InterPredARM.h
index 543f9d2a..2da9599b 100644
--- a/source/Lib/CommonLib/arm/InterPredARM.h
+++ b/source/Lib/CommonLib/arm/InterPredARM.h
@@ -274,10 +274,10 @@ void InterPredInterpolation::_initInterPredictionARM()
   xFpBiDirOptFlow = BiOptFlowCoreARMSIMD;
 }
 
-#else
+#else
 
 template
-void TCoeffOps::_initInterPredictionARM()
+void InterPredInterpolation::_initInterPredictionARM()
 {}
 
 #endif
diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h
index c45ea350..8ae54249 100644
--- a/source/Lib/CommonLib/arm/RdCostARM.h
+++ b/source/Lib/CommonLib/arm/RdCostARM.h
@@ -69,15 +69,11 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc
 {
-
-static int32x4_t neon_madd_16 (int16x8_t a, int16x8_t b) {
-
-  int32x4_t sum = vdupq_n_s32(0);
-  int32x4_t c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
-  int32x4_t d = vmull_high_s16((a), (b));
-  sum = vpaddq_s32(c,d);
-
-  return sum;
+static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
+{
+  int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
+  int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) );
+  return pairwise_add_s32x4( c, d );
 }
 
 #if defined( TARGET_SIMD_ARM )
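The rewritten neon_madd_16 keeps its pmaddwd-style semantics (eight int16 products, adjacent pairs summed into four int32 lanes) but drops the AArch64-only vmull_high_s16 and vpaddq_s32 intrinsics, so the header now also compiles for 32-bit Arm. The standalone sketch below mirrors that pattern together with the pairwise_add_s32x4 helper this patch adds to sum_neon.h at the end; the __aarch64__ guard and the main() driver are illustrative stand-ins, not code from the patch.

// Sketch of the widening multiply + pairwise add used by neon_madd_16.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Same shape as the helper this patch adds to sum_neon.h.
static inline int32x4_t pairwise_add_s32x4( int32x4_t a, int32x4_t b )
{
#if defined( __aarch64__ )
  return vpaddq_s32( a, b );                                         // single A64 instruction
#else
  int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_low_s32( b ) );
  int32x2_t hi = vpadd_s32( vget_high_s32( a ), vget_high_s32( b ) );
  return vcombine_s32( lo, hi );                                     // ARMv7 fallback
#endif
}

static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
{
  int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );   // products 0..3
  int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) ); // products 4..7
  return pairwise_add_s32x4( c, d );                                 // lane i = p[2i] + p[2i+1]
}

int main()
{
  int16_t a[ 8 ] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int16_t b[ 8 ] = { 1, 1, 1, 1, 2, 2, 2, 2 };
  int32_t out[ 4 ];
  vst1q_s32( out, neon_madd_16( vld1q_s16( a ), vld1q_s16( b ) ) );
  // Scalar reference: { 1*1+2*1, 3*1+4*1, 5*2+6*2, 7*2+8*2 } = { 3, 7, 22, 30 }
  printf( "%d %d %d %d\n", ( int )out[ 0 ], ( int )out[ 1 ], ( int )out[ 2 ], ( int )out[ 3 ] );
  return 0;
}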
@@ -1029,11 +1025,13 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
   {
     int16x8_t vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc1 ), vld1_s16( ( const int16_t* )( &pSrc1[iStrideSrc1] ) ) );
     int16x8_t vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc2 ), vld1_s16( ( const int16_t* )( &pSrc2[iStrideSrc2] ) ) );
-    int32x4_t vsum = vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 )) );
+    int32x4_t vsum =
+        vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) );
     vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc1[2 * iStrideSrc1] ) ), vld1_s16( ( const int16_t* )( &pSrc1[3 * iStrideSrc1] ) ) );
     vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc2[2 * iStrideSrc2] ) ), vld1_s16( ( const int16_t* )( &pSrc2[3 * iStrideSrc2] ) ) );
-    vsum = vaddq_s32( vsum, vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ));
-    uiSum = vaddvq_s32(vsum);
+    vsum = vaddq_s32(
+        vsum, vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ) );
+    uiSum = horizontal_add_s32x4( vsum );
   }
   else
   {
@@ -1047,7 +1045,7 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
       pSrc1 += iStrideSrc1;
       pSrc2 += iStrideSrc2;
     }
-    uiSum = vaddvq_s32(vsum32);
+    uiSum = horizontal_add_s32x4( vsum32 );
   }
 }
 else
@@ -1081,8 +1079,8 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
       }
 
       int32x4_t vsumtemp = vpaddlq_s16( vsum16);
-
-      if( earlyExitAllowed ) vsum32 = vpaddq_s32( vsum32, vsumtemp );
+
+      if( earlyExitAllowed ) vsum32 = pairwise_add_s32x4( vsum32, vsumtemp );
       else                   vsum32 = vaddq_s32 ( vsum32, vsumtemp );
 
       pSrc1 += iStrideSrc1;
@@ -1101,16 +1099,28 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
         checkExit--;
       }
     }
-    uiSum = vaddvq_s32(vsum32);
+    uiSum = horizontal_add_s32x4( vsum32 );
   }
 
   uiSum <<= iSubShift;
   return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
 }
 
-template
-Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
-{
+static inline int16x8_t reverse_vector_s16( int16x8_t x )
+{
+#if REAL_TARGET_AARCH64
+  static const uint8_t shuffle_table[ 16 ] = { 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 };
+  uint8x16_t shuffle_indices = vld1q_u8( shuffle_table );
+  return vreinterpretq_s16_u8( vqtbl1q_u8( vreinterpretq_u8_s16( x ), shuffle_indices ) );
+#else
+  int16x8_t rev_halves = vrev64q_s16( x );
+  return vcombine_s16( vget_high_s16( rev_halves ), vget_low_s16( rev_halves ) );
+#endif
+}
+
+template
+Distortion RdCost::xGetSADwMask_ARMSIMD( const DistParam& rcDtParam )
+{
   if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
     return RdCost::xGetSADwMask(rcDtParam);
 
@@ -1128,8 +1138,6 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
 
   Distortion sum = 0;
   int32x4_t vsum32 = vdupq_n_s32( 0 );
-  static const uint8_t shuffle_table[16] = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
-  uint8x16_t shuffle_vector = vld1q_u8(shuffle_table);
 
   for (int y = 0; y < rows; y += subStep)
   {
@@ -1140,10 +1148,8 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
       int16x8_t vmask;
       if (rcDtParam.stepX == -1)
       {
-        vmask = vld1q_s16( ( const int16_t* ) ((&weightMask[x]) - (x << 1) - (8 - 1)));
-        uint8x16_t input_vector = vreinterpretq_u8_s16(vmask);
-        uint8x16_t shuffled_vector = vqtbl1q_u8(input_vector, shuffle_vector);
-        vmask = vreinterpretq_s16_u8(shuffled_vector);
+        vmask = vld1q_s16( ( const int16_t* )( ( &weightMask[ x ] ) - ( x << 1 ) - ( 8 - 1 ) ) );
+        vmask = reverse_vector_s16( vmask );
       }
       else
       {
@@ -1155,12 +1161,11 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
     src2 += strideSrc2;
     weightMask += strideMask;
   }
-  sum = vaddvq_s32(vsum32);
+  sum = horizontal_add_s32x4( vsum32 );
 
   sum <<= subShift;
   return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
 }
-
 template
 void RdCost::_initRdCostARM()
 {
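reverse_vector_s16 factors the lane reversal used by the SAD-with-mask kernel out of the loop body and gives it an AArch32 fallback: the table lookup via vqtbl1q_u8 exists only on AArch64, whereas vrev64q_s16 plus a half swap is available everywhere. A self-contained sketch with a scalar expectation follows; the __aarch64__ guard and main() are illustrative substitutes for the patch's REAL_TARGET_AARCH64 macro and the encoder call site.

// Lane reversal of an int16x8_t: { a0..a7 } -> { a7..a0 }.
#include <arm_neon.h>
#include <cstdio>

static inline int16x8_t reverse_vector_s16( int16x8_t x )
{
#if defined( __aarch64__ )
  // One TBL lookup: byte indices pick lanes 7,6,...,0 (two bytes per lane).
  static const uint8_t shuffle_table[ 16 ] = { 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 };
  return vreinterpretq_s16_u8( vqtbl1q_u8( vreinterpretq_u8_s16( x ), vld1q_u8( shuffle_table ) ) );
#else
  // ARMv7: reverse within each 64-bit half, then swap the halves.
  int16x8_t rev_halves = vrev64q_s16( x );
  return vcombine_s16( vget_high_s16( rev_halves ), vget_low_s16( rev_halves ) );
#endif
}

int main()
{
  int16_t in[ 8 ] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int16_t out[ 8 ];
  vst1q_s16( out, reverse_vector_s16( vld1q_s16( in ) ) );
  for( int i = 0; i < 8; i++ )
    printf( "%d ", ( int )out[ i ] );   // expected: 7 6 5 4 3 2 1 0
  printf( "\n" );
  return 0;
}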
diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
index d23ab150..87b672b6 100644
--- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
+++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -56,17 +56,17 @@ POSSIBILITY OF SUCH DAMAGE.
 //! \ingroup CommonLib
 //! \{
 
-#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_AVX2
-#  define USE_AVX2
-#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE42
-#  define USE_SSE42
-#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE41
-#  define USE_SSE41
+#if defined( TARGET_SIMD_X86 )
+#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_AVX2
+#define USE_AVX2
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE42
+#define USE_SSE42
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE41
+#define USE_SSE41
 #endif
 
-#ifdef TARGET_SIMD_X86
 # include "../x86/InterpolationFilterX86.h"
-#endif
+#endif // defined( TARGET_SIMD_X86 )
 
 #if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF
@@ -528,49 +528,48 @@ static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_
       vsrc0 = vld1q_s16( ( const int16_t * )&src[col] );
       vsrc1 = vld1q_s16( ( const int16_t * )&src[col + 4] );
 
-      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[0]));
-      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[1]));
-      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[2]));
-      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[3]));
+      vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
+      vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
+      vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
+      vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );
 
-      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[0]));
-      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[1]));
-      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[2]));
-      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[3]));
+      vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
+      vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
+      vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
+      vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );
 
+      if( N == 8 )
+      {
+        vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
+        vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+        vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+        vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
+        vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );
+
+        vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+        vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+        vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
+        vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );
+      }
+      if( N == 6 )
+      {
+        vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
+        vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+        vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
 
-      if( N == 8 )
-      {
-        vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
-        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
-        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
-        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[6]));
-        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[7]));
-
-        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
-        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
-        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[6]));
-        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[7]));
-      }
-      if( N == 6 )
-      {
-        vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
-        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
-        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
+        vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+        vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+      }
 
-        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
-        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
-      }
-
-      vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
-      vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
-      vsum = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));
-
-      if( shiftBack )
-      {
-        vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
-      }
-      vst1q_s16((int16_t*) &dst[col], vsum);
+      vsuma = vshlq_s32( vsuma, vdupq_n_s32( -1 * shift ) );
+      vsumb = vshlq_s32( vsumb, vdupq_n_s32( -1 * shift ) );
+      vsum = vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );
+
+      if( shiftBack )
+      {
+        vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
+      }
+      vst1q_s16( ( int16_t* )&dst[ col ], vsum );
     }
     src += srcStride;
     dst += dstStride;
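Both the horizontal kernel above and the vertical kernel in the hunks that follow replace vmlal_high_s16, which exists only as an A64 intrinsic, with vmlal_s16 applied to an explicitly extracted 64-bit half, which AArch32 NEON also provides. The horizontal kernel additionally trades the high half of vextq_s16 by k for the low half of vextq_s16 by k-4; both select source lanes k-4..k-1. A minimal sketch of the accumulator equivalence (helper name and driver are illustrative):

// vmlal_high_s16( acc, a, b ) accumulates a[4..7]*b[4..7]; the spelling below
// does the same with intrinsics available on both ARMv7 and AArch64.
#include <arm_neon.h>
#include <cstdio>

static inline int32x4_t widen_mac_high_portable( int32x4_t acc, int16x8_t a, int16x8_t b )
{
  return vmlal_s16( acc, vget_high_s16( a ), vget_high_s16( b ) );
}

int main()
{
  int16_t a[ 8 ] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int16_t b[ 8 ] = { 9, 9, 9, 9, 2, 3, 4, 5 };
  int32_t out[ 4 ];
  vst1q_s32( out, widen_mac_high_portable( vdupq_n_s32( 100 ), vld1q_s16( a ), vld1q_s16( b ) ) );
  // acc lane i += a[4+i] * b[4+i]  ->  { 110, 118, 128, 140 }
  printf( "%d %d %d %d\n", ( int )out[ 0 ], ( int )out[ 1 ], ( int )out[ 2 ], ( int )out[ 3 ] );
  return 0;
}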
@@ -606,8 +605,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
     {
       vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 0]), vdup_n_s16(coeff[0]));
       vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 1]), vdup_n_s16(coeff[1]));
-      vsumb = vmlal_high_s16(vsumb, vsrc[0], vdupq_n_s16(coeff[0]));
-      vsumb = vmlal_high_s16(vsumb, vsrc[1], vdupq_n_s16(coeff[1]));
+      vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 0 ] ), vdup_n_s16( coeff[ 0 ] ) );
+      vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 1 ] ), vdup_n_s16( coeff[ 1 ] ) );
 
       vsrc[0] = vsrc[1];
     }
@@ -617,8 +616,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
     {
       vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 0]), vdup_n_s16(coeff[i + 0]));
       vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 1]), vdup_n_s16(coeff[i + 1]));
-      vsumb = vmlal_high_s16(vsumb, vsrc[i + 0], vdupq_n_s16(coeff[i + 0]));
-      vsumb = vmlal_high_s16(vsumb, vsrc[i + 1], vdupq_n_s16(coeff[i + 1]));
+      vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 0 ] ), vdup_n_s16( coeff[ i + 0 ] ) );
+      vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 1 ] ), vdup_n_s16( coeff[ i + 1 ] ) );
       vsrc[i    ] = vsrc[i + 1];
       vsrc[i + 1] = vsrc[i + 2];
     }
@@ -639,6 +638,7 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
   }
 }
 
+#if defined( TARGET_SIMD_X86 )
 template
 static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
 {
@@ -817,7 +817,7 @@ static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride,
     dst += dstStride;
   }
 }
-
+#endif // defined( TARGET_SIMD_X86 )
 
 template
 void InterpolationFilter::_initInterpolationFilterARM()
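The filter epilogue carried along in the kernels above leans on two idioms that are easy to misread: vshlq_s32 with a negative per-lane count acts as an arithmetic right shift, and vqmovn_s32 narrows each 32-bit lane to int16 with saturation before the optional clamp to the bit-depth range. A small sketch (helper name and driver are illustrative, not patch code):

// The shift-and-narrow idiom from the interpolation kernels.
#include <arm_neon.h>
#include <cstdio>

static int16x4_t shift_and_narrow( int32x4_t v, int shift )
{
  // A negative per-lane count turns vshlq_s32 into an arithmetic right shift.
  int32x4_t shifted = vshlq_s32( v, vdupq_n_s32( -1 * shift ) );
  // vqmovn_s32 narrows to int16, saturating at INT16_MIN / INT16_MAX.
  return vqmovn_s32( shifted );
}

int main()
{
  int32_t in[ 4 ] = { 1 << 12, -( 1 << 12 ), 1 << 30, 7 };
  int16_t out[ 4 ];
  vst1_s16( out, shift_and_narrow( vld1q_s32( in ), 6 ) );
  // { 4096 >> 6, -4096 >> 6, saturate( 2^30 >> 6 ), 7 >> 6 } = { 64, -64, 32767, 0 }
  printf( "%d %d %d %d\n", ( int )out[ 0 ], ( int )out[ 1 ], ( int )out[ 2 ], ( int )out[ 3 ] );
  return 0;
}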
@@ -833,6 +833,7 @@ void InterpolationFilter::_initInterpolationFilterARM()
 
   m_filterN2_2D = simdInterpolateN2_2D_neon;
 
+#if defined( TARGET_SIMD_X86 )
   m_filterHor[0][0][0] = simdFilterARM<8, false, false, false>;
   m_filterHor[0][0][1] = simdFilterARM<8, false, false, true>;
   m_filterHor[0][1][0] = simdFilterARM<8, false, true, false>;
@@ -862,8 +863,7 @@ void InterpolationFilter::_initInterpolationFilterARM()
   m_filterVer[3][0][1] = simdFilterARM<6, true, false, true>;
   m_filterVer[3][1][0] = simdFilterARM<6, true, true, false>;
   m_filterVer[3][1][1] = simdFilterARM<6, true, true, true>;
-
-
+#endif // defined( TARGET_SIMD_X86 )
 }
 
 } // namespace vvenc
diff --git a/source/Lib/CommonLib/arm/neon/sum_neon.h b/source/Lib/CommonLib/arm/neon/sum_neon.h
index d5923af9..49298911 100644
--- a/source/Lib/CommonLib/arm/neon/sum_neon.h
+++ b/source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -109,6 +109,17 @@ static inline int16x8_t pairwise_add_s16x8( const int16x8_t a, const int16x8_t b
 #endif
 }
 
+static inline int32x4_t pairwise_add_s32x4( const int32x4_t a, const int32x4_t b )
+{
+#if REAL_TARGET_AARCH64
+  return vpaddq_s32( a, b );
+#else
+  int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_low_s32( b ) );
+  int32x2_t hi = vpadd_s32( vget_high_s32( a ), vget_high_s32( b ) );
+  return vcombine_s32( lo, hi );
+#endif
+}
+
 } // namespace vvenc
 
 #endif
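The SAD kernels now reduce through horizontal_add_s32x4 instead of calling vaddvq_s32 directly. Its body is not shown in this patch; by analogy with pairwise_add_s32x4 it presumably wraps vaddvq_s32 on AArch64 and a vpadd_s32-based reduction elsewhere. The sketch below is that assumed shape with a scalar cross-check, not the actual sum_neon.h code.

// Plausible shape for a horizontal add over four int32 lanes.
#include <arm_neon.h>
#include <cstdio>

static inline int32_t horizontal_add_s32x4( int32x4_t v )
{
#if defined( __aarch64__ )
  return vaddvq_s32( v );                                             // single ADDV instruction
#else
  int32x2_t sum = vadd_s32( vget_low_s32( v ), vget_high_s32( v ) );  // { v0+v2, v1+v3 }
  sum = vpadd_s32( sum, sum );                                        // { v0+v1+v2+v3, ... }
  return vget_lane_s32( sum, 0 );
#endif
}

int main()
{
  int32_t in[ 4 ] = { 1, 20, 300, 4000 };
  printf( "%d\n", ( int )horizontal_add_s32x4( vld1q_s32( in ) ) );   // expected: 4321
  return 0;
}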