AArch32 and AArch64 minus SIMDe fixes #475

Merged
merged 2 commits into from
Nov 26, 2024
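
This change set replaces AArch64-only Neon intrinsics (vpaddq_s16/vpaddq_s32, vaddvq_s32, vmull_high_s16/vmlal_high_s16, vqtbl1q_u8) with helpers or two-instruction equivalents that also compile on AArch32, and fences the code paths that depend on the x86 translation headers behind TARGET_SIMD_X86, so that ARM builds without SIMDe still compile.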
4 changes: 2 additions & 2 deletions source/Lib/CommonLib/arm/InterPredARM.h
@@ -274,10 +274,10 @@ void InterPredInterpolation::_initInterPredictionARM()
xFpBiDirOptFlow = BiOptFlowCoreARMSIMD<vext>;
}

-#else
+#else

template<ARM_VEXT vext>
-void TCoeffOps::_initInterPredictionARM()
+void InterPredInterpolation::_initInterPredictionARM()
{}
#endif
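
Note: besides reformatting the #else, this hunk fixes a copy-paste slip in the non-SIMD fallback stub, which was defined on TCoeffOps instead of InterPredInterpolation and would likely fail to compile on that path.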

59 changes: 32 additions & 27 deletions source/Lib/CommonLib/arm/RdCostARM.h
@@ -69,15 +69,11 @@ POSSIBILITY OF SUCH DAMAGE.
namespace vvenc
{


-static int32x4_t neon_madd_16 (int16x8_t a, int16x8_t b) {
-
-int32x4_t sum = vdupq_n_s32(0);
-int32x4_t c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
-int32x4_t d = vmull_high_s16((a), (b));
-sum = vpaddq_s32(c,d);
-
-return sum;
+static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
+{
+int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
+int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) );
+return pairwise_add_s32x4( c, d );
}
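
Note: vmull_high_s16 and vpaddq_s32 are AArch64-only, which is why the rewrite goes through vget_high_s16 + vmull_s16 and the new pairwise_add_s32x4 helper from sum_neon.h. As a minimal scalar sketch (the reference function is illustrative, not part of the patch), the helper computes the same result as x86's pmaddwd:

    #include <stdint.h>

    /* Reference model of neon_madd_16: out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1]. */
    static void neon_madd_16_ref( const int16_t a[ 8 ], const int16_t b[ 8 ], int32_t out[ 4 ] )
    {
      for( int i = 0; i < 4; i++ )
      {
        out[ i ] = ( int32_t )a[ 2 * i ] * b[ 2 * i ] + ( int32_t )a[ 2 * i + 1 ] * b[ 2 * i + 1 ];
      }
    }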

#if defined( TARGET_SIMD_ARM )
@@ -1029,11 +1025,13 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
{
int16x8_t vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc1 ), vld1_s16( ( const int16_t* )( &pSrc1[iStrideSrc1] ) ) );
int16x8_t vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc2 ), vld1_s16( ( const int16_t* )( &pSrc2[iStrideSrc2] ) ) );
-int32x4_t vsum = vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 )) );
+int32x4_t vsum =
+vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) );
vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc1[2 * iStrideSrc1] ) ), vld1_s16( ( const int16_t* )( &pSrc1[3 * iStrideSrc1] ) ) );
vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc2[2 * iStrideSrc2] ) ), vld1_s16( ( const int16_t* )( &pSrc2[3 * iStrideSrc2] ) ) );
-vsum = vaddq_s32( vsum, vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ));
-uiSum = vaddvq_s32(vsum);
+vsum = vaddq_s32(
+vsum, vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ) );
+uiSum = horizontal_add_s32x4( vsum );
}
else
{
Expand All @@ -1047,7 +1045,7 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
pSrc1 += iStrideSrc1;
pSrc2 += iStrideSrc2;
}
-uiSum = vaddvq_s32(vsum32);
+uiSum = horizontal_add_s32x4( vsum32 );
}
}
else
@@ -1081,8 +1079,8 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
}

int32x4_t vsumtemp = vpaddlq_s16( vsum16);
-if( earlyExitAllowed ) vsum32 = vpaddq_s32( vsum32, vsumtemp );
-
+if( earlyExitAllowed ) vsum32 = pairwise_add_s32x4( vsum32, vsumtemp );
else vsum32 = vaddq_s32 ( vsum32, vsumtemp );

pSrc1 += iStrideSrc1;
@@ -1101,16 +1099,28 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
checkExit--;
}
}
-uiSum = vaddvq_s32(vsum32);
+uiSum = horizontal_add_s32x4( vsum32 );
}

uiSum <<= iSubShift;
return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
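
Note: vaddvq_s32 (horizontal add of all four lanes to a scalar) is also AArch64-only; horizontal_add_s32x4 and pairwise_add_s16x8 are the portable wrappers from sum_neon.h. A minimal sketch of what horizontal_add_s32x4 is assumed to do on each target:

    #include <arm_neon.h>

    static inline int32_t horizontal_add_s32x4_sketch( int32x4_t v )
    {
    #if REAL_TARGET_AARCH64
      return vaddvq_s32( v );
    #else
      int32x2_t sum = vadd_s32( vget_low_s32( v ), vget_high_s32( v ) );
      return vget_lane_s32( vpadd_s32( sum, sum ), 0 );
    #endif
    }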

-template<ARM_VEXT vext>
-Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
-{
+static inline int16x8_t reverse_vector_s16( int16x8_t x )
+{
+#if REAL_TARGET_AARCH64
+static const uint8_t shuffle_table[ 16 ] = { 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 };
+uint8x16_t shuffle_indices = vld1q_u8( shuffle_table );
+return vreinterpretq_s16_u8( vqtbl1q_u8( vreinterpretq_u8_s16( x ), shuffle_indices ) );
+#else
+int16x8_t rev_halves = vrev64q_s16( x );
+return vcombine_s16( vget_high_s16( rev_halves ), vget_low_s16( rev_halves ) );
+#endif
+}
+
+template<ARM_VEXT vext>
+Distortion RdCost::xGetSADwMask_ARMSIMD( const DistParam& rcDtParam )
+{
if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
return RdCost::xGetSADwMask(rcDtParam);

@@ -1128,8 +1138,6 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
Distortion sum = 0;

int32x4_t vsum32 = vdupq_n_s32( 0 );
-static const uint8_t shuffle_table[16] = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
-uint8x16_t shuffle_vector = vld1q_u8(shuffle_table);

for (int y = 0; y < rows; y += subStep)
{
@@ -1140,10 +1148,8 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
int16x8_t vmask;
if (rcDtParam.stepX == -1)
{
-vmask = vld1q_s16( ( const int16_t* ) ((&weightMask[x]) - (x << 1) - (8 - 1)));
-uint8x16_t input_vector = vreinterpretq_u8_s16(vmask);
-uint8x16_t shuffled_vector = vqtbl1q_u8(input_vector, shuffle_vector);
-vmask = vreinterpretq_s16_u8(shuffled_vector);
+vmask = vld1q_s16( ( const int16_t* )( ( &weightMask[ x ] ) - ( x << 1 ) - ( 8 - 1 ) ) );
+vmask = reverse_vector_s16( vmask );
}
else
{
@@ -1155,12 +1161,11 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
src2 += strideSrc2;
weightMask += strideMask;
}
-sum = vaddvq_s32(vsum32);
+sum = horizontal_add_s32x4( vsum32 );
sum <<= subShift;
return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
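
Note: reverse_vector_s16 reverses the eight 16-bit lanes of a vector. vqtbl1q_u8 is AArch64-only, so the AArch32 path combines vrev64q_s16 (reverse within each 64-bit half) with a swap of the two halves. The reversal is needed because with stepX == -1 the weight mask is walked backwards: the code loads eight elements ending at the current position and then flips the lane order. A scalar sketch of that addressing (illustrative only):

    #include <stdint.h>

    /* ( &weightMask[ x ] ) - ( x << 1 ) - ( 8 - 1 ) simplifies to weightMask - x - 7. */
    static void load_mask_reversed_ref( const int16_t* weightMask, int x, int16_t out[ 8 ] )
    {
      const int16_t* p = weightMask - x - 7;              /* forward 8-element window */
      for( int i = 0; i < 8; i++ ) out[ i ] = p[ 7 - i ]; /* lanes come out reversed  */
    }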


template<ARM_VEXT vext>
void RdCost::_initRdCostARM()
{
108 changes: 54 additions & 54 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -56,17 +56,17 @@ POSSIBILITY OF SUCH DAMAGE.
//! \ingroup CommonLib
//! \{

-#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_AVX2
-# define USE_AVX2
-#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE42
-# define USE_SSE42
-#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE41
-# define USE_SSE41
+#if defined( TARGET_SIMD_X86 )
+#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_AVX2
+#define USE_AVX2
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE42
+#define USE_SSE42
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE41
+#define USE_SSE41
#endif

-#ifdef TARGET_SIMD_X86
# include "../x86/InterpolationFilterX86.h"
-#endif
+#endif // defined( TARGET_SIMD_X86 )
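
Note: the USE_AVX2/USE_SSE42/USE_SSE41 selection and InterpolationFilterX86.h are only meaningful when x86 intrinsics are available (natively or through SIMDe), so they are now guarded by TARGET_SIMD_X86; the same guard is applied to simdFilterARM and its function-table entries further down. Schematically (a sketch of the assumed build configuration, not code from the patch):

    #if defined( TARGET_SIMD_X86 )  /* SIMDe provides x86 intrinsics on ARM */
      /* x86-derived kernels such as simdFilterARM are compiled and wired up */
    #else
      /* only the native Neon kernels are compiled */
    #endif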

#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF

@@ -528,49 +528,48 @@ static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_
vsrc0 = vld1q_s16( ( const int16_t * )&src[col] );
vsrc1 = vld1q_s16( ( const int16_t * )&src[col + 4] );

-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[0]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[1]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[2]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[3]));
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );

-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[0]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[1]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[2]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[3]));
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );

-if( N == 8 )
-{
-vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[6]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[7]));
-
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[6]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[7]));
-}
-if( N == 6 )
-{
-vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
-vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
-
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
-vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
-}
+if( N == 8 )
+{
+vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );
+
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );
+}
+if( N == 6 )
+{
+vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
+vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
+}

-vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
-vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
-vsum = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));
-
-if( shiftBack )
-{
-vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
-}
-vst1q_s16((int16_t*) &dst[col], vsum);
+vsuma = vshlq_s32( vsuma, vdupq_n_s32( -1 * shift ) );
+vsumb = vshlq_s32( vsumb, vdupq_n_s32( -1 * shift ) );
+vsum = vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );
+
+if( shiftBack )
+{
+vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
+}
+vst1q_s16( ( int16_t* )&dst[ col ], vsum );
}
src += srcStride;
dst += dstStride;
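
Note: vshlq_s32 with a negative per-lane count is an arithmetic right shift, so vshlq_s32( v, vdupq_n_s32( -1 * shift ) ) behaves like vshrq_n_s32( v, shift ) while accepting a run-time shift amount (the _n_ form requires a compile-time immediate); vqmovn_s32 then narrows each lane to 16 bits with saturation before the two halves are recombined.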
@@ -606,8 +605,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
{
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 0]), vdup_n_s16(coeff[0]));
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 1]), vdup_n_s16(coeff[1]));
-vsumb = vmlal_high_s16(vsumb, vsrc[0], vdupq_n_s16(coeff[0]));
-vsumb = vmlal_high_s16(vsumb, vsrc[1], vdupq_n_s16(coeff[1]));
+vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 0 ] ), vdup_n_s16( coeff[ 0 ] ) );
+vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 1 ] ), vdup_n_s16( coeff[ 1 ] ) );

vsrc[0] = vsrc[1];
}
@@ -617,8 +616,8 @@
{
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 0]), vdup_n_s16(coeff[i + 0]));
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 1]), vdup_n_s16(coeff[i + 1]));
-vsumb = vmlal_high_s16(vsumb, vsrc[i + 0], vdupq_n_s16(coeff[i + 0]));
-vsumb = vmlal_high_s16(vsumb, vsrc[i + 1], vdupq_n_s16(coeff[i + 1]));
+vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 0 ] ), vdup_n_s16( coeff[ i + 0 ] ) );
+vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 1 ] ), vdup_n_s16( coeff[ i + 1 ] ) );
vsrc[i ] = vsrc[i + 1];
vsrc[i + 1] = vsrc[i + 2];
}
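
Note: the vertical filter rewrite relies on the identity below, of which only the right-hand side compiles on AArch32:

    /* vmlal_high_s16( acc, a, vdupq_n_s16( c ) )
       == vmlal_s16( acc, vget_high_s16( a ), vdup_n_s16( c ) ),
       i.e. acc[ i ] += (int32_t)a[ 4 + i ] * c for i = 0..3. */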
@@ -639,6 +638,7 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
}
}

+#if defined( TARGET_SIMD_X86 )
template<int N, bool isVertical, bool isFirst, bool isLast>
static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
{
@@ -817,7 +817,7 @@ static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride,
dst += dstStride;
}
}

+#endif // defined( TARGET_SIMD_X86 )

template<>
void InterpolationFilter::_initInterpolationFilterARM<NEON>()
@@ -833,6 +833,7 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()

m_filterN2_2D = simdInterpolateN2_2D_neon;

+#if defined( TARGET_SIMD_X86 )
m_filterHor[0][0][0] = simdFilterARM<8, false, false, false>;
m_filterHor[0][0][1] = simdFilterARM<8, false, false, true>;
m_filterHor[0][1][0] = simdFilterARM<8, false, true, false>;
@@ -862,8 +863,7 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
m_filterVer[3][0][1] = simdFilterARM<6, true, false, true>;
m_filterVer[3][1][0] = simdFilterARM<6, true, true, false>;
m_filterVer[3][1][1] = simdFilterARM<6, true, true, true>;


+#endif // defined( TARGET_SIMD_X86 )
}

} // namespace vvenc
11 changes: 11 additions & 0 deletions source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -109,6 +109,17 @@ static inline int16x8_t pairwise_add_s16x8( const int16x8_t a, const int16x8_t b
#endif
}

+static inline int32x4_t pairwise_add_s32x4( const int32x4_t a, const int32x4_t b )
+{
+#if REAL_TARGET_AARCH64
+return vpaddq_s32( a, b );
+#else
+int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_high_s32( a ) );
+int32x2_t hi = vpadd_s32( vget_low_s32( b ), vget_high_s32( b ) );
+return vcombine_s32( lo, hi );
+#endif
+}

} // namespace vvenc

#endif
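
Note: a scalar model of the new pairwise_add_s32x4 helper; both branches produce { a0+a1, a2+a3, b0+b1, b2+b3 }, matching AArch64's vpaddq_s32 (illustrative reference, not part of the patch):

    #include <stdint.h>

    static void pairwise_add_s32x4_ref( const int32_t a[ 4 ], const int32_t b[ 4 ], int32_t out[ 4 ] )
    {
      out[ 0 ] = a[ 0 ] + a[ 1 ];
      out[ 1 ] = a[ 2 ] + a[ 3 ];
      out[ 2 ] = b[ 0 ] + b[ 1 ];
      out[ 3 ] = b[ 2 ] + b[ 3 ];
    }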