AArch32 and AArch64 minus SIMDe fixes (#475)
* Re-enable compiling without SIMDe enabled on AArch64

Guard uses of the new `simdFilterARM` kernel by checking that
`TARGET_SIMD_X86` is defined; otherwise the SIMD Everywhere (SIMDe) based
kernel is not available and fails to compile.
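
As a quick illustration, here is a minimal self-contained sketch of the
guard pattern (hypothetical names, not the vvenc API; the real change is
in InterpolationFilter_neon.cpp below):

    // Both the optional kernel and the code that registers it sit behind
    // the same #if, so builds without TARGET_SIMD_X86 still compile and link.
    #include <cstdio>

    static void neonKernel()  { std::puts( "neon kernel" ); }
    #if defined( TARGET_SIMD_X86 )
    static void simdeKernel() { std::puts( "simde kernel" ); }  // needs the x86 translation layer
    #endif

    static void ( *g_kernel )() = nullptr;

    static void initKernels()
    {
      g_kernel = neonKernel;   // always available
    #if defined( TARGET_SIMD_X86 )
      g_kernel = simdeKernel;  // registered only when it can actually be built
    #endif
    }

    int main()
    {
      initKernels();
      g_kernel();
      return 0;
    }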

* Various AArch32 fixes

Since 6a5cfa8 the AArch32 build has failed because it uses `*_high_*`
and other intrinsics that are only available on AArch64. Switch these to
the portable helpers already defined in `sum_neon.h` (a brief sketch of
the rewrite follows at the end of this message).

Additionally fix a typo in InterPredARM.h.

Also adjust code style to match the prevailing style elsewhere in the
library.
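
For illustration, a minimal sketch of the rewrite applied throughout
(standard NEON intrinsics only; mlal_high_portable is a hypothetical name
used here for exposition, not a function from the library):

    #include <arm_neon.h>

    // AArch64-only form (does not compile for AArch32 targets):
    //   sum = vmlal_high_s16( sum, a, b );
    // Portable form, valid on both AArch32 and AArch64:
    static inline int32x4_t mlal_high_portable( int32x4_t sum, int16x8_t a, int16x8_t b )
    {
      return vmlal_s16( sum, vget_high_s16( a ), vget_high_s16( b ) );
    }

    // Likewise vmull_high_s16( a, b ) becomes
    // vmull_s16( vget_high_s16( a ), vget_high_s16( b ) ), while vpaddq_s32 and
    // vaddvq_s32 go through the pairwise_add_s32x4 and horizontal_add_s32x4
    // helpers from sum_neon.h, as the diff below shows.
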
georges-arm authored Nov 26, 2024
1 parent 6a5cfa8 commit cddf62d
Showing 4 changed files with 99 additions and 83 deletions.
4 changes: 2 additions & 2 deletions source/Lib/CommonLib/arm/InterPredARM.h
@@ -274,10 +274,10 @@ void InterPredInterpolation::_initInterPredictionARM()
xFpBiDirOptFlow = BiOptFlowCoreARMSIMD<vext>;
}

#else
#else

template<ARM_VEXT vext>
void TCoeffOps::_initInterPredictionARM()
void InterPredInterpolation::_initInterPredictionARM()
{}
#endif

59 changes: 32 additions & 27 deletions source/Lib/CommonLib/arm/RdCostARM.h
@@ -69,15 +69,11 @@ POSSIBILITY OF SUCH DAMAGE.
namespace vvenc
{


static int32x4_t neon_madd_16 (int16x8_t a, int16x8_t b) {

int32x4_t sum = vdupq_n_s32(0);
int32x4_t c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
int32x4_t d = vmull_high_s16((a), (b));
sum = vpaddq_s32(c,d);

return sum;
static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
{
int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) );
return pairwise_add_s32x4( c, d );
}

#if defined( TARGET_SIMD_ARM )
@@ -1029,11 +1025,13 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
{
int16x8_t vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc1 ), vld1_s16( ( const int16_t* )( &pSrc1[iStrideSrc1] ) ) );
int16x8_t vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc2 ), vld1_s16( ( const int16_t* )( &pSrc2[iStrideSrc2] ) ) );
int32x4_t vsum = vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 )) );
int32x4_t vsum =
vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) );
vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc1[2 * iStrideSrc1] ) ), vld1_s16( ( const int16_t* )( &pSrc1[3 * iStrideSrc1] ) ) );
vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc2[2 * iStrideSrc2] ) ), vld1_s16( ( const int16_t* )( &pSrc2[3 * iStrideSrc2] ) ) );
vsum = vaddq_s32( vsum, vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ));
uiSum = vaddvq_s32(vsum);
vsum = vaddq_s32(
vsum, vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ) );
uiSum = horizontal_add_s32x4( vsum );
}
else
{
@@ -1047,7 +1045,7 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
pSrc1 += iStrideSrc1;
pSrc2 += iStrideSrc2;
}
uiSum = vaddvq_s32(vsum32);
uiSum = horizontal_add_s32x4( vsum32 );
}
}
else
@@ -1081,8 +1079,8 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
}

int32x4_t vsumtemp = vpaddlq_s16( vsum16);
if( earlyExitAllowed ) vsum32 = vpaddq_s32( vsum32, vsumtemp );

if( earlyExitAllowed ) vsum32 = pairwise_add_s32x4( vsum32, vsumtemp );
else vsum32 = vaddq_s32 ( vsum32, vsumtemp );

pSrc1 += iStrideSrc1;
@@ -1101,16 +1099,28 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
checkExit--;
}
}
uiSum = vaddvq_s32(vsum32);
uiSum = horizontal_add_s32x4( vsum32 );
}

uiSum <<= iSubShift;
return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}

template<ARM_VEXT vext>
Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
{
static inline int16x8_t reverse_vector_s16( int16x8_t x )
{
#if REAL_TARGET_AARCH64
static const uint8_t shuffle_table[ 16 ] = { 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 };
uint8x16_t shuffle_indices = vld1q_u8( shuffle_table );
return vreinterpretq_s16_u8( vqtbl1q_u8( vreinterpretq_u8_s16( x ), shuffle_indices ) );
#else
int16x8_t rev_halves = vrev64q_s16( x );
return vcombine_s16( vget_high_s16( rev_halves ), vget_low_s16( rev_halves ) );
#endif
}

template<ARM_VEXT vext>
Distortion RdCost::xGetSADwMask_ARMSIMD( const DistParam& rcDtParam )
{
if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
return RdCost::xGetSADwMask(rcDtParam);

@@ -1128,8 +1138,6 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
Distortion sum = 0;

int32x4_t vsum32 = vdupq_n_s32( 0 );
static const uint8_t shuffle_table[16] = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
uint8x16_t shuffle_vector = vld1q_u8(shuffle_table);

for (int y = 0; y < rows; y += subStep)
{
@@ -1140,10 +1148,8 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
int16x8_t vmask;
if (rcDtParam.stepX == -1)
{
vmask = vld1q_s16( ( const int16_t* ) ((&weightMask[x]) - (x << 1) - (8 - 1)));
uint8x16_t input_vector = vreinterpretq_u8_s16(vmask);
uint8x16_t shuffled_vector = vqtbl1q_u8(input_vector, shuffle_vector);
vmask = vreinterpretq_s16_u8(shuffled_vector);
vmask = vld1q_s16( ( const int16_t* )( ( &weightMask[ x ] ) - ( x << 1 ) - ( 8 - 1 ) ) );
vmask = reverse_vector_s16( vmask );
}
else
{
@@ -1155,12 +1161,11 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
src2 += strideSrc2;
weightMask += strideMask;
}
sum = vaddvq_s32(vsum32);
sum = horizontal_add_s32x4( vsum32 );
sum <<= subShift;
return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}


template<ARM_VEXT vext>
void RdCost::_initRdCostARM()
{
108 changes: 54 additions & 54 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -56,17 +56,17 @@ POSSIBILITY OF SUCH DAMAGE.
//! \ingroup CommonLib
//! \{

#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_AVX2
# define USE_AVX2
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE42
# define USE_SSE42
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE41
# define USE_SSE41
#if defined( TARGET_SIMD_X86 )
#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_AVX2
#define USE_AVX2
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE42
#define USE_SSE42
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE41
#define USE_SSE41
#endif

#ifdef TARGET_SIMD_X86
# include "../x86/InterpolationFilterX86.h"
#endif
#endif // defined( TARGET_SIMD_X86 )

#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF

@@ -528,49 +528,48 @@ static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_
vsrc0 = vld1q_s16( ( const int16_t * )&src[col] );
vsrc1 = vld1q_s16( ( const int16_t * )&src[col + 4] );

vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[0]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[1]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[2]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[3]));
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );

vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[0]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[1]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[2]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[3]));
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );

if( N == 8 )
{
vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );

vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );
}
if( N == 6 )
{
vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );

if( N == 8 )
{
vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[6]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[7]));

vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[6]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[7]));
}
if( N == 6 )
{
vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
}

vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
}

vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
vsum = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));

if( shiftBack )
{
vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
}
vst1q_s16((int16_t*) &dst[col], vsum);
vsuma = vshlq_s32( vsuma, vdupq_n_s32( -1 * shift ) );
vsumb = vshlq_s32( vsumb, vdupq_n_s32( -1 * shift ) );
vsum = vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );

if( shiftBack )
{
vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
}
vst1q_s16( ( int16_t* )&dst[ col ], vsum );
}
src += srcStride;
dst += dstStride;
@@ -606,8 +605,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
{
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 0]), vdup_n_s16(coeff[0]));
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 1]), vdup_n_s16(coeff[1]));
vsumb = vmlal_high_s16(vsumb, vsrc[0], vdupq_n_s16(coeff[0]));
vsumb = vmlal_high_s16(vsumb, vsrc[1], vdupq_n_s16(coeff[1]));
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 0 ] ), vdup_n_s16( coeff[ 0 ] ) );
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 1 ] ), vdup_n_s16( coeff[ 1 ] ) );

vsrc[0] = vsrc[1];
}
@@ -617,8 +616,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
{
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 0]), vdup_n_s16(coeff[i + 0]));
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 1]), vdup_n_s16(coeff[i + 1]));
vsumb = vmlal_high_s16(vsumb, vsrc[i + 0], vdupq_n_s16(coeff[i + 0]));
vsumb = vmlal_high_s16(vsumb, vsrc[i + 1], vdupq_n_s16(coeff[i + 1]));
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 0 ] ), vdup_n_s16( coeff[ i + 0 ] ) );
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 1 ] ), vdup_n_s16( coeff[ i + 1 ] ) );
vsrc[i ] = vsrc[i + 1];
vsrc[i + 1] = vsrc[i + 2];
}
@@ -639,6 +638,7 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
}
}

#if defined( TARGET_SIMD_X86 )
template<int N, bool isVertical, bool isFirst, bool isLast>
static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
{
@@ -817,7 +817,7 @@ static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride,
dst += dstStride;
}
}

#endif // defined( TARGET_SIMD_X86 )

template<>
void InterpolationFilter::_initInterpolationFilterARM<NEON>()
@@ -833,6 +833,7 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()

m_filterN2_2D = simdInterpolateN2_2D_neon;

#if defined( TARGET_SIMD_X86 )
m_filterHor[0][0][0] = simdFilterARM<8, false, false, false>;
m_filterHor[0][0][1] = simdFilterARM<8, false, false, true>;
m_filterHor[0][1][0] = simdFilterARM<8, false, true, false>;
@@ -862,8 +863,7 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
m_filterVer[3][0][1] = simdFilterARM<6, true, false, true>;
m_filterVer[3][1][0] = simdFilterARM<6, true, true, false>;
m_filterVer[3][1][1] = simdFilterARM<6, true, true, true>;


#endif // defined( TARGET_SIMD_X86 )
}

} // namespace vvenc
11 changes: 11 additions & 0 deletions source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -109,6 +109,17 @@ static inline int16x8_t pairwise_add_s16x8( const int16x8_t a, const int16x8_t b
#endif
}

static inline int32x4_t pairwise_add_s32x4( const int32x4_t a, const int32x4_t b )
{
#if REAL_TARGET_AARCH64
return vpaddq_s32( a, b );
#else
int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_low_s32( b ) );
int32x2_t hi = vpadd_s32( vget_high_s32( a ), vget_high_s32( b ) );
return vcombine_s32( lo, hi );
#endif
}

} // namespace vvenc

#endif
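
For reference, a minimal standalone sketch (not part of the commit) showing
the new pairwise_add_s32x4 fallback feeding neon_madd_16 from RdCostARM.h.
Only the non-AArch64 branch is reproduced, since REAL_TARGET_AARCH64 is a
project-internal macro, and the small main() is just a scalar cross-check:

    #include <arm_neon.h>
    #include <stdio.h>

    // Portable pairwise add: the #else branch added to sum_neon.h above.
    static inline int32x4_t pairwise_add_s32x4( const int32x4_t a, const int32x4_t b )
    {
      int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_low_s32( b ) );
      int32x2_t hi = vpadd_s32( vget_high_s32( a ), vget_high_s32( b ) );
      return vcombine_s32( lo, hi );
    }

    // Widening multiply with pairwise accumulation, as rewritten in RdCostARM.h.
    static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
    {
      int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
      int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) );
      return pairwise_add_s32x4( c, d );
    }

    int main( void )
    {
      const int16_t a[ 8 ] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      const int16_t b[ 8 ] = { 8, 7, 6, 5, 4, 3, 2, 1 };
      int32x4_t r = neon_madd_16( vld1q_s16( a ), vld1q_s16( b ) );
      int total   = vgetq_lane_s32( r, 0 ) + vgetq_lane_s32( r, 1 )
                  + vgetq_lane_s32( r, 2 ) + vgetq_lane_s32( r, 3 );
      printf( "dot product = %d (expected 120)\n", total );  // 1*8 + 2*7 + ... + 8*1
      return 0;
    }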
