AArch32 and AArch64 minus SIMDe fixes (#475)
* Re-enable compiling without SIMDe enabled on AArch64

Guard uses of the new `simdFilterARM` kernel by checking that
`TARGET_SIMD_X86` is defined; otherwise the SIMD Everywhere (SIMDe) based
kernel is not available and fails to compile.
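
As a quick illustration, here is a minimal self-contained sketch of the
guard pattern (hypothetical names, not the vvenc API; the real change is
in InterpolationFilter_neon.cpp below):

    // Both the optional kernel and the code that registers it sit behind
    // the same #if, so builds without TARGET_SIMD_X86 still compile and link.
    #include <cstdio>

    static void neonKernel()  { std::puts( "neon kernel" ); }
    #if defined( TARGET_SIMD_X86 )
    static void simdeKernel() { std::puts( "simde kernel" ); }  // needs the x86 translation layer
    #endif

    static void ( *g_kernel )() = nullptr;

    static void initKernels()
    {
      g_kernel = neonKernel;   // always available
    #if defined( TARGET_SIMD_X86 )
      g_kernel = simdeKernel;  // registered only when it can actually be built
    #endif
    }

    int main()
    {
      initKernels();
      g_kernel();
      return 0;
    }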

* Various AArch32 fixes

Since 6a5cfa8 the AArch32 build has failed because it uses `*_high_*`
and other intrinsics that are only available on AArch64. Switch these to
the portable helpers already defined in `sum_neon.h` (a brief sketch of
the rewrite follows at the end of this message).

Additionally fix a typo in InterPredARM.h.

Also adjust code style to match the prevailing style elsewhere in the
library.
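
For illustration, a minimal sketch of the rewrite applied throughout
(standard NEON intrinsics only; mlal_high_portable is a hypothetical name
used here for exposition, not a function from the library):

    #include <arm_neon.h>

    // AArch64-only form (does not compile for AArch32 targets):
    //   sum = vmlal_high_s16( sum, a, b );
    // Portable form, valid on both AArch32 and AArch64:
    static inline int32x4_t mlal_high_portable( int32x4_t sum, int16x8_t a, int16x8_t b )
    {
      return vmlal_s16( sum, vget_high_s16( a ), vget_high_s16( b ) );
    }

    // Likewise vmull_high_s16( a, b ) becomes
    // vmull_s16( vget_high_s16( a ), vget_high_s16( b ) ), while vpaddq_s32 and
    // vaddvq_s32 go through the pairwise_add_s32x4 and horizontal_add_s32x4
    // helpers from sum_neon.h, as the diff below shows.
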
georges-arm authored Nov 26, 2024
1 parent 6a5cfa8 commit cddf62d
Showing 4 changed files with 99 additions and 83 deletions.
4 changes: 2 additions & 2 deletions source/Lib/CommonLib/arm/InterPredARM.h
@@ -274,10 +274,10 @@ void InterPredInterpolation::_initInterPredictionARM()
xFpBiDirOptFlow = BiOptFlowCoreARMSIMD<vext>;
}

#else
#else

template<ARM_VEXT vext>
void TCoeffOps::_initInterPredictionARM()
void InterPredInterpolation::_initInterPredictionARM()
{}
#endif

59 changes: 32 additions & 27 deletions source/Lib/CommonLib/arm/RdCostARM.h
@@ -69,15 +69,11 @@ POSSIBILITY OF SUCH DAMAGE.
namespace vvenc
{


static int32x4_t neon_madd_16 (int16x8_t a, int16x8_t b) {

int32x4_t sum = vdupq_n_s32(0);
int32x4_t c = vmull_s16(vget_low_s16(a), vget_low_s16(b));
int32x4_t d = vmull_high_s16((a), (b));
sum = vpaddq_s32(c,d);

return sum;
static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
{
int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) );
return pairwise_add_s32x4( c, d );
}

#if defined( TARGET_SIMD_ARM )
@@ -1029,11 +1025,13 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
{
int16x8_t vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc1 ), vld1_s16( ( const int16_t* )( &pSrc1[iStrideSrc1] ) ) );
int16x8_t vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc2 ), vld1_s16( ( const int16_t* )( &pSrc2[iStrideSrc2] ) ) );
int32x4_t vsum = vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 )) );
int32x4_t vsum =
vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) );
vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc1[2 * iStrideSrc1] ) ), vld1_s16( ( const int16_t* )( &pSrc1[3 * iStrideSrc1] ) ) );
vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc2[2 * iStrideSrc2] ) ), vld1_s16( ( const int16_t* )( &pSrc2[3 * iStrideSrc2] ) ) );
vsum = vaddq_s32( vsum, vmovl_s16(vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ));
uiSum = vaddvq_s32(vsum);
vsum = vaddq_s32(
vsum, vmovl_s16( vget_low_s16( pairwise_add_s16x8( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ) );
uiSum = horizontal_add_s32x4( vsum );
}
else
{
@@ -1047,7 +1045,7 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
pSrc1 += iStrideSrc1;
pSrc2 += iStrideSrc2;
}
uiSum = vaddvq_s32(vsum32);
uiSum = horizontal_add_s32x4( vsum32 );
}
}
else
@@ -1081,8 +1079,8 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
}

int32x4_t vsumtemp = vpaddlq_s16( vsum16);
if( earlyExitAllowed ) vsum32 = vpaddq_s32( vsum32, vsumtemp );

if( earlyExitAllowed ) vsum32 = pairwise_add_s32x4( vsum32, vsumtemp );
else vsum32 = vaddq_s32 ( vsum32, vsumtemp );

pSrc1 += iStrideSrc1;
@@ -1101,16 +1099,28 @@ Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
checkExit--;
}
}
uiSum = vaddvq_s32(vsum32);
uiSum = horizontal_add_s32x4( vsum32 );
}

uiSum <<= iSubShift;
return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}

template<ARM_VEXT vext>
Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
{
static inline int16x8_t reverse_vector_s16( int16x8_t x )
{
#if REAL_TARGET_AARCH64
static const uint8_t shuffle_table[ 16 ] = { 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 };
uint8x16_t shuffle_indices = vld1q_u8( shuffle_table );
return vreinterpretq_s16_u8( vqtbl1q_u8( vreinterpretq_u8_s16( x ), shuffle_indices ) );
#else
int16x8_t rev_halves = vrev64q_s16( x );
return vcombine_s16( vget_high_s16( rev_halves ), vget_low_s16( rev_halves ) );
#endif
}

template<ARM_VEXT vext>
Distortion RdCost::xGetSADwMask_ARMSIMD( const DistParam& rcDtParam )
{
if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
return RdCost::xGetSADwMask(rcDtParam);

@@ -1128,8 +1138,6 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
Distortion sum = 0;

int32x4_t vsum32 = vdupq_n_s32( 0 );
static const uint8_t shuffle_table[16] = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
uint8x16_t shuffle_vector = vld1q_u8(shuffle_table);

for (int y = 0; y < rows; y += subStep)
{
@@ -1140,10 +1148,8 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
int16x8_t vmask;
if (rcDtParam.stepX == -1)
{
vmask = vld1q_s16( ( const int16_t* ) ((&weightMask[x]) - (x << 1) - (8 - 1)));
uint8x16_t input_vector = vreinterpretq_u8_s16(vmask);
uint8x16_t shuffled_vector = vqtbl1q_u8(input_vector, shuffle_vector);
vmask = vreinterpretq_s16_u8(shuffled_vector);
vmask = vld1q_s16( ( const int16_t* )( ( &weightMask[ x ] ) - ( x << 1 ) - ( 8 - 1 ) ) );
vmask = reverse_vector_s16( vmask );
}
else
{
@@ -1155,12 +1161,11 @@ Distortion RdCost::xGetSADwMask_ARMSIMD(const DistParam &rcDtParam)
src2 += strideSrc2;
weightMask += strideMask;
}
sum = vaddvq_s32(vsum32);
sum = horizontal_add_s32x4( vsum32 );
sum <<= subShift;
return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}


template<ARM_VEXT vext>
void RdCost::_initRdCostARM()
{
108 changes: 54 additions & 54 deletions source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -56,17 +56,17 @@ POSSIBILITY OF SUCH DAMAGE.
//! \ingroup CommonLib
//! \{

#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_AVX2
# define USE_AVX2
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE42
# define USE_SSE42
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE41
# define USE_SSE41
#if defined( TARGET_SIMD_X86 )
#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_AVX2
#define USE_AVX2
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE42
#define USE_SSE42
#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID == X86_SIMD_SSE41
#define USE_SSE41
#endif

#ifdef TARGET_SIMD_X86
# include "../x86/InterpolationFilterX86.h"
#endif
#endif // defined( TARGET_SIMD_X86 )

#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF

@@ -528,49 +528,48 @@ static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_
vsrc0 = vld1q_s16( ( const int16_t * )&src[col] );
vsrc1 = vld1q_s16( ( const int16_t * )&src[col + 4] );

vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[0]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[1]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[2]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[3]));
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );

vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[0]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[1]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[2]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[3]));
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 0 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 1 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 2 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 3 ] ) );

if( N == 8 )
{
vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );

vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 2 ) ), vdup_n_s16( coeff[ 6 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 3 ) ), vdup_n_s16( coeff[ 7 ] ) );
}
if( N == 6 )
{
vsrc0 = vld1q_s16( ( const int16_t* )&src[ col + 8 ] );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsuma = vmlal_s16( vsuma, vget_low_s16( vextq_s16( vsrc1, vsrc1, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );

if( N == 8 )
{
vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[6]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[7]));

vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[6]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[7]));
}
if( N == 6 )
{
vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 0 ) ), vdup_n_s16( coeff[ 4 ] ) );
vsumb = vmlal_s16( vsumb, vget_low_s16( vextq_s16( vsrc0, vsrc0, 1 ) ), vdup_n_s16( coeff[ 5 ] ) );
}

vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
}

vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
vsum = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));

if( shiftBack )
{
vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
}
vst1q_s16((int16_t*) &dst[col], vsum);
vsuma = vshlq_s32( vsuma, vdupq_n_s32( -1 * shift ) );
vsumb = vshlq_s32( vsumb, vdupq_n_s32( -1 * shift ) );
vsum = vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );

if( shiftBack )
{
vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
}
vst1q_s16( ( int16_t* )&dst[ col ], vsum );
}
src += srcStride;
dst += dstStride;
@@ -606,8 +605,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
{
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 0]), vdup_n_s16(coeff[0]));
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[ 1]), vdup_n_s16(coeff[1]));
vsumb = vmlal_high_s16(vsumb, vsrc[0], vdupq_n_s16(coeff[0]));
vsumb = vmlal_high_s16(vsumb, vsrc[1], vdupq_n_s16(coeff[1]));
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 0 ] ), vdup_n_s16( coeff[ 0 ] ) );
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ 1 ] ), vdup_n_s16( coeff[ 1 ] ) );

vsrc[0] = vsrc[1];
}
@@ -617,8 +616,8 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
{
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 0]), vdup_n_s16(coeff[i + 0]));
vsuma = vmlal_s16(vsuma, vget_low_s16(vsrc[i + 1]), vdup_n_s16(coeff[i + 1]));
vsumb = vmlal_high_s16(vsumb, vsrc[i + 0], vdupq_n_s16(coeff[i + 0]));
vsumb = vmlal_high_s16(vsumb, vsrc[i + 1], vdupq_n_s16(coeff[i + 1]));
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 0 ] ), vdup_n_s16( coeff[ i + 0 ] ) );
vsumb = vmlal_s16( vsumb, vget_high_s16( vsrc[ i + 1 ] ), vdup_n_s16( coeff[ i + 1 ] ) );
vsrc[i ] = vsrc[i + 1];
vsrc[i + 1] = vsrc[i + 2];
}
@@ -639,6 +638,7 @@ static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_
}
}

#if defined( TARGET_SIMD_X86 )
template<int N, bool isVertical, bool isFirst, bool isLast>
static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
{
@@ -817,7 +817,7 @@ static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride,
dst += dstStride;
}
}

#endif // defined( TARGET_SIMD_X86 )

template<>
void InterpolationFilter::_initInterpolationFilterARM<NEON>()
@@ -833,6 +833,7 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()

m_filterN2_2D = simdInterpolateN2_2D_neon;

#if defined( TARGET_SIMD_X86 )
m_filterHor[0][0][0] = simdFilterARM<8, false, false, false>;
m_filterHor[0][0][1] = simdFilterARM<8, false, false, true>;
m_filterHor[0][1][0] = simdFilterARM<8, false, true, false>;
@@ -862,8 +863,7 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
m_filterVer[3][0][1] = simdFilterARM<6, true, false, true>;
m_filterVer[3][1][0] = simdFilterARM<6, true, true, false>;
m_filterVer[3][1][1] = simdFilterARM<6, true, true, true>;


#endif // defined( TARGET_SIMD_X86 )
}

} // namespace vvenc
11 changes: 11 additions & 0 deletions source/Lib/CommonLib/arm/neon/sum_neon.h
@@ -109,6 +109,17 @@ static inline int16x8_t pairwise_add_s16x8( const int16x8_t a, const int16x8_t b
#endif
}

static inline int32x4_t pairwise_add_s32x4( const int32x4_t a, const int32x4_t b )
{
#if REAL_TARGET_AARCH64
return vpaddq_s32( a, b );
#else
int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_low_s32( b ) );
int32x2_t hi = vpadd_s32( vget_high_s32( a ), vget_high_s32( b ) );
return vcombine_s32( lo, hi );
#endif
}

} // namespace vvenc

#endif
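
For reference, a minimal standalone sketch (not part of the commit) showing
the new pairwise_add_s32x4 fallback feeding neon_madd_16 from RdCostARM.h.
Only the non-AArch64 branch is reproduced, since REAL_TARGET_AARCH64 is a
project-internal macro, and the small main() is just a scalar cross-check:

    #include <arm_neon.h>
    #include <stdio.h>

    // Portable pairwise add: the #else branch added to sum_neon.h above.
    static inline int32x4_t pairwise_add_s32x4( const int32x4_t a, const int32x4_t b )
    {
      int32x2_t lo = vpadd_s32( vget_low_s32( a ), vget_low_s32( b ) );
      int32x2_t hi = vpadd_s32( vget_high_s32( a ), vget_high_s32( b ) );
      return vcombine_s32( lo, hi );
    }

    // Widening multiply with pairwise accumulation, as rewritten in RdCostARM.h.
    static inline int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
    {
      int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
      int32x4_t d = vmull_s16( vget_high_s16( a ), vget_high_s16( b ) );
      return pairwise_add_s32x4( c, d );
    }

    int main( void )
    {
      const int16_t a[ 8 ] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      const int16_t b[ 8 ] = { 8, 7, 6, 5, 4, 3, 2, 1 };
      int32x4_t r = neon_madd_16( vld1q_s16( a ), vld1q_s16( b ) );
      int total   = vgetq_lane_s32( r, 0 ) + vgetq_lane_s32( r, 1 )
                  + vgetq_lane_s32( r, 2 ) + vgetq_lane_s32( r, 3 );
      printf( "dot product = %d (expected 120)\n", total );  // 1*8 + 2*7 + ... + 8*1
      return 0;
    }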
