diff --git a/AUTHORS.md b/AUTHORS.md
index 15830f961..0ace759b9 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -16,3 +16,5 @@
 * Hossein Pejman, , École de technologie supérieure (ÉTS)
 * Vignesh V Menon, , Fraunhofer HHI
 * George Steed, @georges-arm, Arm
+* Yiqun Liu, , Fraunhofer HHI
+* Mehrdad Ghafari, , Fraunhofer HHI
diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index 0eaebcad2..8f8614105 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -681,6 +681,10 @@ void InterPredInterpolation::init()
   initInterPredictionX86();
 #endif
 
+#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_ARM )
+  initInterPredictionARM();
+#endif
+
   if (m_storedMv == nullptr)
   {
     const int MVBUFFER_SIZE = MAX_CU_SIZE / MIN_PU_SIZE;
diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h
index 8b3686f0b..dabe99204 100644
--- a/source/Lib/CommonLib/InterPrediction.h
+++ b/source/Lib/CommonLib/InterPrediction.h
@@ -98,6 +98,12 @@ class InterPredInterpolation
   void _initInterPredictionX86();
 #endif
 
+#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_ARM )
+  void initInterPredictionARM();
+  template<ARM_VEXT vext>
+  void _initInterPredictionARM();
+#endif
+
 protected:
   void xWeightedAverage ( const CodingUnit& cu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const bool bdofApplied, PelUnitBuf *yuvPredTmp = NULL );
   void xPredAffineBlk ( const ComponentID compID, const CodingUnit& cu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool bi, const ClpRng& clpRng, const RefPicList refPicList = REF_PIC_LIST_X);
diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h
index c4239181c..301b551be 100644
--- a/source/Lib/CommonLib/RdCost.h
+++ b/source/Lib/CommonLib/RdCost.h
@@ -275,6 +275,12 @@ class RdCost
   template<ARM_VEXT vext>
   static Distortion xGetHADs_ARMSIMD    ( const DistParam& pcDtParam );
+
+  template<ARM_VEXT vext>
+  static Distortion xGetSADwMask_ARMSIMD( const DistParam &rcDtParam );
+
+  template< int iWidth, ARM_VEXT vext >
+  static Distortion xGetSAD_NxN_ARMSIMD ( const DistParam &rcDtParam );
 #endif
 
   unsigned int getBitsMultiplePredsIBC(int x, int y, bool useIMV);
diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp
index 362d9ff23..9d664c02d 100644
--- a/source/Lib/CommonLib/arm/InitARM.cpp
+++ b/source/Lib/CommonLib/arm/InitARM.cpp
@@ -121,6 +121,22 @@ void TCoeffOps::initTCoeffOpsARM()
 }
 #endif
 
+#if ENABLE_SIMD_OPT_BDOF
+void InterPredInterpolation::initInterPredictionARM()
+{
+  auto vext = read_arm_extension_flags();
+  switch( vext )
+  {
+  case NEON:
+    _initInterPredictionARM<NEON>();
+    break;
+  default:
+    break;
+  }
+}
+#endif
+
 #endif // TARGET_SIMD_ARM
 
 } // namespace
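
Note: the init path added here follows the existing x86 pattern in this codebase — a non-template entry point reads the supported extension set once at runtime and forwards to an explicitly instantiated template, so callers never name the vector extension. A minimal standalone sketch of the idiom (SimdExt, readExtensionFlags and Interp are illustrative stand-ins, not vvenc API):

#include <cstdio>

enum SimdExt { SCALAR, NEON };                        // stand-in for ARM_VEXT
static SimdExt readExtensionFlags() { return NEON; }  // stand-in for read_arm_extension_flags()

struct Interp
{
  void ( *kernel )( int ) = nullptr;

  template<SimdExt ext> void initTemplate();          // defined once per TU, instantiated explicitly
  void init()                                         // non-template runtime dispatcher
  {
    switch( readExtensionFlags() )
    {
    case NEON: initTemplate<NEON>(); break;
    default:   break;                                 // keep the scalar fallback
    }
  }
};

template<SimdExt ext> void Interp::initTemplate() { kernel = []( int x ){ std::printf( "%d\n", x ); }; }
template void Interp::initTemplate<NEON>();           // explicit instantiation, as InterPredARM.cpp does

int main() { Interp ip; ip.init(); ip.kernel( 42 ); }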
diff --git a/source/Lib/CommonLib/arm/InterPredARM.h b/source/Lib/CommonLib/arm/InterPredARM.h
new file mode 100644
index 000000000..543f9d2a1
--- /dev/null
+++ b/source/Lib/CommonLib/arm/InterPredARM.h
@@ -0,0 +1,292 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+/** \file     InterPredARM.h
+    \brief    SIMD for InterPrediction
+*/
+
+//! \ingroup CommonLib
+//! \{
+
+#include "CommonDefARM.h"
+#include "Rom.h"
+#include "InterPrediction.h"
+
+//! \ingroup CommonLib
+//! \{
+
+namespace vvenc {
+
+static inline int rightShiftMSB( int numer, int denom )
+{
+  int shiftIdx = bit_scan_reverse( denom );
+  return ( numer >> shiftIdx );
+}
+
+#ifdef TARGET_SIMD_ARM
+#if __ARM_ARCH >= 8
+
+template< ARM_VEXT vext >
+static inline void calcBIOSums_Neon( const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx, int &tmpy )
+{
+  const int srcStride = widthG + 2;
+  int16x8_t sumAbsGXTmp    = vdupq_n_s16(0);
+  int16x8_t sumDIXTmp      = vdupq_n_s16(0);
+  int16x8_t sumAbsGYTmp    = vdupq_n_s16(0);
+  int16x8_t sumDIYTmp      = vdupq_n_s16(0);
+  int16x8_t sumSignGyGxTmp = vdupq_n_s16(0);
+  int16x8_t x = {1, 1, 1, 1, 1, 1, 0, 0};
+
+  for (int y = 0; y < 3; y++)
+  {
+    int16x8_t shiftSrcY0Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY0Tmp)), 4);
+    int16x8_t shiftSrcY1Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY1Tmp)), 4);
+
+    int16x8_t loadGradX0 = vld1q_s16((int16_t*)(gradX0));
+    int16x8_t loadGradX1 = vld1q_s16((int16_t*)(gradX1));
+    int16x8_t loadGradY0 = vld1q_s16((int16_t*)(gradY0));
+    int16x8_t loadGradY1 = vld1q_s16((int16_t*)(gradY1));
+    int16x8_t subTemp1   = vsubq_s16(shiftSrcY1Tmp, shiftSrcY0Tmp);
+    int16x8_t packTempX  = vshrq_n_s16( vaddq_s16(loadGradX0, loadGradX1), 1 );
+    int16x8_t packTempY  = vshrq_n_s16( vaddq_s16(loadGradY0, loadGradY1), 1 );
+    int16x8_t gX         = vabsq_s16(packTempX);
+    int16x8_t gY         = vabsq_s16(packTempY);
+    int16x8_t dIX        = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempX, vdupq_n_s16(0)) - vcgeq_s16(packTempX, vdupq_n_s16(0))));
+    int16x8_t dIY        = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+    int16x8_t signGY_GX  = vmulq_s16(packTempX, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+
+    sumAbsGXTmp    = vaddq_s16(sumAbsGXTmp, gX);
+    sumAbsGYTmp    = vaddq_s16(sumAbsGYTmp, gY);
+    sumDIXTmp      = vaddq_s16(sumDIXTmp, dIX);
+    sumDIYTmp      = vaddq_s16(sumDIYTmp, dIY);
+    sumSignGyGxTmp = vaddq_s16(sumSignGyGxTmp, signGY_GX);
+
+    srcY0Tmp += srcStride;
+    srcY1Tmp += srcStride;
+    gradX0   += widthG;
+    gradX1   += widthG;
+    gradY0   += widthG;
+    gradY1   += widthG;
+
+    shiftSrcY0Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY0Tmp)), 4);
+    shiftSrcY1Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY1Tmp)), 4);
+
+    loadGradX0 = vld1q_s16((int16_t*)(gradX0));
+    loadGradX1 = vld1q_s16((int16_t*)(gradX1));
+    loadGradY0 = vld1q_s16((int16_t*)(gradY0));
+    loadGradY1 = vld1q_s16((int16_t*)(gradY1));
+    subTemp1   = vsubq_s16(shiftSrcY1Tmp, shiftSrcY0Tmp);
+    packTempX  = vshrq_n_s16( vaddq_s16(loadGradX0, loadGradX1), 1 );
+    packTempY  = vshrq_n_s16( vaddq_s16(loadGradY0, loadGradY1), 1 );
+
+    gX = vabsq_s16(packTempX);
+    gY = vabsq_s16(packTempY);
+
+    dIX       = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempX, vdupq_n_s16(0)) - vcgeq_s16(packTempX, vdupq_n_s16(0))));
+    dIY       = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+    signGY_GX = vmulq_s16(packTempX, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+
+    sumAbsGXTmp    = vaddq_s16(sumAbsGXTmp, gX);
+    sumAbsGYTmp    = vaddq_s16(sumAbsGYTmp, gY);
+    sumDIXTmp      = vaddq_s16(sumDIXTmp, dIX);
+    sumDIYTmp      = vaddq_s16(sumDIYTmp, dIY);
+    sumSignGyGxTmp = vaddq_s16(sumSignGyGxTmp, signGY_GX);
+
+    srcY0Tmp += srcStride;
+    srcY1Tmp += srcStride;
+    gradX0   += widthG;
+    gradX1   += widthG;
+    gradY0   += widthG;
+    gradY1   += widthG;
+  }
+
+  int sumAbsGX     = vaddvq_s16(vmulq_s16( sumAbsGXTmp,    x ));
+  int sumAbsGY     = vaddvq_s16(vmulq_s16( sumAbsGYTmp,    x ));
+  int sumDIX       = vaddvq_s16(vmulq_s16( sumDIXTmp,      x ));
+  int sumDIY       = vaddvq_s16(vmulq_s16( sumDIYTmp,      x ));
+  int sumSignGY_GX = vaddvq_s16(vmulq_s16( sumSignGyGxTmp, x ));
+
+  tmpx = sumAbsGX == 0 ? 0 : rightShiftMSB( sumDIX << 2, sumAbsGX );
+  tmpx = Clip3( -limit, limit, tmpx );
+
+  int mainsGxGy = sumSignGY_GX >> 12;
+  int secsGxGy  = sumSignGY_GX & ( ( 1 << 12 ) - 1 );
+  int tmpData   = tmpx * mainsGxGy;
+  tmpData = ( ( tmpData << 12 ) + tmpx * secsGxGy ) >> 1;
+  tmpy = sumAbsGY == 0 ? 0 : rightShiftMSB( ( ( sumDIY << 2 ) - tmpData ), sumAbsGY );
+  tmpy = Clip3( -limit, limit, tmpy );
+}
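
The repeated vcleq/vcgeq subtraction above is a branchless sign computation: Neon comparisons return all-ones lanes, which reinterpreted as int16 are -1, so (x <= 0) - (x >= 0) collapses to sign(x) per lane. The vector x = {1,1,1,1,1,1,0,0} then restricts the horizontal reduction to the 6-sample BDOF window. A scalar model with a quick self-check (my reading of the kernel, not vvenc code):

#include <cassert>
#include <cstdint>

// Scalar model of the Neon predicate trick in calcBIOSums_Neon: comparison
// results are all-ones masks (-1 as int16_t), so (x <= 0) - (x >= 0) yields
// sign(x) in {-1, 0, +1} without a branch.
static int16_t signFromMasks( int16_t x )
{
  int16_t le = ( x <= 0 ) ? -1 : 0;   // one lane of vcleq_s16
  int16_t ge = ( x >= 0 ) ? -1 : 0;   // one lane of vcgeq_s16
  return (int16_t)( le - ge );        // x > 0: 0 - (-1) = +1, etc.
}

int main()
{
  assert( signFromMasks(  7 ) ==  1 );
  assert( signFromMasks( -7 ) == -1 );
  assert( signFromMasks(  0 ) ==  0 );
}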
+
+template< ARM_VEXT vext >
+static inline void addBIOAvg4_Neon( const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng )
+{
+  const ptrdiff_t src0Stride = widthG + 2;
+  const ptrdiff_t src1Stride = widthG + 2;
+  const ptrdiff_t gradStride = widthG;
+  int32x4_t mm_offset = vdupq_n_s32( offset );
+  int16x4_t vibdimin  = vdup_n_s16( clpRng.min() );
+  int16x4_t vibdimax  = vdup_n_s16( clpRng.max() );
+
+  int16x4_t mm_a;
+  int16x4_t mm_b;
+  int32x4_t mm_sum;
+  int16x4_t mm_sum3;
+
+  for( int y = 0; y < 2; y++ )
+  {
+    mm_sum = vdupq_n_s32(0);
+
+    mm_a = vsub_s16( vld1_s16( (const int16_t *) gradX0 ), vld1_s16( (const int16_t *) gradX1 ) );
+    mm_b = vsub_s16( vld1_s16( (const int16_t *) gradY0 ), vld1_s16( (const int16_t *) gradY1 ) );
+
+    mm_sum  = vmlal_n_s16( mm_sum, mm_a, tmpx );
+    mm_sum  = vmlal_n_s16( mm_sum, mm_b, tmpy );
+    mm_sum  = vaddq_s32( vaddw_s16( mm_sum, vld1_s16( (const int16_t *) ( src0 ) ) ), vaddw_s16( mm_offset, vld1_s16( (const int16_t *) ( src1 ) ) ) );
+    mm_sum3 = vmin_s16( vibdimax, vmax_s16( vibdimin, vqmovn_s32( vshlq_s32( mm_sum, vdupq_n_s32( -1*shift ) ) ) ) );
+
+    vst1_s16( (int16_t *)dst, mm_sum3 );
+
+    dst    += dstStride;
+    src0   += src0Stride;
+    src1   += src1Stride;
+    gradX0 += gradStride;
+    gradX1 += gradStride;
+    gradY0 += gradStride;
+    gradY1 += gradStride;
+
+    mm_sum = vdupq_n_s32(0);
+
+    mm_a = vsub_s16( vld1_s16( (const int16_t *) gradX0 ), vld1_s16( (const int16_t *) gradX1 ) );
+    mm_b = vsub_s16( vld1_s16( (const int16_t *) gradY0 ), vld1_s16( (const int16_t *) gradY1 ) );
+
+    mm_sum  = vmlal_n_s16( mm_sum, mm_a, tmpx );
+    mm_sum  = vmlal_n_s16( mm_sum, mm_b, tmpy );
+    mm_sum  = vaddq_s32( vaddw_s16( mm_sum, vld1_s16( (const int16_t *) ( src0 ) ) ), vaddw_s16( mm_offset, vld1_s16( (const int16_t *) ( src1 ) ) ) );
+    mm_sum3 = vmin_s16( vibdimax, vmax_s16( vibdimin, vqmovn_s32( vshlq_s32( mm_sum, vdupq_n_s32( -1*shift ) ) ) ) );
+
+    vst1_s16( (int16_t *)dst, mm_sum3 );
+
+    dst    += dstStride;
+    src0   += src0Stride;
+    src1   += src1Stride;
+    gradX0 += gradStride;
+    gradX1 += gradStride;
+    gradY0 += gradStride;
+    gradY1 += gradStride;
+  }
+}
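
Per output lane, the vector arithmetic above is the standard BDOF correction applied on top of the bi-prediction average. A scalar reference of one pixel, usable as a test oracle (a sketch of my reading of the vector code; the vqmovn saturation step is subsumed by the final clamp since the clipping range lies inside int16):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one output pixel of addBIOAvg4_Neon: gradient differences
// weighted by the per-4x4 refinement (tmpx, tmpy), plus both predictions and
// the rounding offset, then arithmetic right shift and clipping.
static int16_t bioPixel( int16_t s0, int16_t s1, int16_t gx0, int16_t gx1, int16_t gy0, int16_t gy1,
                         int tmpx, int tmpy, int shift, int offset, int clpMin, int clpMax )
{
  int32_t sum = ( gx0 - gx1 ) * tmpx + ( gy0 - gy1 ) * tmpy;
  sum += s0 + s1 + offset;
  return (int16_t)std::clamp( sum >> shift, clpMin, clpMax );
}

int main()
{
  // ((4-2)*2 + (3-1)*1 + 100 + 110 + 1) >> 1 = 217 >> 1 = 108
  assert( bioPixel( 100, 110, 4, 2, 3, 1, 2, 1, 1, 1, 0, 1023 ) == 108 );
}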
+
+template< ARM_VEXT vext >
+void BiOptFlowCoreARMSIMD( const Pel* srcY0,
+                           const Pel* srcY1,
+                           const Pel* gradX0,
+                           const Pel* gradX1,
+                           const Pel* gradY0,
+                           const Pel* gradY1,
+                           const int  width,
+                           const int  height,
+                           Pel*       dstY,
+                           const ptrdiff_t dstStride,
+                           const int  shiftNum,
+                           const int  offset,
+                           const int  limit,
+                           const ClpRng& clpRng,
+                           const int  bitDepth )
+{
+  const int widthG       = width + 2 * BDOF_EXTEND_SIZE;
+  const int stridePredMC = widthG + 2;
+  int       offsetPos    = widthG * BDOF_EXTEND_SIZE + BDOF_EXTEND_SIZE;
+  const int xUnit        = ( width  >> 2 );
+  const int yUnit        = ( height >> 2 );
+
+  const Pel* srcY0Temp;
+  const Pel* srcY1Temp;
+  Pel*       dstY0;
+
+  int OffPos;
+  int OffPad = 0;
+
+  for( int yu = 0; yu < yUnit; yu++, srcY0 += ( stridePredMC << 2 ), srcY1 += ( stridePredMC << 2 ), dstY += ( dstStride << 2 ), offsetPos += ( widthG << 2 ) )
+  {
+    srcY0Temp = srcY0;
+    srcY1Temp = srcY1;
+    dstY0     = dstY;
+
+    OffPos = offsetPos;
+    OffPad = ( ( yu * widthG ) << 2 );
+    for( int xu = 0; xu < xUnit; xu++, srcY0Temp += 4, srcY1Temp += 4, dstY0 += 4, OffPos += 4, OffPad += 4 )
+    {
+      int tmpx, tmpy;
+
+      calcBIOSums_Neon<vext>( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, widthG, bitDepth, limit, tmpx, tmpy );
+
+      addBIOAvg4_Neon<vext> ( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx, tmpy, shiftNum, offset, clpRng );
+    }
+  }
+}
+
+template< ARM_VEXT vext >
+void InterPredInterpolation::_initInterPredictionARM()
+{
+  xFpBiDirOptFlow = BiOptFlowCoreARMSIMD<vext>;
+}
+
+#else
+
+template< ARM_VEXT vext >
+void InterPredInterpolation::_initInterPredictionARM()
+{}
+#endif
+
+template void InterPredInterpolation::_initInterPredictionARM<NEON>();
+
+#endif // TARGET_SIMD_ARM
+} // namespace vvenc
+
+//! \}
+
+//! \}
diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h
index c3e7dc1c7..c45ea3506 100644
--- a/source/Lib/CommonLib/arm/RdCostARM.h
+++ b/source/Lib/CommonLib/arm/RdCostARM.h
@@ -69,6 +69,17 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc
 {
+
+// Widening 16-bit multiply with pairwise 32-bit accumulation, the Neon
+// counterpart of SSE _mm_madd_epi16.
+static int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
+{
+  int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
+  int32x4_t d = vmull_high_s16( a, b );
+  return vpaddq_s32( c, d );
+}
+
 #if defined( TARGET_SIMD_ARM )
 
 // The xGetHADs_ARMSIMD functions depend on the SIMDe kernels being enabled
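
neon_madd_16 mirrors the contract that the SIMDe-translated x86 kernels in this file rely on: multiply eight int16 lanes to 32 bit and add adjacent product pairs. A scalar statement of that contract with a self-check (illustrative reference, not vvenc code):

#include <cassert>
#include <cstdint>

// Scalar contract of neon_madd_16 / _mm_madd_epi16: out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
static void madd16( const int16_t a[8], const int16_t b[8], int32_t out[4] )
{
  for( int i = 0; i < 4; i++ )
    out[i] = (int32_t)a[2 * i] * b[2 * i] + (int32_t)a[2 * i + 1] * b[2 * i + 1];
}

int main()
{
  const int16_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  const int16_t b[8] = { 1, 1, 1, 1, 2, 2, 2, 2 };
  int32_t r[4];
  madd16( a, b, r );
  assert( r[0] == 3 && r[1] == 7 && r[2] == 22 && r[3] == 30 );
}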
@@ -997,6 +1008,159 @@ void RdCost::xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, b
   xGetSADX5_16xN_SIMDImp( rcDtParam, cost );
 }
 
+template< int iWidth, ARM_VEXT vext >
+Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
+{
+  const short* pSrc1       = (const short*)rcDtParam.org.buf;
+  const short* pSrc2       = (const short*)rcDtParam.cur.buf;
+  int          iRows       = rcDtParam.org.height;
+  int          iSubShift   = rcDtParam.subShift;
+  int          iSubStep    = ( 1 << iSubShift );
+  const int    iStrideSrc1 = rcDtParam.org.stride * iSubStep;
+  const int    iStrideSrc2 = rcDtParam.cur.stride * iSubStep;
+
+  uint32_t  uiSum    = 0;
+  int16x8_t vzero_16 = vdupq_n_s16(0);
+
+  if( iWidth == 4 )
+  {
+    if( iRows == 4 && iSubShift == 0 )
+    {
+      int16x8_t vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc1 ), vld1_s16( ( const int16_t* )( &pSrc1[iStrideSrc1] ) ) );
+      int16x8_t vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc2 ), vld1_s16( ( const int16_t* )( &pSrc2[iStrideSrc2] ) ) );
+      int32x4_t vsum  = vmovl_s16( vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) );
+      vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc1[2 * iStrideSrc1] ) ), vld1_s16( ( const int16_t* )( &pSrc1[3 * iStrideSrc1] ) ) );
+      vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc2[2 * iStrideSrc2] ) ), vld1_s16( ( const int16_t* )( &pSrc2[3 * iStrideSrc2] ) ) );
+      vsum  = vaddq_s32( vsum, vmovl_s16( vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ) );
+      uiSum = vaddvq_s32(vsum);
+    }
+    else
+    {
+      int32x4_t vsum32 = vdupq_n_s32(0);
+      for( int iY = 0; iY < iRows; iY += iSubStep )
+      {
+        int32x4_t vsrc1 = vmovl_s16( vld1_s16( ( const int16_t* )pSrc1 ) );
+        int32x4_t vsrc2 = vmovl_s16( vld1_s16( ( const int16_t* )pSrc2 ) );
+        vsum32 = vaddq_s32( vsum32, vabsq_s32( vsubq_s32( vsrc1, vsrc2 ) ) );
+
+        pSrc1 += iStrideSrc1;
+        pSrc2 += iStrideSrc2;
+      }
+      uiSum = vaddvq_s32(vsum32);
+    }
+  }
+  else
+  {
+    static constexpr bool earlyExitAllowed = iWidth >= 64;
+    int32x4_t vsum32 = vdupq_n_s32( 0 );
+    int checkExit = 3;
+
+    for( int iY = 0; iY < iRows; iY += iSubStep )
+    {
+      int16x8_t vsrc1  = vld1q_s16( ( const int16_t* )( pSrc1 ) );
+      int16x8_t vsrc2  = vld1q_s16( ( const int16_t* )( pSrc2 ) );
+      int16x8_t vsum16 = vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) );
+
+      if( iWidth >= 16 )
+      {
+        vsrc1  = vld1q_s16( ( const int16_t* )( &pSrc1[8] ) );
+        vsrc2  = vld1q_s16( ( const int16_t* )( &pSrc2[8] ) );
+        vsum16 = vaddq_s16( vsum16, vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ) );
+
+        for( int iX = 16; iX < iWidth; iX += 16 )
+        {
+          vsrc1  = vld1q_s16( ( const int16_t* )( &pSrc1[iX] ) );
+          vsrc2  = vld1q_s16( ( const int16_t* )( &pSrc2[iX] ) );
+          vsum16 = vaddq_s16( vsum16, vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ) );
+
+          vsrc1  = vld1q_s16( ( const int16_t* )( &pSrc1[iX + 8] ) );
+          vsrc2  = vld1q_s16( ( const int16_t* )( &pSrc2[iX + 8] ) );
+          vsum16 = vaddq_s16( vsum16, vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ) );
+        }
+      }
+
+      int32x4_t vsumtemp = vpaddlq_s16( vsum16 );
+
+      if( earlyExitAllowed ) vsum32 = vpaddq_s32( vsum32, vsumtemp );
+      else                   vsum32 = vaddq_s32 ( vsum32, vsumtemp );
+
+      pSrc1 += iStrideSrc1;
+      pSrc2 += iStrideSrc2;
+
+      if( earlyExitAllowed && checkExit == 0 )
+      {
+        Distortion distTemp = vgetq_lane_s32(vsum32, 0);
+        distTemp <<= iSubShift;
+        distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
+        if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp;
+        checkExit = 3;
+      }
+      else if( earlyExitAllowed )
+      {
+        checkExit--;
+      }
+    }
+    uiSum = vaddvq_s32(vsum32);
+  }
+
+  uiSum <<= iSubShift;
+  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
+}
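
A plain scalar SAD is the natural oracle for unit-testing this kernel. A sketch under the same conventions (row subsampling by subShift compensated by a final left shift, then the bit-depth distortion scaling; the early-exit path is deliberately not modeled):

#include <cstdint>
#include <cstdlib>

// Scalar reference for xGetSAD_NxN_ARMSIMD (testing sketch, not vvenc API).
static uint64_t refSAD( const int16_t* org, ptrdiff_t orgStride, const int16_t* cur, ptrdiff_t curStride,
                        int width, int height, int subShift, int distShift )
{
  uint64_t sum = 0;
  for( int y = 0; y < height; y += ( 1 << subShift ) )
    for( int x = 0; x < width; x++ )
      sum += std::abs( org[y * orgStride + x] - cur[y * curStride + x] );
  return ( sum << subShift ) >> distShift;
}

int main()
{
  const int16_t org[4] = { 10, 20, 30, 40 };
  const int16_t cur[4] = { 11, 18, 30, 44 };
  return refSAD( org, 2, cur, 2, 2, 2, 0, 0 ) == 7 ? 0 : 1;
}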
+
+template< ARM_VEXT vext >
+Distortion RdCost::xGetSADwMask_ARMSIMD( const DistParam &rcDtParam )
+{
+  if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
+    return RdCost::xGetSADwMask( rcDtParam );
+
+  const short *src1       = (const short *) rcDtParam.org.buf;
+  const short *src2       = (const short *) rcDtParam.cur.buf;
+  const short *weightMask = (const short *) rcDtParam.mask;
+  int  rows               = rcDtParam.org.height;
+  int  cols               = rcDtParam.org.width;
+  int  subShift           = rcDtParam.subShift;
+  int  subStep            = (1 << subShift);
+  const int strideSrc1    = rcDtParam.org.stride * subStep;
+  const int strideSrc2    = rcDtParam.cur.stride * subStep;
+  const int strideMask    = rcDtParam.maskStride * subStep;
+
+  Distortion sum = 0;
+
+  int32x4_t vsum32 = vdupq_n_s32( 0 );
+  static const uint8_t shuffle_table[16] = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+  uint8x16_t shuffle_vector = vld1q_u8(shuffle_table);
+
+  for (int y = 0; y < rows; y += subStep)
+  {
+    for (int x = 0; x < cols; x += 8)
+    {
+      int16x8_t vsrc1 = vld1q_s16( ( const int16_t* )(&src1[x] ) );
+      int16x8_t vsrc2 = vld1q_s16( ( const int16_t* )(&src2[x] ) );
+      int16x8_t vmask;
+      if (rcDtParam.stepX == -1)
+      {
+        vmask = vld1q_s16( ( const int16_t* ) ((&weightMask[x]) - (x << 1) - (8 - 1)));
+        uint8x16_t input_vector    = vreinterpretq_u8_s16(vmask);
+        uint8x16_t shuffled_vector = vqtbl1q_u8(input_vector, shuffle_vector);
+        vmask = vreinterpretq_s16_u8(shuffled_vector);
+      }
+      else
+      {
+        vmask = vld1q_s16( ( const int16_t* ) (&weightMask[x]));
+      }
+      vsum32 = vaddq_s32(vsum32, neon_madd_16(vmask, vabsq_s16(vsubq_s16(vsrc1, vsrc2))));
+    }
+    src1 += strideSrc1;
+    src2 += strideSrc2;
+    weightMask += strideMask;
+  }
+  sum = vaddvq_s32(vsum32);
+  sum <<= subShift;
+  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
+}
+
+
 template<ARM_VEXT vext>
 void RdCost::_initRdCostARM()
 {
@@ -1022,6 +1186,16 @@ void RdCost::_initRdCostARM()
   m_afpDistortFunc[0][DF_HAD32_fast]  = RdCost::xGetHADs_ARMSIMD<vext>;
   m_afpDistortFunc[0][DF_HAD64_fast]  = RdCost::xGetHADs_ARMSIMD<vext>;
   m_afpDistortFunc[0][DF_HAD128_fast] = RdCost::xGetHADs_ARMSIMD<vext>;
+
+  m_afpDistortFunc[0][DF_SAD4  ] = xGetSAD_NxN_ARMSIMD<4,   vext>;
+  m_afpDistortFunc[0][DF_SAD8  ] = xGetSAD_NxN_ARMSIMD<8,   vext>;
+  m_afpDistortFunc[0][DF_SAD16 ] = xGetSAD_NxN_ARMSIMD<16,  vext>;
+  m_afpDistortFunc[0][DF_SAD32 ] = xGetSAD_NxN_ARMSIMD<32,  vext>;
+  m_afpDistortFunc[0][DF_SAD64 ] = xGetSAD_NxN_ARMSIMD<64,  vext>;
+  m_afpDistortFunc[0][DF_SAD128] = xGetSAD_NxN_ARMSIMD<128, vext>;
+
+  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = xGetSADwMask_ARMSIMD<vext>;
+
 #endif // defined( TARGET_SIMD_X86 )
 }
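
For geometric-partitioning SAD with stepX == -1, the mask is traversed right-to-left; the byte table above swaps whole 16-bit lanes in one vqtbl1q_u8, which amounts to reversing the eight mask weights (bytes keep little-endian order inside each lane). A scalar model of that reversal (my reading of the shuffle, for verification only):

#include <cassert>
#include <cstdint>

// Scalar model of the stepX == -1 path in xGetSADwMask_ARMSIMD: the table
// {14,15, 12,13, ..., 0,1} reverses the order of the 8 int16 mask weights.
static void reverseLanes( const int16_t in[8], int16_t out[8] )
{
  for( int i = 0; i < 8; i++ )
    out[i] = in[7 - i];
}

int main()
{
  const int16_t m[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int16_t r[8];
  reverseLanes( m, r );
  assert( r[0] == 7 && r[7] == 0 );
}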
diff --git a/source/Lib/CommonLib/arm/neon/InterPredARM.cpp b/source/Lib/CommonLib/arm/neon/InterPredARM.cpp
new file mode 100644
index 000000000..a469dcb58
--- /dev/null
+++ b/source/Lib/CommonLib/arm/neon/InterPredARM.cpp
@@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+
+#include "../InterPredARM.h"
\ No newline at end of file
diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
index 3d7af09f5..d23ab150a 100644
--- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
+++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -56,6 +56,18 @@ POSSIBILITY OF SUCH DAMAGE.
 //! \ingroup CommonLib
 //! \{
 
+#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_AVX2
+# define USE_AVX2
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE42
+# define USE_SSE42
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE41
+# define USE_SSE41
+#endif
+
+#ifdef TARGET_SIMD_X86
+# include "../x86/InterpolationFilterX86.h"
+#endif
+
 #if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF
 
 namespace vvenc
@@ -497,6 +509,316 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, int sr
   } while( --height != 0 );
 }
 
+template< int N, bool shiftBack >
+static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
+{
+  int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
+  int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
+  int32x4_t vsuma, vsumb;
+  int16x8_t vsum, vsrc0, vsrc1;
+
+  for( int row = 0; row < height; row++ )
+  {
+    for( int col = 0; col < width; col += 8 )
+    {
+      vsuma = vdupq_n_s32(offset);
+      vsumb = vdupq_n_s32(offset);
+
+      vsrc0 = vld1q_s16( ( const int16_t * )&src[col] );
+      vsrc1 = vld1q_s16( ( const int16_t * )&src[col + 4] );
+
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[0]));
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[1]));
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[2]));
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[3]));
+
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[0]));
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[1]));
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[2]));
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[3]));
+
+      if( N == 8 )
+      {
+        vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[6]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[7]));
+
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[6]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[7]));
+      }
+      if( N == 6 )
+      {
+        vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
+
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
+      }
+
+      vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
+      vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
+      vsum  = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));
+
+      if( shiftBack )
+      {
+        vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
+      }
+      vst1q_s16((int16_t*) &dst[col], vsum);
+    }
+    src += srcStride;
+    dst += dstStride;
+  }
+}
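
The horizontal kernel keeps all multiplies in the upper vector halves: vextq_s16(v, v, 4 + k) rotates the register so that source lanes k..k+3 land in the high half, which vmlal_high_s16 then multiplies by tap coeff[k]. The net effect per accumulator lane is an ordinary FIR window. A scalar restatement of what one 4-lane accumulator gathers (illustrative, with a self-check):

#include <cassert>
#include <cstdint>

// Scalar model of the vext/vmlal_high tap alignment in simdInterpolateHorM8_Neon:
// accumulator lane i receives offset + sum_k src[i + k] * coeff[k].
static void fir4( const int16_t* src, const int16_t* coeff, int nTaps, int32_t out[4], int32_t offset )
{
  for( int i = 0; i < 4; i++ )
  {
    out[i] = offset;
    for( int k = 0; k < nTaps; k++ )
      out[i] += (int32_t)src[i + k] * coeff[k];
  }
}

int main()
{
  const int16_t src[7]   = { 1, 2, 3, 4, 5, 6, 7 };
  const int16_t coeff[4] = { 1, 1, 1, 1 };
  int32_t r[4];
  fir4( src, coeff, 4, r, 0 );
  assert( r[0] == 10 && r[3] == 22 );
}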
+
+template< int N, bool shiftBack >
+static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
+{
+  const Pel* srcOrig = src;
+  int16_t*   dstOrig = dst;
+
+  int16x8_t vsrc[N+1];
+  int32x4_t voffset  = vdupq_n_s32( offset );
+  int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
+  int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
+  int32x4_t vsuma, vsumb;
+  int16x8_t vsum;
+  vsrc[N] = vdupq_n_s16(0);
+
+  for( int col = 0; col < width; col += 8 )
+  {
+    for( int i = 0; i < N - 1; i++ )
+    {
+      vsrc[i] = vld1q_s16( ( int16_t const * )&src[col + i * srcStride] );
+    }
+
+    for( int row = 0; row < height; row++ )
+    {
+      vsrc[N - 1] = vld1q_s16( ( int16_t const * )&src[col + ( N - 1 ) * srcStride] );
+      vsuma = vsumb = voffset;
+      if( N == 2 )
+      {
+        vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[0]), vdup_n_s16(coeff[0]));
+        vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[1]), vdup_n_s16(coeff[1]));
+        vsumb = vmlal_high_s16(vsumb, vsrc[0], vdupq_n_s16(coeff[0]));
+        vsumb = vmlal_high_s16(vsumb, vsrc[1], vdupq_n_s16(coeff[1]));
+
+        vsrc[0] = vsrc[1];
+      }
+      else
+      {
+        for( int i = 0; i < N; i += 2 )
+        {
+          vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[i + 0]), vdup_n_s16(coeff[i + 0]));
+          vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[i + 1]), vdup_n_s16(coeff[i + 1]));
+          vsumb = vmlal_high_s16(vsumb, vsrc[i + 0], vdupq_n_s16(coeff[i + 0]));
+          vsumb = vmlal_high_s16(vsumb, vsrc[i + 1], vdupq_n_s16(coeff[i + 1]));
+          vsrc[i    ] = vsrc[i + 1];
+          vsrc[i + 1] = vsrc[i + 2];
+        }
+      }
+      vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
+      vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
+      vsum  = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));
+      if( shiftBack )
+      {
+        vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
+      }
+      vst1q_s16((int16_t*) &dst[col], vsum);
+      src += srcStride;
+      dst += dstStride;
+    }
+    src = srcOrig;
+    dst = dstOrig;
+  }
+}
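
The vertical kernel holds the last N source rows in vsrc[] and rotates the window down by one row per output line, so each source row is loaded from memory only once per 8-wide column. A scalar statement of the access pattern it implements (illustrative reference; shift, offset and clipping left out):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar model of the row window in simdInterpolateVerM8_Neon for one column:
// dst[row] = sum_k src[row + k] * coeff[k].
static void verFirColumn( const int16_t* src, ptrdiff_t stride, int16_t* dst,
                          int height, const int16_t* coeff, int nTaps )
{
  for( int row = 0; row < height; row++ )
  {
    int32_t sum = 0;
    for( int k = 0; k < nTaps; k++ )
      sum += (int32_t)src[( row + k ) * stride] * coeff[k];
    dst[row] = (int16_t)sum;
  }
}

int main()
{
  const int16_t src[5]   = { 1, 2, 3, 4, 5 };
  const int16_t coeff[2] = { 1, 1 };
  int16_t dst[4];
  verFirColumn( src, 1, dst, 4, coeff, 2 );
  assert( dst[0] == 3 && dst[3] == 9 );
}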
+
+template< int N, bool isVertical, bool isFirst, bool isLast >
+static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
+{
+  int row, col;
+
+  Pel c[8];
+  c[0] = coeff[0];
+  c[1] = coeff[1];
+  if( N >= 4 )
+  {
+    c[2] = coeff[2];
+    c[3] = coeff[3];
+  }
+  if( N >= 6 )
+  {
+    c[4] = coeff[4];
+    c[5] = coeff[5];
+  }
+  if( N == 8 )
+  {
+    c[6] = coeff[6];
+    c[7] = coeff[7];
+  }
+
+  int cStride = ( isVertical ) ? srcStride : 1;
+  src -= ( N/2 - 1 ) * cStride;
+
+  int offset;
+  int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
+  int shift    = IF_FILTER_PREC;
+  CHECK( shift < 0, "Negative shift" );
+
+  if( N != 2 )
+  {
+    if( isLast )
+    {
+      shift  += ( isFirst ) ? 0 : headRoom;
+      offset  = 1 << ( shift - 1 );
+      offset += ( isFirst ) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    }
+    else
+    {
+      shift -= ( isFirst ) ? headRoom : 0;
+      offset = ( isFirst ) ? -IF_INTERNAL_OFFS * (1<< shift) : 0;
+    }
+  }
+  else
+  {
+    if( isFirst )
+    {
+      shift  = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd);
+      offset = 1 << (shift - 1);
+    }
+    else
+    {
+      shift  = 4;
+      offset = 1 << (shift - 1);
+    }
+  }
+
+  CHECKD( clpRng.bd > 10, "VVenC does not support bitdepths larger than 10!" );
+
+  if( N == 6 )
+  {
+    c[6] = coeff[6];
+    c[7] = coeff[7];
+    int src8tOff = cStride;
+
+    if( !( width & 7 ) )
+    {
+      if( !isVertical )
+      {
+        simdInterpolateHorM8_Neon<6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
+      }
+      else
+      {
+        simdInterpolateVerM8_Neon<6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
+      }
+    }
+    else if( !( width & 3 ) )
+    {
+      if( !isVertical )
+      {
+        simdInterpolateHorM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+      }
+      else
+        simdInterpolateVerM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
+    }
+    else if( width == 1 && !isVertical )
+    {
+      simdInterpolateHorM1<SIMD_EVERYWHERE_EXTENSION_LEVEL, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    }
+    else if( width == 1 && isVertical )
+    {
+      c[0] = c[1]; c[1] = c[2]; c[2] = c[3]; c[3] = c[4]; c[4] = c[5]; c[5] = coeff[6];
+      goto scalar_if;
+    }
+
+    return;
+  }
+
+  if( !isVertical && N != 2 )
+  {
+    if( ( width & 7 ) == 0 )
+    {
+      simdInterpolateHorM8_Neon<N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    }
+    else if( ( width & 3 ) == 0 )
+      simdInterpolateHorM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else if( ( width & 1 ) == 0 )
+      simdInterpolateHorM2<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else
+      simdInterpolateHorM1<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    return;
+  }
+  else if( N != 2 )
+  {
+    if( ( width & 7 ) == 0 )
+    {
+      simdInterpolateVerM8_Neon<N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    }
+    else if( ( width & 3 ) == 0 )
+      simdInterpolateVerM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else if( ( width & 1 ) == 0 )
+      simdInterpolateVerM2<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else
+      simdInterpolateVerM1<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    return;
+  }
+  else
+  {
+    THROW( "To be implemented" );
+    return;
+  }
+
+scalar_if:
+  for( row = 0; row < height; row++ )
+  {
+    for( col = 0; col < width; col++ )
+    {
+      int sum;
+
+      sum  = src[col + 0 * cStride] * c[0];
+      sum += src[col + 1 * cStride] * c[1];
+      if( N >= 4 )
+      {
+        sum += src[col + 2 * cStride] * c[2];
+        sum += src[col + 3 * cStride] * c[3];
+      }
+      if( N >= 6 )
+      {
+        sum += src[col + 4 * cStride] * c[4];
+        sum += src[col + 5 * cStride] * c[5];
+      }
+      if( N == 8 )
+      {
+        sum += src[col + 6 * cStride] * c[6];
+        sum += src[col + 7 * cStride] * c[7];
+      }
+
+      Pel val = ( sum + offset ) >> shift;
+      if( isLast )
+      {
+        val = ClipPel( val, clpRng );
+      }
+      dst[col] = val;
+    }
+
+    src += srcStride;
+    dst += dstStride;
+  }
+}
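
The shift/offset staging above follows the generic InterpolationFilter convention: a first-but-not-last stage keeps IF_INTERNAL_PREC headroom and pre-subtracts IF_INTERNAL_OFFS, while the last stage folds the rounding term and the offset back in. A compilable restatement with one worked value, assuming the usual vvenc constants IF_FILTER_PREC = 6 and IF_INTERNAL_PREC = 14 (so IF_INTERNAL_OFFS = 8192 and headRoom = 4 at 10 bit):

#include <algorithm>
#include <cstdio>

// Sketch of the N != 2 stage arithmetic in simdFilterARM under assumed constants.
static void stageParams( bool isFirst, bool isLast, int bd, int& shift, int& offset )
{
  const int filterPrec = 6, internalPrec = 14, internalOffs = 1 << ( internalPrec - 1 );
  const int headRoom = std::max( 2, internalPrec - bd );
  shift = filterPrec;
  if( isLast )
  {
    shift += isFirst ? 0 : headRoom;
    offset = ( 1 << ( shift - 1 ) ) + ( isFirst ? 0 : internalOffs << filterPrec );
  }
  else
  {
    shift -= isFirst ? headRoom : 0;
    offset = isFirst ? -internalOffs * ( 1 << shift ) : 0;
  }
}

int main()
{
  int s, o;
  stageParams( true, false, 10, s, o );            // horizontal first stage at 10 bit
  std::printf( "shift=%d offset=%d\n", s, o );     // prints shift=2 offset=-32768
}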
+
+
 template<>
 void InterpolationFilter::_initInterpolationFilterARM<NEON>()
 {
@@ -510,6 +832,38 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
   m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon;
 
   m_filterN2_2D = simdInterpolateN2_2D_neon;
+
+  m_filterHor[0][0][0] = simdFilterARM<8, false, false, false>;
+  m_filterHor[0][0][1] = simdFilterARM<8, false, false, true>;
+  m_filterHor[0][1][0] = simdFilterARM<8, false, true,  false>;
+  m_filterHor[0][1][1] = simdFilterARM<8, false, true,  true>;
+
+  m_filterHor[1][0][0] = simdFilterARM<4, false, false, false>;
+  m_filterHor[1][0][1] = simdFilterARM<4, false, false, true>;
+  m_filterHor[1][1][0] = simdFilterARM<4, false, true,  false>;
+  m_filterHor[1][1][1] = simdFilterARM<4, false, true,  true>;
+
+  m_filterHor[3][0][0] = simdFilterARM<6, false, false, false>;
+  m_filterHor[3][0][1] = simdFilterARM<6, false, false, true>;
+  m_filterHor[3][1][0] = simdFilterARM<6, false, true,  false>;
+  m_filterHor[3][1][1] = simdFilterARM<6, false, true,  true>;
+
+  m_filterVer[0][0][0] = simdFilterARM<8, true, false, false>;
+  m_filterVer[0][0][1] = simdFilterARM<8, true, false, true>;
+  m_filterVer[0][1][0] = simdFilterARM<8, true, true,  false>;
+  m_filterVer[0][1][1] = simdFilterARM<8, true, true,  true>;
+
+  m_filterVer[1][0][0] = simdFilterARM<4, true, false, false>;
+  m_filterVer[1][0][1] = simdFilterARM<4, true, false, true>;
+  m_filterVer[1][1][0] = simdFilterARM<4, true, true,  false>;
+  m_filterVer[1][1][1] = simdFilterARM<4, true, true,  true>;
+
+  m_filterVer[3][0][0] = simdFilterARM<6, true, false, false>;
+  m_filterVer[3][0][1] = simdFilterARM<6, true, false, true>;
+  m_filterVer[3][1][0] = simdFilterARM<6, true, true,  false>;
+  m_filterVer[3][1][1] = simdFilterARM<6, true, true,  true>;
+
 }
 
 } // namespace vvenc
diff --git a/source/Lib/EncoderLib/EncCfg.cpp b/source/Lib/EncoderLib/EncCfg.cpp
index a4f20183e..58f053064 100644
--- a/source/Lib/EncoderLib/EncCfg.cpp
+++ b/source/Lib/EncoderLib/EncCfg.cpp
@@ -83,15 +83,16 @@ static unsigned getMaxTlVal( unsigned perTlVal )
 
 void VVEncCfg::xInitCfgMembers()
 {
-  m_stageParallelProc = m_numThreads > 0 && m_maxParallelFrames > 0;
-  m_log2GopSize       = floorLog2( m_GOPSize );
-  m_maxTLayer         = m_picReordering && m_GOPSize > 1 ? vvenc::ceilLog2( m_GOPSize ) : 0;
-  m_bimCtuSize        = m_CTUSize;
-  m_MaxQT[0]          =
-  m_MaxQT[1]          =
-  m_MaxQT[2]          = m_CTUSize;
-  m_rateCap           = m_RCMaxBitrate > 0 && m_RCMaxBitrate < INT32_MAX && m_RCTargetBitrate == 0;
-  m_reuseCuResults    = ( m_IntraPeriod > 1 && getMaxTlVal( m_maxMTTDepth ) > 1 ) || m_maxMTTDepthI > ( m_IntraPeriod == 1 ? 1 : 2 );
+  m_stageParallelProc   = m_numThreads > 0 && m_maxParallelFrames > 0;
+  m_log2GopSize         = floorLog2( m_GOPSize );
+  m_maxTLayer           = m_picReordering && m_GOPSize > 1 ? vvenc::ceilLog2( m_GOPSize ) : 0;
+  m_bimCtuSize          = m_CTUSize;
+  m_MaxQT[0]            =
+  m_MaxQT[1]            =
+  m_MaxQT[2]            = m_CTUSize;
+  m_rateCap             = m_RCMaxBitrate > 0 && m_RCMaxBitrate < INT32_MAX && m_RCTargetBitrate == 0;
+  m_reuseCuResults      = ( m_IntraPeriod > 1 && getMaxTlVal( m_maxMTTDepth ) > 1 ) || m_maxMTTDepthI > ( m_IntraPeriod == 1 ? 1 : 2 );
+  m_splitCostThrParamId = getMaxTlVal( m_maxMTTDepth );
 
   m_mergeRdCandQuotaRegular = std::min( NUM_MRG_SATD_CAND, std::max( ( int ) m_maxNumMergeCand - 2, 1 ) );
 
   //                           0    1    2    3    4
diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h
index 8fc1cef8f..3425e24f6 100644
--- a/source/Lib/EncoderLib/EncCfg.h
+++ b/source/Lib/EncoderLib/EncCfg.h
@@ -91,6 +91,7 @@ struct VVEncCfg : public vvenc_config
   int       m_mergeRdCandQuotaCiip;
   int       m_mergeRdCandQuotaGpm;
   bool      m_reuseCuResults;
+  int       m_splitCostThrParamId;
   vvencFG   m_fg;
 
 private:
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index 6dfc37531..37668d227 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -75,6 +75,26 @@ const MergeIdxPair EncCu::m_GeoModeTest[GEO_MAX_NUM_CANDS] = { MergeIdxPair{0, 1
   MergeIdxPair{3, 4}, MergeIdxPair{4, 0}, MergeIdxPair{4, 1}, MergeIdxPair{4, 2}, MergeIdxPair{4, 3}, MergeIdxPair{0, 5},
   MergeIdxPair{1, 5}, MergeIdxPair{2, 5}, MergeIdxPair{3, 5}, MergeIdxPair{4, 5}, MergeIdxPair{5, 0}, MergeIdxPair{5, 1},
   MergeIdxPair{5, 2}, MergeIdxPair{5, 3}, MergeIdxPair{5, 4} };
+
+// Shape of coefSquareCUs (2 x 5 x 2 x 2 x 2):
+// preset (0: faster, 1: fast/medium) x CU size x non-split pred mode x split type x coefficient
+const double EncCu::coefSquareCUs[2][5][2][2][2] = {
+{{{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{-1.00000000, -1.00000000, }, { 0.06213828,  0.00611228, }, }, {{-1.00000000, -1.00000000, }, { 0.06943756,  0.00320762, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{-1.00000000, -1.00000000, }, { 0.10833051,  0.00053144, }, }, {{-1.00000000, -1.00000000, }, { 0.08304352,  0.00142876, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+},
+{{{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{ 0.06852235,  0.00388054, }, { 0.09236045,  0.00084528, }, }, {{ 0.06955832,  0.00289679, }, { 0.09598522,  0.00096187, }, }, },
+ {{{ 0.07268085,  0.00302796, }, { 0.09323753,  0.00050996, }, }, {{ 0.06123618,  0.00471601, }, { 0.09253389,  0.00046826, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+},
+};
+
 // ====================================================================================================================
 EncCu::EncCu()
   : m_CtxCache ( nullptr )
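
Each (a, b) pair above feeds the split-skip test in xCheckModeSplitInternal further down: the best non-split cost is discounted by factor = 1 + b·exp(a·QP), and the split is only searched if the predicted split cost undercuts the non-split cost. A sketch of one lookup and decision (preset id 1, a 32x32 luma CU whose best mode is inter, QT split; the cost and bit values are made up for illustration):

#include <cmath>
#include <cstdio>

int main()
{
  // coefSquareCUs[1][2][0][1]: preset 1, szInd = log2(32) - 3 = 2, inter best, QT split.
  const double a = 0.09236045, b = 0.00084528;
  const int    qp = 32;
  const double bestNsCost = 100000.0, splitBits = 2048.0;   // illustrative units

  const double factor        = 1.0 + b * std::exp( a * qp );
  const double predSplitCost = bestNsCost / factor + splitBits;
  std::printf( "factor=%.4f predicted=%.1f -> %s\n", factor, predSplitCost,
               predSplitCost >= bestNsCost ? "skip split" : "test split" );
}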
@@ -1006,52 +1026,80 @@ void EncCu::xCheckModeSplit(CodingStructure *&tempCS, CodingStructure *&bestCS,
 void EncCu::xCheckModeSplitInternal(CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner, const EncTestMode& encTestMode, const ModeType modeTypeParent, bool& skipInterPass )
 {
-  const int qp = encTestMode.qp;
-  const int oldPrevQp = tempCS->prevQP[partitioner.chType];
-  const auto oldMotionLut = tempCS->motionLut;
-  const ReshapeData& reshapeData = tempCS->picture->reshapeData;
-
-  const PartSplit split = getPartSplit( encTestMode );
-  const ModeType modeTypeChild = partitioner.modeType;
+  const int          qp           = encTestMode.qp;
+  const int          oldPrevQp    = tempCS->prevQP[partitioner.chType];
+  const auto         oldMotionLut = tempCS->motionLut;
+  const ReshapeData& reshapeData  = tempCS->picture->reshapeData;
+
+  const PartSplit split         = getPartSplit( encTestMode );
+  const ModeType  modeTypeChild = partitioner.modeType;
 
-  CHECK( split == CU_DONT_SPLIT, "No proper split provided!" );
+  CHECK( !( split == CU_QUAD_SPLIT || split == CU_HORZ_SPLIT || split == CU_VERT_SPLIT
+         || split == CU_TRIH_SPLIT || split == CU_TRIV_SPLIT ), "invalid split type" );
 
   tempCS->initStructData( qp );
 
-  m_CABACEstimator->getCtx() = m_CurrCtx->start;
+  m_CABACEstimator->getCtx()    = m_CurrCtx->start;
 
-  const uint16_t split_ctx_size = Ctx::SplitFlag.size() + Ctx::SplitQtFlag.size() + Ctx::SplitHvFlag.size() + Ctx::Split12Flag.size() + Ctx::ModeConsFlag.size();
-  const TempCtx ctxSplitFlags( m_CtxCache, SubCtx(CtxSet(Ctx::SplitFlag(), split_ctx_size), m_CABACEstimator->getCtx()));
+  const uint16_t split_ctx_size = Ctx::SplitFlag.size() + Ctx::SplitQtFlag.size() + Ctx::SplitHvFlag.size() + Ctx::Split12Flag.size() + Ctx::ModeConsFlag.size();
+  const TempCtx  ctxSplitFlags  ( m_CtxCache, SubCtx( CtxSet( Ctx::SplitFlag(), split_ctx_size ), m_CABACEstimator->getCtx() ) );
 
   m_CABACEstimator->resetBits();
 
-  m_CABACEstimator->split_cu_mode( split, *tempCS, partitioner );
-  partitioner.modeType = modeTypeParent;
+  m_CABACEstimator->split_cu_mode  ( split, *tempCS, partitioner );
+  partitioner.modeType             = modeTypeParent;
   m_CABACEstimator->mode_constraint( split, *tempCS, partitioner, modeTypeChild );
-  partitioner.modeType = modeTypeChild;
+  partitioner.modeType             = modeTypeChild;
 
-  const int64_t splitBits = m_CABACEstimator->getEstFracBits();
+  const int64_t splitBits          = m_CABACEstimator->getEstFracBits();
 
-  int numChild = 3;
-  if( split == CU_VERT_SPLIT || split == CU_HORZ_SPLIT ) numChild--;
-  else if( split == CU_QUAD_SPLIT ) numChild++;
+  const bool chromaNotSplit = modeTypeParent == MODE_TYPE_ALL && modeTypeChild == MODE_TYPE_INTRA;
+  const bool isChromaTooBig = isChromaEnabled( tempCS->pps->pcv->chrFormat ) && tempCS->area.Y().maxDim() > tempCS->sps->getMaxTbSize();
+  bool       skipSplitTest  = chromaNotSplit && isChromaTooBig;
 
-  int64_t approxBits = m_pcEncCfg->m_qtbttSpeedUp > 0 ? numChild << SCALE_BITS : 0;
+  if( !skipSplitTest )
+  {
+    double a = -1, b = -1;
+    const unsigned w = partitioner.currArea().lwidth();
+    const unsigned h = partitioner.currArea().lheight();
+    const bool contextCond = w == h && tempCS->slice->sliceType == VVENC_B_SLICE && isLuma( partitioner.chType ) && m_pcEncCfg->m_splitCostThrParamId >= 0 && m_pcEncCfg->m_splitCostThrParamId <= 1;
 
-  const double factor = ( tempCS->currQP[partitioner.chType] > 30 ? 1.1 : 1.075 )
-                      + ( m_pcEncCfg->m_qtbttSpeedUp > 0 ? 0.01 : 0.0 )
-                      + ( ( m_pcEncCfg->m_qtbttSpeedUp > 0 && isChroma( partitioner.chType ) ) ? 0.2 : 0.0 );
+    if( contextCond )
+    {
+      uint8_t nsPredInd = m_modeCtrl.comprCUCtx->bestNsPredMode.type == ETM_INTRA;
+      uint8_t szInd     = getLog2( w ) - 3;
+      uint8_t splitInd  = split == CU_QUAD_SPLIT ? 1 : 0;
+      a = coefSquareCUs[m_pcEncCfg->m_splitCostThrParamId][szInd][nsPredInd][splitInd][0];
+      b = coefSquareCUs[m_pcEncCfg->m_splitCostThrParamId][szInd][nsPredInd][splitInd][1];
+    }
 
-  const double cost = m_cRdCost.calcRdCost( uint64_t( splitBits + approxBits + ( ( bestCS->fracBits ) / factor ) ), Distortion( bestCS->dist / factor ) ) + bestCS->costDbOffset / factor;
-
-  const bool chromaNotSplit = modeTypeParent == MODE_TYPE_ALL && modeTypeChild == MODE_TYPE_INTRA ? true : false;
-  const bool isChromaTooBig = isChromaEnabled( tempCS->pps->pcv->chrFormat ) && std::max( tempCS->area.Y().width, tempCS->area.Y().height ) > tempCS->sps->getMaxTbSize();
+    if( a > -1 && b > -1 )
+    {
+      const double bestNsCost    = m_modeCtrl.comprCUCtx->bestCostBeforeSplit == MAX_DOUBLE ? -1 : m_modeCtrl.comprCUCtx->bestCostBeforeSplit;
+      const double factor        = 1.0 + b * exp( a * qp );
+      const double predSplitCost = bestNsCost / factor + splitBits;
+      skipSplitTest = bestNsCost >= 0 && predSplitCost >= bestNsCost;
+    }
+    else
+    {
+      int numChild = 3;
+      if( split == CU_VERT_SPLIT || split == CU_HORZ_SPLIT ) numChild--;
+      else if( split == CU_QUAD_SPLIT ) numChild++;
 
-  if( cost > bestCS->cost + bestCS->costDbOffset // speedup
-      || ( chromaNotSplit && isChromaTooBig ) // TODO: proper fix, for now inhibit chroma TU split that we cannot handle, resulting in missing chroma encoding!
-    )
+      int64_t approxBits = m_pcEncCfg->m_qtbttSpeedUp > 0 ? numChild << SCALE_BITS : 0;
+
+      const double factor = ( tempCS->currQP[partitioner.chType] > 30 ? 1.1 : 1.075 )
+                          + ( m_pcEncCfg->m_qtbttSpeedUp > 0 ? 0.01 : 0.0 )
+                          + ( ( m_pcEncCfg->m_qtbttSpeedUp > 0 && isChroma( partitioner.chType ) ) ? 0.2 : 0.0 );
+
+      const double baseCost = bestCS->cost + bestCS->costDbOffset;
+      const double predCost = baseCost / factor + splitBits + approxBits;
+      skipSplitTest = predCost >= baseCost;
+    }
+  }
+
+  if( skipSplitTest )
   {
     m_CABACEstimator->getCtx() = SubCtx( CtxSet( Ctx::SplitFlag(), split_ctx_size ), ctxSplitFlags );
-    // DTRACE( g_trace_ctx, D_TMP, "%d exit split %f %f %f\n", g_trace_ctx->getChannelCounter(D_TMP), cost, bestCS->cost, bestCS->costDbOffset );
     xCheckBestMode( tempCS, bestCS, partitioner, encTestMode );
     return;
   }
@@ -1069,22 +1117,19 @@ void EncCu::xCheckModeSplitInternal(CodingStructure *&tempCS, CodingStructure *&
     }
   }
 
-  CHECK(!(split == CU_QUAD_SPLIT || split == CU_HORZ_SPLIT || split == CU_VERT_SPLIT
-      || split == CU_TRIH_SPLIT || split == CU_TRIV_SPLIT), "invalid split type");
-
   partitioner.splitCurrArea( split, *tempCS );
   bool qgEnableChildren = partitioner.currQgEnable(); // QG possible at children level
 
   m_CurrCtx++;
 
   AffineMVInfo tmpMVInfo;
-  bool isAffMVInfoSaved = m_cInterSearch.m_AffineProfList->savePrevAffMVInfo(0, tmpMVInfo );
+  bool isAffMVInfoSaved = m_cInterSearch.m_AffineProfList->savePrevAffMVInfo( 0, tmpMVInfo );
 
   BlkUniMvInfo tmpUniMvInfo;
   bool isUniMvInfoSaved = false;
-  if (!tempCS->slice->isIntra())
+  if( !tempCS->slice->isIntra() )
   {
-    m_cInterSearch.m_BlkUniMvInfoBuffer->savePrevUniMvInfo(tempCS->area.Y(), tmpUniMvInfo, isUniMvInfoSaved);
+    m_cInterSearch.m_BlkUniMvInfoBuffer->savePrevUniMvInfo( tempCS->area.Y(), tmpUniMvInfo, isUniMvInfoSaved );
   }
 
   DeriveCtx deriveCtx = m_CABACEstimator->getDeriveCtx();
diff --git a/source/Lib/EncoderLib/EncCu.h b/source/Lib/EncoderLib/EncCu.h
index ca72dabf6..d0827c92c 100644
--- a/source/Lib/EncoderLib/EncCu.h
+++ b/source/Lib/EncoderLib/EncCu.h
@@ -304,6 +304,8 @@ class EncCu
                m_subPuMvOffset[MRG_MAX_NUM_CANDS];
   Distortion   m_uiSadBestForQPA;
 
+  static const double coefSquareCUs[2][5][2][2][2];
+
 public:
   EncCu();
   virtual ~EncCu();
diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp
index 2bf3c2faa..ad39e002f 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.cpp
+++ b/source/Lib/EncoderLib/EncModeCtrl.cpp
@@ -1099,10 +1099,8 @@ void EncModeCtrl::beforeSplit( Partitioner& partitioner )
   CodedCUInfo &relatedCU = getBlkInfo( partitioner.currArea() );
   const CodingUnit& bestCU = *cuECtx.bestCU;
 
-  if (m_pcEncCfg->m_fastTTSplit)
-  {
-    cuECtx.bestCostBeforeSplit = cuECtx.bestCS->cost;
-  }
+  cuECtx.bestNsPredMode      = cuECtx.bestMode;
+  cuECtx.bestCostBeforeSplit = cuECtx.bestCS->cost;
 
   setFromCs( *cuECtx.bestCS, cuECtx.bestMode, partitioner );
 
@@ -1158,14 +1156,6 @@ bool EncModeCtrl::useModeResult( const EncTestMode& encTestmode, CodingStructure
   {
     cuECtx.bestCostVertSplit = tempCS->cost;
   }
-  else if( encTestmode.type == ETM_SPLIT_TT_H )
-  {
-    cuECtx.bestCostTriHorzSplit = tempCS->cost;
-  }
-  else if( encTestmode.type == ETM_SPLIT_TT_V )
-  {
-    cuECtx.bestCostTriVertSplit = tempCS->cost;
-  }
   else if( !isModeSplit( encTestmode ) && isModeInter( encTestmode ) && tempCS->cus.size() == 1 )
   {
     cuECtx.nonSkipWasTested |= !tempCS->cus.front()->skip;
@@ -1240,12 +1230,6 @@ bool EncModeCtrl::useModeResult( const EncTestMode& encTestmode, CodingStructure
       cuECtx.bestTU   = cuECtx.bestCU->firstTU;
       cuECtx.bestMode = encTestmode;
 
-      if( isModeInter( encTestmode ) )
-      {
-        //Here we take the best cost of both inter modes. We are assuming only the inter modes (and all of them) have come before the intra modes!!!
-        cuECtx.bestInterCost = cuECtx.bestCS->cost;
-      }
-
       return true;
     }
     else
diff --git a/source/Lib/EncoderLib/EncModeCtrl.h b/source/Lib/EncoderLib/EncModeCtrl.h
index e0c8b110d..2cdc6247b 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.h
+++ b/source/Lib/EncoderLib/EncModeCtrl.h
@@ -174,13 +174,9 @@ struct ComprCUCtx
     , bestCU               ( nullptr )
     , bestTU               ( nullptr )
     , bestMode             ()
-    , bestInterCost        ( MAX_DOUBLE )
-    , bestCostBeforeSplit  ( MAX_DOUBLE )
+    , bestCostBeforeSplit  (MAX_DOUBLE)
     , bestCostVertSplit    (MAX_DOUBLE)
     , bestCostHorzSplit    (MAX_DOUBLE)
-    , bestCostTriVertSplit (MAX_DOUBLE)
-    , bestCostTriHorzSplit (MAX_DOUBLE)
-    , bestCostImv          (MAX_DOUBLE *.5)
     , bestCostNoImv        (MAX_DOUBLE *.5)
     , grad_horVal          (0)
     , grad_verVal          (0)
@@ -199,11 +195,11 @@ struct ComprCUCtx
     , doVerChromaSplit     (false)
     , doQtChromaSplit      (false)
     , isBestNoSplitSkip    (false)
-    , skipSecondMTSPass    (false)
     , intraWasTested       (false)
     , relatedCuIsValid     (false)
     , isIntra              (false)
     , nonSkipWasTested     (false)
+    , bestNsPredMode       (EncTestMode())
   {
   }
 
@@ -213,13 +209,9 @@ struct ComprCUCtx
   CodingUnit*    bestCU;
   TransformUnit* bestTU;
   EncTestMode    bestMode;
-  double         bestInterCost;
   double         bestCostBeforeSplit;
   double         bestCostVertSplit;
   double         bestCostHorzSplit;
-  double         bestCostTriVertSplit;
-  double         bestCostTriHorzSplit;
-  double         bestCostImv;
   double         bestCostNoImv;
   double         grad_horVal;
   double         grad_verVal;
@@ -239,11 +231,11 @@ struct ComprCUCtx
   bool           doVerChromaSplit;
   bool           doQtChromaSplit;
   bool           isBestNoSplitSkip;
-  bool           skipSecondMTSPass;
   bool           intraWasTested;
   bool           relatedCuIsValid;
   bool           isIntra;
   bool           nonSkipWasTested;
+  EncTestMode    bestNsPredMode;
 };
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/test/vvenc_unit_test/vvenc_unit_test.cpp b/test/vvenc_unit_test/vvenc_unit_test.cpp
index 0d9ce6de3..59ac93bc9 100644
--- a/test/vvenc_unit_test/vvenc_unit_test.cpp
+++ b/test/vvenc_unit_test/vvenc_unit_test.cpp
@@ -282,7 +282,7 @@ static bool test_TCoeffOps()
 
 int main()
 {
-  unsigned seed = time( NULL );
+  unsigned seed = ( unsigned ) time( NULL );
   srand( seed );
 
   bool passed = test_TCoeffOps();