diff --git a/AUTHORS.md b/AUTHORS.md
index 15830f961..0ace759b9 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -16,3 +16,5 @@
 * Hossein Pejman, , École de technologie supérieure (ÉTS)
 * Vignesh V Menon, , Fraunhofer HHI
 * George Steed, @georges-arm, Arm
+* Yiqun Liu, , Fraunhofer HHI
+* Mehrdad Ghafari, , Fraunhofer HHI
diff --git a/source/Lib/CommonLib/InterPrediction.cpp b/source/Lib/CommonLib/InterPrediction.cpp
index 0eaebcad2..8f8614105 100644
--- a/source/Lib/CommonLib/InterPrediction.cpp
+++ b/source/Lib/CommonLib/InterPrediction.cpp
@@ -681,6 +681,10 @@ void InterPredInterpolation::init()
   initInterPredictionX86();
 #endif
 
+#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_ARM )
+  initInterPredictionARM();
+#endif
+
   if (m_storedMv == nullptr)
   {
     const int MVBUFFER_SIZE = MAX_CU_SIZE / MIN_PU_SIZE;
diff --git a/source/Lib/CommonLib/InterPrediction.h b/source/Lib/CommonLib/InterPrediction.h
index 8b3686f0b..dabe99204 100644
--- a/source/Lib/CommonLib/InterPrediction.h
+++ b/source/Lib/CommonLib/InterPrediction.h
@@ -98,6 +98,12 @@ class InterPredInterpolation
   void _initInterPredictionX86();
 #endif
 
+#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_ARM )
+  void initInterPredictionARM();
+  template<ARM_VEXT vext>
+  void _initInterPredictionARM();
+#endif
+
 protected:
   void xWeightedAverage ( const CodingUnit& cu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const bool bdofApplied, PelUnitBuf *yuvPredTmp = NULL );
   void xPredAffineBlk ( const ComponentID compID, const CodingUnit& cu, const Picture* refPic, const Mv* _mv, PelUnitBuf& dstPic, const bool bi, const ClpRng& clpRng, const RefPicList refPicList = REF_PIC_LIST_X);
diff --git a/source/Lib/CommonLib/RdCost.h b/source/Lib/CommonLib/RdCost.h
index c4239181c..301b551be 100644
--- a/source/Lib/CommonLib/RdCost.h
+++ b/source/Lib/CommonLib/RdCost.h
@@ -275,6 +275,12 @@ class RdCost
   template<ARM_VEXT vext>
   static Distortion xGetHADs_ARMSIMD    ( const DistParam& pcDtParam );
+
+  template<ARM_VEXT vext>
+  static Distortion xGetSADwMask_ARMSIMD( const DistParam &rcDtParam );
+
+  template< int iWidth, ARM_VEXT vext >
+  static Distortion xGetSAD_NxN_ARMSIMD ( const DistParam &rcDtParam );
 #endif
 
   unsigned int getBitsMultiplePredsIBC(int x, int y, bool useIMV);
diff --git a/source/Lib/CommonLib/arm/InitARM.cpp b/source/Lib/CommonLib/arm/InitARM.cpp
index 362d9ff23..9d664c02d 100644
--- a/source/Lib/CommonLib/arm/InitARM.cpp
+++ b/source/Lib/CommonLib/arm/InitARM.cpp
@@ -121,6 +121,22 @@ void TCoeffOps::initTCoeffOpsARM()
 }
 #endif
 
+#if ENABLE_SIMD_OPT_BDOF
+void InterPredInterpolation::initInterPredictionARM()
+{
+  auto vext = read_arm_extension_flags();
+  switch( vext )
+  {
+  case NEON:
+    _initInterPredictionARM<NEON>();
+    break;
+  default:
+    break;
+  }
+}
+#endif
+
 #endif // TARGET_SIMD_ARM
 
 } // namespace
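
Note: the init path added here follows the existing x86 pattern in this codebase — a non-template entry point reads the supported extension set once at runtime and forwards to an explicitly instantiated template, so callers never name the vector extension. A minimal standalone sketch of the idiom (SimdExt, readExtensionFlags and Interp are illustrative stand-ins, not vvenc API):

#include <cstdio>

enum SimdExt { SCALAR, NEON };                        // stand-in for ARM_VEXT
static SimdExt readExtensionFlags() { return NEON; }  // stand-in for read_arm_extension_flags()

struct Interp
{
  void ( *kernel )( int ) = nullptr;

  template<SimdExt ext> void initTemplate();          // defined once per TU, instantiated explicitly
  void init()                                         // non-template runtime dispatcher
  {
    switch( readExtensionFlags() )
    {
    case NEON: initTemplate<NEON>(); break;
    default:   break;                                 // keep the scalar fallback
    }
  }
};

template<SimdExt ext> void Interp::initTemplate() { kernel = []( int x ){ std::printf( "%d\n", x ); }; }
template void Interp::initTemplate<NEON>();           // explicit instantiation, as InterPredARM.cpp does

int main() { Interp ip; ip.init(); ip.kernel( 42 ); }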
diff --git a/source/Lib/CommonLib/arm/InterPredARM.h b/source/Lib/CommonLib/arm/InterPredARM.h
new file mode 100644
index 000000000..543f9d2a1
--- /dev/null
+++ b/source/Lib/CommonLib/arm/InterPredARM.h
@@ -0,0 +1,292 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+/** \file     InterPredARM.h
+    \brief    SIMD for InterPrediction
+*/
+
+//! \ingroup CommonLib
+//! \{
+
+#include "CommonDefARM.h"
+#include "Rom.h"
+#include "InterPrediction.h"
+
+//! \ingroup CommonLib
+//! \{
+
+namespace vvenc {
+
+static inline int rightShiftMSB( int numer, int denom )
+{
+  int shiftIdx = bit_scan_reverse( denom );
+  return ( numer >> shiftIdx );
+}
+
+#ifdef TARGET_SIMD_ARM
+#if __ARM_ARCH >= 8
+
+template< ARM_VEXT vext >
+static inline void calcBIOSums_Neon( const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx, int &tmpy )
+{
+  const int srcStride = widthG + 2;
+  int16x8_t sumAbsGXTmp    = vdupq_n_s16(0);
+  int16x8_t sumDIXTmp      = vdupq_n_s16(0);
+  int16x8_t sumAbsGYTmp    = vdupq_n_s16(0);
+  int16x8_t sumDIYTmp      = vdupq_n_s16(0);
+  int16x8_t sumSignGyGxTmp = vdupq_n_s16(0);
+  int16x8_t x = {1, 1, 1, 1, 1, 1, 0, 0};
+
+  for (int y = 0; y < 3; y++)
+  {
+    int16x8_t shiftSrcY0Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY0Tmp)), 4);
+    int16x8_t shiftSrcY1Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY1Tmp)), 4);
+
+    int16x8_t loadGradX0 = vld1q_s16((int16_t*)(gradX0));
+    int16x8_t loadGradX1 = vld1q_s16((int16_t*)(gradX1));
+    int16x8_t loadGradY0 = vld1q_s16((int16_t*)(gradY0));
+    int16x8_t loadGradY1 = vld1q_s16((int16_t*)(gradY1));
+    int16x8_t subTemp1   = vsubq_s16(shiftSrcY1Tmp, shiftSrcY0Tmp);
+    int16x8_t packTempX  = vshrq_n_s16( vaddq_s16(loadGradX0, loadGradX1), 1 );
+    int16x8_t packTempY  = vshrq_n_s16( vaddq_s16(loadGradY0, loadGradY1), 1 );
+    int16x8_t gX         = vabsq_s16(packTempX);
+    int16x8_t gY         = vabsq_s16(packTempY);
+    int16x8_t dIX        = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempX, vdupq_n_s16(0)) - vcgeq_s16(packTempX, vdupq_n_s16(0))));
+    int16x8_t dIY        = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+    int16x8_t signGY_GX  = vmulq_s16(packTempX, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+
+    sumAbsGXTmp    = vaddq_s16(sumAbsGXTmp, gX);
+    sumAbsGYTmp    = vaddq_s16(sumAbsGYTmp, gY);
+    sumDIXTmp      = vaddq_s16(sumDIXTmp, dIX);
+    sumDIYTmp      = vaddq_s16(sumDIYTmp, dIY);
+    sumSignGyGxTmp = vaddq_s16(sumSignGyGxTmp, signGY_GX);
+
+    srcY0Tmp += srcStride;
+    srcY1Tmp += srcStride;
+    gradX0   += widthG;
+    gradX1   += widthG;
+    gradY0   += widthG;
+    gradY1   += widthG;
+
+    shiftSrcY0Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY0Tmp)), 4);
+    shiftSrcY1Tmp = vshrq_n_s16(vld1q_s16((int16_t*)(srcY1Tmp)), 4);
+
+    loadGradX0 = vld1q_s16((int16_t*)(gradX0));
+    loadGradX1 = vld1q_s16((int16_t*)(gradX1));
+    loadGradY0 = vld1q_s16((int16_t*)(gradY0));
+    loadGradY1 = vld1q_s16((int16_t*)(gradY1));
+    subTemp1   = vsubq_s16(shiftSrcY1Tmp, shiftSrcY0Tmp);
+    packTempX  = vshrq_n_s16( vaddq_s16(loadGradX0, loadGradX1), 1 );
+    packTempY  = vshrq_n_s16( vaddq_s16(loadGradY0, loadGradY1), 1 );
+
+    gX = vabsq_s16(packTempX);
+    gY = vabsq_s16(packTempY);
+
+    dIX       = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempX, vdupq_n_s16(0)) - vcgeq_s16(packTempX, vdupq_n_s16(0))));
+    dIY       = vmulq_s16(subTemp1, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+    signGY_GX = vmulq_s16(packTempX, vreinterpretq_s16_u16(vcleq_s16(packTempY, vdupq_n_s16(0)) - vcgeq_s16(packTempY, vdupq_n_s16(0))));
+
+    sumAbsGXTmp    = vaddq_s16(sumAbsGXTmp, gX);
+    sumAbsGYTmp    = vaddq_s16(sumAbsGYTmp, gY);
+    sumDIXTmp      = vaddq_s16(sumDIXTmp, dIX);
+    sumDIYTmp      = vaddq_s16(sumDIYTmp, dIY);
+    sumSignGyGxTmp = vaddq_s16(sumSignGyGxTmp, signGY_GX);
+
+    srcY0Tmp += srcStride;
+    srcY1Tmp += srcStride;
+    gradX0   += widthG;
+    gradX1   += widthG;
+    gradY0   += widthG;
+    gradY1   += widthG;
+  }
+
+  int sumAbsGX     = vaddvq_s16(vmulq_s16( sumAbsGXTmp,    x ));
+  int sumAbsGY     = vaddvq_s16(vmulq_s16( sumAbsGYTmp,    x ));
+  int sumDIX       = vaddvq_s16(vmulq_s16( sumDIXTmp,      x ));
+  int sumDIY       = vaddvq_s16(vmulq_s16( sumDIYTmp,      x ));
+  int sumSignGY_GX = vaddvq_s16(vmulq_s16( sumSignGyGxTmp, x ));
+
+  tmpx = sumAbsGX == 0 ? 0 : rightShiftMSB( sumDIX << 2, sumAbsGX );
+  tmpx = Clip3( -limit, limit, tmpx );
+
+  int mainsGxGy = sumSignGY_GX >> 12;
+  int secsGxGy  = sumSignGY_GX & ( ( 1 << 12 ) - 1 );
+  int tmpData   = tmpx * mainsGxGy;
+  tmpData = ( ( tmpData << 12 ) + tmpx * secsGxGy ) >> 1;
+  tmpy = sumAbsGY == 0 ? 0 : rightShiftMSB( ( ( sumDIY << 2 ) - tmpData ), sumAbsGY );
+  tmpy = Clip3( -limit, limit, tmpy );
+}
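
The repeated vcleq/vcgeq subtraction above is a branchless sign computation: Neon comparisons return all-ones lanes, which reinterpreted as int16 are -1, so (x <= 0) - (x >= 0) collapses to sign(x) per lane. The vector x = {1,1,1,1,1,1,0,0} then restricts the horizontal reduction to the 6-sample BDOF window. A scalar model with a quick self-check (my reading of the kernel, not vvenc code):

#include <cassert>
#include <cstdint>

// Scalar model of the Neon predicate trick in calcBIOSums_Neon: comparison
// results are all-ones masks (-1 as int16_t), so (x <= 0) - (x >= 0) yields
// sign(x) in {-1, 0, +1} without a branch.
static int16_t signFromMasks( int16_t x )
{
  int16_t le = ( x <= 0 ) ? -1 : 0;   // one lane of vcleq_s16
  int16_t ge = ( x >= 0 ) ? -1 : 0;   // one lane of vcgeq_s16
  return (int16_t)( le - ge );        // x > 0: 0 - (-1) = +1, etc.
}

int main()
{
  assert( signFromMasks(  7 ) ==  1 );
  assert( signFromMasks( -7 ) == -1 );
  assert( signFromMasks(  0 ) ==  0 );
}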
+
+template< ARM_VEXT vext >
+static inline void addBIOAvg4_Neon( const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng )
+{
+  const ptrdiff_t src0Stride = widthG + 2;
+  const ptrdiff_t src1Stride = widthG + 2;
+  const ptrdiff_t gradStride = widthG;
+  int32x4_t mm_offset = vdupq_n_s32( offset );
+  int16x4_t vibdimin  = vdup_n_s16( clpRng.min() );
+  int16x4_t vibdimax  = vdup_n_s16( clpRng.max() );
+
+  int16x4_t mm_a;
+  int16x4_t mm_b;
+  int32x4_t mm_sum;
+  int16x4_t mm_sum3;
+
+  for( int y = 0; y < 2; y++ )
+  {
+    mm_sum = vdupq_n_s32(0);
+
+    mm_a = vsub_s16( vld1_s16( (const int16_t *) gradX0 ), vld1_s16( (const int16_t *) gradX1 ) );
+    mm_b = vsub_s16( vld1_s16( (const int16_t *) gradY0 ), vld1_s16( (const int16_t *) gradY1 ) );
+
+    mm_sum  = vmlal_n_s16( mm_sum, mm_a, tmpx );
+    mm_sum  = vmlal_n_s16( mm_sum, mm_b, tmpy );
+    mm_sum  = vaddq_s32( vaddw_s16( mm_sum, vld1_s16( (const int16_t *) ( src0 ) ) ), vaddw_s16( mm_offset, vld1_s16( (const int16_t *) ( src1 ) ) ) );
+    mm_sum3 = vmin_s16( vibdimax, vmax_s16( vibdimin, vqmovn_s32( vshlq_s32( mm_sum, vdupq_n_s32( -1*shift ) ) ) ) );
+
+    vst1_s16( (int16_t *)dst, mm_sum3 );
+
+    dst    += dstStride;
+    src0   += src0Stride;
+    src1   += src1Stride;
+    gradX0 += gradStride;
+    gradX1 += gradStride;
+    gradY0 += gradStride;
+    gradY1 += gradStride;
+
+    mm_sum = vdupq_n_s32(0);
+
+    mm_a = vsub_s16( vld1_s16( (const int16_t *) gradX0 ), vld1_s16( (const int16_t *) gradX1 ) );
+    mm_b = vsub_s16( vld1_s16( (const int16_t *) gradY0 ), vld1_s16( (const int16_t *) gradY1 ) );
+
+    mm_sum  = vmlal_n_s16( mm_sum, mm_a, tmpx );
+    mm_sum  = vmlal_n_s16( mm_sum, mm_b, tmpy );
+    mm_sum  = vaddq_s32( vaddw_s16( mm_sum, vld1_s16( (const int16_t *) ( src0 ) ) ), vaddw_s16( mm_offset, vld1_s16( (const int16_t *) ( src1 ) ) ) );
+    mm_sum3 = vmin_s16( vibdimax, vmax_s16( vibdimin, vqmovn_s32( vshlq_s32( mm_sum, vdupq_n_s32( -1*shift ) ) ) ) );
+
+    vst1_s16( (int16_t *)dst, mm_sum3 );
+
+    dst    += dstStride;
+    src0   += src0Stride;
+    src1   += src1Stride;
+    gradX0 += gradStride;
+    gradX1 += gradStride;
+    gradY0 += gradStride;
+    gradY1 += gradStride;
+  }
+}
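
Per output lane, the vector arithmetic above is the standard BDOF correction applied on top of the bi-prediction average. A scalar reference of one pixel, usable as a test oracle (a sketch of my reading of the vector code; the vqmovn saturation step is subsumed by the final clamp since the clipping range lies inside int16):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of one output pixel of addBIOAvg4_Neon: gradient differences
// weighted by the per-4x4 refinement (tmpx, tmpy), plus both predictions and
// the rounding offset, then arithmetic right shift and clipping.
static int16_t bioPixel( int16_t s0, int16_t s1, int16_t gx0, int16_t gx1, int16_t gy0, int16_t gy1,
                         int tmpx, int tmpy, int shift, int offset, int clpMin, int clpMax )
{
  int32_t sum = ( gx0 - gx1 ) * tmpx + ( gy0 - gy1 ) * tmpy;
  sum += s0 + s1 + offset;
  return (int16_t)std::clamp( sum >> shift, clpMin, clpMax );
}

int main()
{
  // ((4-2)*2 + (3-1)*1 + 100 + 110 + 1) >> 1 = 217 >> 1 = 108
  assert( bioPixel( 100, 110, 4, 2, 3, 1, 2, 1, 1, 1, 0, 1023 ) == 108 );
}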
+
+template< ARM_VEXT vext >
+void BiOptFlowCoreARMSIMD( const Pel* srcY0,
+                           const Pel* srcY1,
+                           const Pel* gradX0,
+                           const Pel* gradX1,
+                           const Pel* gradY0,
+                           const Pel* gradY1,
+                           const int  width,
+                           const int  height,
+                           Pel*       dstY,
+                           const ptrdiff_t dstStride,
+                           const int  shiftNum,
+                           const int  offset,
+                           const int  limit,
+                           const ClpRng& clpRng,
+                           const int  bitDepth )
+{
+  const int widthG       = width + 2 * BDOF_EXTEND_SIZE;
+  const int stridePredMC = widthG + 2;
+  int       offsetPos    = widthG * BDOF_EXTEND_SIZE + BDOF_EXTEND_SIZE;
+  const int xUnit        = ( width  >> 2 );
+  const int yUnit        = ( height >> 2 );
+
+  const Pel* srcY0Temp;
+  const Pel* srcY1Temp;
+  Pel*       dstY0;
+
+  int OffPos;
+  int OffPad = 0;
+
+  for( int yu = 0; yu < yUnit; yu++, srcY0 += ( stridePredMC << 2 ), srcY1 += ( stridePredMC << 2 ), dstY += ( dstStride << 2 ), offsetPos += ( widthG << 2 ) )
+  {
+    srcY0Temp = srcY0;
+    srcY1Temp = srcY1;
+    dstY0     = dstY;
+
+    OffPos = offsetPos;
+    OffPad = ( ( yu * widthG ) << 2 );
+    for( int xu = 0; xu < xUnit; xu++, srcY0Temp += 4, srcY1Temp += 4, dstY0 += 4, OffPos += 4, OffPad += 4 )
+    {
+      int tmpx, tmpy;
+
+      calcBIOSums_Neon<vext>( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, widthG, bitDepth, limit, tmpx, tmpy );
+
+      addBIOAvg4_Neon<vext> ( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx, tmpy, shiftNum, offset, clpRng );
+    }
+  }
+}
+
+template< ARM_VEXT vext >
+void InterPredInterpolation::_initInterPredictionARM()
+{
+  xFpBiDirOptFlow = BiOptFlowCoreARMSIMD<vext>;
+}
+
+#else
+
+template< ARM_VEXT vext >
+void InterPredInterpolation::_initInterPredictionARM()
+{}
+#endif
+
+template void InterPredInterpolation::_initInterPredictionARM<NEON>();
+
+#endif // TARGET_SIMD_ARM
+} // namespace vvenc
+
+//! \}
+
+//! \}
diff --git a/source/Lib/CommonLib/arm/RdCostARM.h b/source/Lib/CommonLib/arm/RdCostARM.h
index c3e7dc1c7..c45ea3506 100644
--- a/source/Lib/CommonLib/arm/RdCostARM.h
+++ b/source/Lib/CommonLib/arm/RdCostARM.h
@@ -69,6 +69,17 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc
 {
+
+// Widening 16-bit multiply with pairwise 32-bit accumulation, the Neon
+// counterpart of SSE _mm_madd_epi16.
+static int32x4_t neon_madd_16( int16x8_t a, int16x8_t b )
+{
+  int32x4_t c = vmull_s16( vget_low_s16( a ), vget_low_s16( b ) );
+  int32x4_t d = vmull_high_s16( a, b );
+  return vpaddq_s32( c, d );
+}
+
 #if defined( TARGET_SIMD_ARM )
 
 // The xGetHADs_ARMSIMD functions depend on the SIMDe kernels being enabled
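
neon_madd_16 mirrors the contract that the SIMDe-translated x86 kernels in this file rely on: multiply eight int16 lanes to 32 bit and add adjacent product pairs. A scalar statement of that contract with a self-check (illustrative reference, not vvenc code):

#include <cassert>
#include <cstdint>

// Scalar contract of neon_madd_16 / _mm_madd_epi16: out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
static void madd16( const int16_t a[8], const int16_t b[8], int32_t out[4] )
{
  for( int i = 0; i < 4; i++ )
    out[i] = (int32_t)a[2 * i] * b[2 * i] + (int32_t)a[2 * i + 1] * b[2 * i + 1];
}

int main()
{
  const int16_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  const int16_t b[8] = { 1, 1, 1, 1, 2, 2, 2, 2 };
  int32_t r[4];
  madd16( a, b, r );
  assert( r[0] == 3 && r[1] == 7 && r[2] == 22 && r[3] == 30 );
}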
@@ -997,6 +1008,159 @@ void RdCost::xGetSADX5_16xN_SIMD(const DistParam& rcDtParam, Distortion* cost, b
   xGetSADX5_16xN_SIMDImp( rcDtParam, cost );
 }
 
+template< int iWidth, ARM_VEXT vext >
+Distortion RdCost::xGetSAD_NxN_ARMSIMD( const DistParam &rcDtParam )
+{
+  const short* pSrc1       = (const short*)rcDtParam.org.buf;
+  const short* pSrc2       = (const short*)rcDtParam.cur.buf;
+  int          iRows       = rcDtParam.org.height;
+  int          iSubShift   = rcDtParam.subShift;
+  int          iSubStep    = ( 1 << iSubShift );
+  const int    iStrideSrc1 = rcDtParam.org.stride * iSubStep;
+  const int    iStrideSrc2 = rcDtParam.cur.stride * iSubStep;
+
+  uint32_t  uiSum    = 0;
+  int16x8_t vzero_16 = vdupq_n_s16(0);
+
+  if( iWidth == 4 )
+  {
+    if( iRows == 4 && iSubShift == 0 )
+    {
+      int16x8_t vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc1 ), vld1_s16( ( const int16_t* )( &pSrc1[iStrideSrc1] ) ) );
+      int16x8_t vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )pSrc2 ), vld1_s16( ( const int16_t* )( &pSrc2[iStrideSrc2] ) ) );
+      int32x4_t vsum  = vmovl_s16( vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) );
+      vsrc1 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc1[2 * iStrideSrc1] ) ), vld1_s16( ( const int16_t* )( &pSrc1[3 * iStrideSrc1] ) ) );
+      vsrc2 = vcombine_s16( vld1_s16( ( const int16_t* )( &pSrc2[2 * iStrideSrc2] ) ), vld1_s16( ( const int16_t* )( &pSrc2[3 * iStrideSrc2] ) ) );
+      vsum  = vaddq_s32( vsum, vmovl_s16( vget_low_s16( vpaddq_s16( vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ), vzero_16 ) ) ) );
+      uiSum = vaddvq_s32(vsum);
+    }
+    else
+    {
+      int32x4_t vsum32 = vdupq_n_s32(0);
+      for( int iY = 0; iY < iRows; iY += iSubStep )
+      {
+        int32x4_t vsrc1 = vmovl_s16( vld1_s16( ( const int16_t* )pSrc1 ) );
+        int32x4_t vsrc2 = vmovl_s16( vld1_s16( ( const int16_t* )pSrc2 ) );
+        vsum32 = vaddq_s32( vsum32, vabsq_s32( vsubq_s32( vsrc1, vsrc2 ) ) );
+
+        pSrc1 += iStrideSrc1;
+        pSrc2 += iStrideSrc2;
+      }
+      uiSum = vaddvq_s32(vsum32);
+    }
+  }
+  else
+  {
+    static constexpr bool earlyExitAllowed = iWidth >= 64;
+    int32x4_t vsum32 = vdupq_n_s32( 0 );
+    int checkExit = 3;
+
+    for( int iY = 0; iY < iRows; iY += iSubStep )
+    {
+      int16x8_t vsrc1  = vld1q_s16( ( const int16_t* )( pSrc1 ) );
+      int16x8_t vsrc2  = vld1q_s16( ( const int16_t* )( pSrc2 ) );
+      int16x8_t vsum16 = vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) );
+
+      if( iWidth >= 16 )
+      {
+        vsrc1  = vld1q_s16( ( const int16_t* )( &pSrc1[8] ) );
+        vsrc2  = vld1q_s16( ( const int16_t* )( &pSrc2[8] ) );
+        vsum16 = vaddq_s16( vsum16, vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ) );
+
+        for( int iX = 16; iX < iWidth; iX += 16 )
+        {
+          vsrc1  = vld1q_s16( ( const int16_t* )( &pSrc1[iX] ) );
+          vsrc2  = vld1q_s16( ( const int16_t* )( &pSrc2[iX] ) );
+          vsum16 = vaddq_s16( vsum16, vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ) );
+
+          vsrc1  = vld1q_s16( ( const int16_t* )( &pSrc1[iX + 8] ) );
+          vsrc2  = vld1q_s16( ( const int16_t* )( &pSrc2[iX + 8] ) );
+          vsum16 = vaddq_s16( vsum16, vabsq_s16( vsubq_s16( vsrc1, vsrc2 ) ) );
+        }
+      }
+
+      int32x4_t vsumtemp = vpaddlq_s16( vsum16 );
+
+      if( earlyExitAllowed ) vsum32 = vpaddq_s32( vsum32, vsumtemp );
+      else                   vsum32 = vaddq_s32 ( vsum32, vsumtemp );
+
+      pSrc1 += iStrideSrc1;
+      pSrc2 += iStrideSrc2;
+
+      if( earlyExitAllowed && checkExit == 0 )
+      {
+        Distortion distTemp = vgetq_lane_s32(vsum32, 0);
+        distTemp <<= iSubShift;
+        distTemp >>= DISTORTION_PRECISION_ADJUSTMENT( rcDtParam.bitDepth );
+        if( distTemp > rcDtParam.maximumDistortionForEarlyExit ) return distTemp;
+        checkExit = 3;
+      }
+      else if( earlyExitAllowed )
+      {
+        checkExit--;
+      }
+    }
+    uiSum = vaddvq_s32(vsum32);
+  }
+
+  uiSum <<= iSubShift;
+  return uiSum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
+}
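
A plain scalar SAD is the natural oracle for unit-testing this kernel. A sketch under the same conventions (row subsampling by subShift compensated by a final left shift, then the bit-depth distortion scaling; the early-exit path is deliberately not modeled):

#include <cstdint>
#include <cstdlib>

// Scalar reference for xGetSAD_NxN_ARMSIMD (testing sketch, not vvenc API).
static uint64_t refSAD( const int16_t* org, ptrdiff_t orgStride, const int16_t* cur, ptrdiff_t curStride,
                        int width, int height, int subShift, int distShift )
{
  uint64_t sum = 0;
  for( int y = 0; y < height; y += ( 1 << subShift ) )
    for( int x = 0; x < width; x++ )
      sum += std::abs( org[y * orgStride + x] - cur[y * curStride + x] );
  return ( sum << subShift ) >> distShift;
}

int main()
{
  const int16_t org[4] = { 10, 20, 30, 40 };
  const int16_t cur[4] = { 11, 18, 30, 44 };
  return refSAD( org, 2, cur, 2, 2, 2, 0, 0 ) == 7 ? 0 : 1;
}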
+
+template< ARM_VEXT vext >
+Distortion RdCost::xGetSADwMask_ARMSIMD( const DistParam &rcDtParam )
+{
+  if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
+    return RdCost::xGetSADwMask( rcDtParam );
+
+  const short *src1       = (const short *) rcDtParam.org.buf;
+  const short *src2       = (const short *) rcDtParam.cur.buf;
+  const short *weightMask = (const short *) rcDtParam.mask;
+  int  rows               = rcDtParam.org.height;
+  int  cols               = rcDtParam.org.width;
+  int  subShift           = rcDtParam.subShift;
+  int  subStep            = (1 << subShift);
+  const int strideSrc1    = rcDtParam.org.stride * subStep;
+  const int strideSrc2    = rcDtParam.cur.stride * subStep;
+  const int strideMask    = rcDtParam.maskStride * subStep;
+
+  Distortion sum = 0;
+
+  int32x4_t vsum32 = vdupq_n_s32( 0 );
+  static const uint8_t shuffle_table[16] = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+  uint8x16_t shuffle_vector = vld1q_u8(shuffle_table);
+
+  for (int y = 0; y < rows; y += subStep)
+  {
+    for (int x = 0; x < cols; x += 8)
+    {
+      int16x8_t vsrc1 = vld1q_s16( ( const int16_t* )(&src1[x] ) );
+      int16x8_t vsrc2 = vld1q_s16( ( const int16_t* )(&src2[x] ) );
+      int16x8_t vmask;
+      if (rcDtParam.stepX == -1)
+      {
+        vmask = vld1q_s16( ( const int16_t* ) ((&weightMask[x]) - (x << 1) - (8 - 1)));
+        uint8x16_t input_vector    = vreinterpretq_u8_s16(vmask);
+        uint8x16_t shuffled_vector = vqtbl1q_u8(input_vector, shuffle_vector);
+        vmask = vreinterpretq_s16_u8(shuffled_vector);
+      }
+      else
+      {
+        vmask = vld1q_s16( ( const int16_t* ) (&weightMask[x]));
+      }
+      vsum32 = vaddq_s32(vsum32, neon_madd_16(vmask, vabsq_s16(vsubq_s16(vsrc1, vsrc2))));
+    }
+    src1 += strideSrc1;
+    src2 += strideSrc2;
+    weightMask += strideMask;
+  }
+  sum = vaddvq_s32(vsum32);
+  sum <<= subShift;
+  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
+}
+
+
 template<ARM_VEXT vext>
 void RdCost::_initRdCostARM()
 {
@@ -1022,6 +1186,16 @@ void RdCost::_initRdCostARM()
   m_afpDistortFunc[0][DF_HAD32_fast]  = RdCost::xGetHADs_ARMSIMD<vext>;
   m_afpDistortFunc[0][DF_HAD64_fast]  = RdCost::xGetHADs_ARMSIMD<vext>;
   m_afpDistortFunc[0][DF_HAD128_fast] = RdCost::xGetHADs_ARMSIMD<vext>;
+
+  m_afpDistortFunc[0][DF_SAD4  ] = xGetSAD_NxN_ARMSIMD<4,   vext>;
+  m_afpDistortFunc[0][DF_SAD8  ] = xGetSAD_NxN_ARMSIMD<8,   vext>;
+  m_afpDistortFunc[0][DF_SAD16 ] = xGetSAD_NxN_ARMSIMD<16,  vext>;
+  m_afpDistortFunc[0][DF_SAD32 ] = xGetSAD_NxN_ARMSIMD<32,  vext>;
+  m_afpDistortFunc[0][DF_SAD64 ] = xGetSAD_NxN_ARMSIMD<64,  vext>;
+  m_afpDistortFunc[0][DF_SAD128] = xGetSAD_NxN_ARMSIMD<128, vext>;
+
+  m_afpDistortFunc[0][DF_SAD_WITH_MASK] = xGetSADwMask_ARMSIMD<vext>;
+
 #endif // defined( TARGET_SIMD_X86 )
 }
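
For geometric-partitioning SAD with stepX == -1, the mask is traversed right-to-left; the byte table above swaps whole 16-bit lanes in one vqtbl1q_u8, which amounts to reversing the eight mask weights (bytes keep little-endian order inside each lane). A scalar model of that reversal (my reading of the shuffle, for verification only):

#include <cassert>
#include <cstdint>

// Scalar model of the stepX == -1 path in xGetSADwMask_ARMSIMD: the table
// {14,15, 12,13, ..., 0,1} reverses the order of the 8 int16 mask weights.
static void reverseLanes( const int16_t in[8], int16_t out[8] )
{
  for( int i = 0; i < 8; i++ )
    out[i] = in[7 - i];
}

int main()
{
  const int16_t m[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int16_t r[8];
  reverseLanes( m, r );
  assert( r[0] == 7 && r[7] == 0 );
}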
diff --git a/source/Lib/CommonLib/arm/neon/InterPredARM.cpp b/source/Lib/CommonLib/arm/neon/InterPredARM.cpp
new file mode 100644
index 000000000..a469dcb58
--- /dev/null
+++ b/source/Lib/CommonLib/arm/neon/InterPredARM.cpp
@@ -0,0 +1,43 @@
+/* -----------------------------------------------------------------------------
+The copyright in this software is being made available under the Clear BSD
+License, included below. No patent rights, trademark rights and/or
+other Intellectual Property Rights other than the copyrights concerning
+the Software are granted under this license.
+
+The Clear BSD License
+
+Copyright (c) 2019-2024, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V. & The VVenC Authors.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted (subject to the limitations in the disclaimer below) provided that
+the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from this
+   software without specific prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
+THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+
+------------------------------------------------------------------------------------------- */
+
+#include "../InterPredARM.h"
\ No newline at end of file
diff --git a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
index 3d7af09f5..d23ab150a 100644
--- a/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
+++ b/source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp
@@ -56,6 +56,18 @@ POSSIBILITY OF SUCH DAMAGE.
 //! \ingroup CommonLib
 //! \{
 
+#if SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_AVX2
+# define USE_AVX2
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE42
+# define USE_SSE42
+#elif SIMD_EVERYWHERE_EXTENSION_LEVEL_ID==X86_SIMD_SSE41
+# define USE_SSE41
+#endif
+
+#ifdef TARGET_SIMD_X86
+# include "../x86/InterpolationFilterX86.h"
+#endif
+
 #if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF
 
 namespace vvenc
@@ -497,6 +509,316 @@ static void simdFilter16xX_N8_neon( const ClpRng& clpRng, Pel const* src, int sr
   } while( --height != 0 );
 }
 
+template< int N, bool shiftBack >
+static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
+{
+  int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
+  int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
+  int32x4_t vsuma, vsumb;
+  int16x8_t vsum, vsrc0, vsrc1;
+
+  for( int row = 0; row < height; row++ )
+  {
+    for( int col = 0; col < width; col += 8 )
+    {
+      vsuma = vdupq_n_s32(offset);
+      vsumb = vdupq_n_s32(offset);
+
+      vsrc0 = vld1q_s16( ( const int16_t * )&src[col] );
+      vsrc1 = vld1q_s16( ( const int16_t * )&src[col + 4] );
+
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[0]));
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[1]));
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[2]));
+      vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[3]));
+
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[0]));
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[1]));
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[2]));
+      vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[3]));
+
+      if( N == 8 )
+      {
+        vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 6), vdupq_n_s16(coeff[6]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 7), vdupq_n_s16(coeff[7]));
+
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 6), vdupq_n_s16(coeff[6]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 7), vdupq_n_s16(coeff[7]));
+      }
+      if( N == 6 )
+      {
+        vsrc0 = vld1q_s16( ( const int16_t * )&src[col + 8] );
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 4), vdupq_n_s16(coeff[4]));
+        vsuma = vmlal_high_s16(vsuma, vextq_s16(vsrc1, vsrc1, 5), vdupq_n_s16(coeff[5]));
+
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 4), vdupq_n_s16(coeff[4]));
+        vsumb = vmlal_high_s16(vsumb, vextq_s16(vsrc0, vsrc0, 5), vdupq_n_s16(coeff[5]));
+      }
+
+      vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
+      vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
+      vsum  = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));
+
+      if( shiftBack )
+      {
+        vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
+      }
+      vst1q_s16((int16_t*) &dst[col], vsum);
+    }
+    src += srcStride;
+    dst += dstStride;
+  }
+}
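
The horizontal kernel keeps all multiplies in the upper vector halves: vextq_s16(v, v, 4 + k) rotates the register so that source lanes k..k+3 land in the high half, which vmlal_high_s16 then multiplies by tap coeff[k]. The net effect per accumulator lane is an ordinary FIR window. A scalar restatement of what one 4-lane accumulator gathers (illustrative, with a self-check):

#include <cassert>
#include <cstdint>

// Scalar model of the vext/vmlal_high tap alignment in simdInterpolateHorM8_Neon:
// accumulator lane i receives offset + sum_k src[i + k] * coeff[k].
static void fir4( const int16_t* src, const int16_t* coeff, int nTaps, int32_t out[4], int32_t offset )
{
  for( int i = 0; i < 4; i++ )
  {
    out[i] = offset;
    for( int k = 0; k < nTaps; k++ )
      out[i] += (int32_t)src[i + k] * coeff[k];
  }
}

int main()
{
  const int16_t src[7]   = { 1, 2, 3, 4, 5, 6, 7 };
  const int16_t coeff[4] = { 1, 1, 1, 1 };
  int32_t r[4];
  fir4( src, coeff, 4, r, 0 );
  assert( r[0] == 10 && r[3] == 22 );
}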
+
+template< int N, bool shiftBack >
+static void simdInterpolateVerM8_Neon( const int16_t *src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
+{
+  const Pel* srcOrig = src;
+  int16_t*   dstOrig = dst;
+
+  int16x8_t vsrc[N+1];
+  int32x4_t voffset  = vdupq_n_s32( offset );
+  int16x8_t vibdimin = vdupq_n_s16( clpRng.min() );
+  int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
+  int32x4_t vsuma, vsumb;
+  int16x8_t vsum;
+  vsrc[N] = vdupq_n_s16(0);
+
+  for( int col = 0; col < width; col += 8 )
+  {
+    for( int i = 0; i < N - 1; i++ )
+    {
+      vsrc[i] = vld1q_s16( ( int16_t const * )&src[col + i * srcStride] );
+    }
+
+    for( int row = 0; row < height; row++ )
+    {
+      vsrc[N - 1] = vld1q_s16( ( int16_t const * )&src[col + ( N - 1 ) * srcStride] );
+      vsuma = vsumb = voffset;
+      if( N == 2 )
+      {
+        vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[0]), vdup_n_s16(coeff[0]));
+        vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[1]), vdup_n_s16(coeff[1]));
+        vsumb = vmlal_high_s16(vsumb, vsrc[0], vdupq_n_s16(coeff[0]));
+        vsumb = vmlal_high_s16(vsumb, vsrc[1], vdupq_n_s16(coeff[1]));
+
+        vsrc[0] = vsrc[1];
+      }
+      else
+      {
+        for( int i = 0; i < N; i += 2 )
+        {
+          vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[i + 0]), vdup_n_s16(coeff[i + 0]));
+          vsuma = vmlal_s16     (vsuma, vget_low_s16(vsrc[i + 1]), vdup_n_s16(coeff[i + 1]));
+          vsumb = vmlal_high_s16(vsumb, vsrc[i + 0], vdupq_n_s16(coeff[i + 0]));
+          vsumb = vmlal_high_s16(vsumb, vsrc[i + 1], vdupq_n_s16(coeff[i + 1]));
+          vsrc[i    ] = vsrc[i + 1];
+          vsrc[i + 1] = vsrc[i + 2];
+        }
+      }
+      vsuma = vshlq_s32( vsuma, vdupq_n_s32(-1*shift) );
+      vsumb = vshlq_s32( vsumb, vdupq_n_s32(-1*shift) );
+      vsum  = vcombine_s16(vqmovn_s32(vsuma), vqmovn_s32(vsumb));
+      if( shiftBack )
+      {
+        vsum = vminq_s16( vibdimax, vmaxq_s16( vibdimin, vsum ) );
+      }
+      vst1q_s16((int16_t*) &dst[col], vsum);
+      src += srcStride;
+      dst += dstStride;
+    }
+    src = srcOrig;
+    dst = dstOrig;
+  }
+}
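
The vertical kernel holds the last N source rows in vsrc[] and rotates the window down by one row per output line, so each source row is loaded from memory only once per 8-wide column. A scalar statement of the access pattern it implements (illustrative reference; shift, offset and clipping left out):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar model of the row window in simdInterpolateVerM8_Neon for one column:
// dst[row] = sum_k src[row + k] * coeff[k].
static void verFirColumn( const int16_t* src, ptrdiff_t stride, int16_t* dst,
                          int height, const int16_t* coeff, int nTaps )
{
  for( int row = 0; row < height; row++ )
  {
    int32_t sum = 0;
    for( int k = 0; k < nTaps; k++ )
      sum += (int32_t)src[( row + k ) * stride] * coeff[k];
    dst[row] = (int16_t)sum;
  }
}

int main()
{
  const int16_t src[5]   = { 1, 2, 3, 4, 5 };
  const int16_t coeff[2] = { 1, 1 };
  int16_t dst[4];
  verFirColumn( src, 1, dst, 4, coeff, 2 );
  assert( dst[0] == 3 && dst[3] == 9 );
}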
+
+template< int N, bool isVertical, bool isFirst, bool isLast >
+static void simdFilterARM( const ClpRng& clpRng, Pel const *src, int srcStride, Pel* dst, int dstStride, int width, int height, TFilterCoeff const *coeff )
+{
+  int row, col;
+
+  Pel c[8];
+  c[0] = coeff[0];
+  c[1] = coeff[1];
+  if( N >= 4 )
+  {
+    c[2] = coeff[2];
+    c[3] = coeff[3];
+  }
+  if( N >= 6 )
+  {
+    c[4] = coeff[4];
+    c[5] = coeff[5];
+  }
+  if( N == 8 )
+  {
+    c[6] = coeff[6];
+    c[7] = coeff[7];
+  }
+
+  int cStride = ( isVertical ) ? srcStride : 1;
+  src -= ( N/2 - 1 ) * cStride;
+
+  int offset;
+  int headRoom = std::max( 2, ( IF_INTERNAL_PREC - clpRng.bd ) );
+  int shift    = IF_FILTER_PREC;
+  CHECK( shift < 0, "Negative shift" );
+
+  if( N != 2 )
+  {
+    if( isLast )
+    {
+      shift  += ( isFirst ) ? 0 : headRoom;
+      offset  = 1 << ( shift - 1 );
+      offset += ( isFirst ) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    }
+    else
+    {
+      shift -= ( isFirst ) ? headRoom : 0;
+      offset = ( isFirst ) ? -IF_INTERNAL_OFFS * (1<< shift) : 0;
+    }
+  }
+  else
+  {
+    if( isFirst )
+    {
+      shift  = IF_FILTER_PREC_BILINEAR - (IF_INTERNAL_PREC_BILINEAR - clpRng.bd);
+      offset = 1 << (shift - 1);
+    }
+    else
+    {
+      shift  = 4;
+      offset = 1 << (shift - 1);
+    }
+  }
+
+  CHECKD( clpRng.bd > 10, "VVenC does not support bitdepths larger than 10!" );
+
+  if( N == 6 )
+  {
+    c[6] = coeff[6];
+    c[7] = coeff[7];
+    int src8tOff = cStride;
+
+    if( !( width & 7 ) )
+    {
+      if( !isVertical )
+      {
+        simdInterpolateHorM8_Neon<6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
+      }
+      else
+      {
+        simdInterpolateVerM8_Neon<6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
+      }
+    }
+    else if( !( width & 3 ) )
+    {
+      if( !isVertical )
+      {
+        simdInterpolateHorM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+      }
+      else
+        simdInterpolateVerM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, 6, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c + 1 );
+    }
+    else if( width == 1 && !isVertical )
+    {
+      simdInterpolateHorM1<SIMD_EVERYWHERE_EXTENSION_LEVEL, 8, isLast>( src - src8tOff, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    }
+    else if( width == 1 && isVertical )
+    {
+      c[0] = c[1]; c[1] = c[2]; c[2] = c[3]; c[3] = c[4]; c[4] = c[5]; c[5] = coeff[6];
+      goto scalar_if;
+    }
+
+    return;
+  }
+
+  if( !isVertical && N != 2 )
+  {
+    if( ( width & 7 ) == 0 )
+    {
+      simdInterpolateHorM8_Neon<N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    }
+    else if( ( width & 3 ) == 0 )
+      simdInterpolateHorM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else if( ( width & 1 ) == 0 )
+      simdInterpolateHorM2<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else
+      simdInterpolateHorM1<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    return;
+  }
+  else if( N != 2 )
+  {
+    if( ( width & 7 ) == 0 )
+    {
+      simdInterpolateVerM8_Neon<N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    }
+    else if( ( width & 3 ) == 0 )
+      simdInterpolateVerM4<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else if( ( width & 1 ) == 0 )
+      simdInterpolateVerM2<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    else
+      simdInterpolateVerM1<SIMD_EVERYWHERE_EXTENSION_LEVEL, N, isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
+    return;
+  }
+  else
+  {
+    THROW( "To be implemented" );
+    return;
+  }
+
+scalar_if:
+  for( row = 0; row < height; row++ )
+  {
+    for( col = 0; col < width; col++ )
+    {
+      int sum;
+
+      sum  = src[col + 0 * cStride] * c[0];
+      sum += src[col + 1 * cStride] * c[1];
+      if( N >= 4 )
+      {
+        sum += src[col + 2 * cStride] * c[2];
+        sum += src[col + 3 * cStride] * c[3];
+      }
+      if( N >= 6 )
+      {
+        sum += src[col + 4 * cStride] * c[4];
+        sum += src[col + 5 * cStride] * c[5];
+      }
+      if( N == 8 )
+      {
+        sum += src[col + 6 * cStride] * c[6];
+        sum += src[col + 7 * cStride] * c[7];
+      }
+
+      Pel val = ( sum + offset ) >> shift;
+      if( isLast )
+      {
+        val = ClipPel( val, clpRng );
+      }
+      dst[col] = val;
+    }
+
+    src += srcStride;
+    dst += dstStride;
+  }
+}
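
The shift/offset staging above follows the generic InterpolationFilter convention: a first-but-not-last stage keeps IF_INTERNAL_PREC headroom and pre-subtracts IF_INTERNAL_OFFS, while the last stage folds the rounding term and the offset back in. A compilable restatement with one worked value, assuming the usual vvenc constants IF_FILTER_PREC = 6 and IF_INTERNAL_PREC = 14 (so IF_INTERNAL_OFFS = 8192 and headRoom = 4 at 10 bit):

#include <algorithm>
#include <cstdio>

// Sketch of the N != 2 stage arithmetic in simdFilterARM under assumed constants.
static void stageParams( bool isFirst, bool isLast, int bd, int& shift, int& offset )
{
  const int filterPrec = 6, internalPrec = 14, internalOffs = 1 << ( internalPrec - 1 );
  const int headRoom = std::max( 2, internalPrec - bd );
  shift = filterPrec;
  if( isLast )
  {
    shift += isFirst ? 0 : headRoom;
    offset = ( 1 << ( shift - 1 ) ) + ( isFirst ? 0 : internalOffs << filterPrec );
  }
  else
  {
    shift -= isFirst ? headRoom : 0;
    offset = isFirst ? -internalOffs * ( 1 << shift ) : 0;
  }
}

int main()
{
  int s, o;
  stageParams( true, false, 10, s, o );            // horizontal first stage at 10 bit
  std::printf( "shift=%d offset=%d\n", s, o );     // prints shift=2 offset=-32768
}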
+
+
 template<>
 void InterpolationFilter::_initInterpolationFilterARM<NEON>()
 {
@@ -510,6 +832,38 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
   m_filter16x16[ 0 ][ 1 ] = simdFilter16xX_N8_neon;
 
   m_filterN2_2D = simdInterpolateN2_2D_neon;
+
+  m_filterHor[0][0][0] = simdFilterARM<8, false, false, false>;
+  m_filterHor[0][0][1] = simdFilterARM<8, false, false, true>;
+  m_filterHor[0][1][0] = simdFilterARM<8, false, true,  false>;
+  m_filterHor[0][1][1] = simdFilterARM<8, false, true,  true>;
+
+  m_filterHor[1][0][0] = simdFilterARM<4, false, false, false>;
+  m_filterHor[1][0][1] = simdFilterARM<4, false, false, true>;
+  m_filterHor[1][1][0] = simdFilterARM<4, false, true,  false>;
+  m_filterHor[1][1][1] = simdFilterARM<4, false, true,  true>;
+
+  m_filterHor[3][0][0] = simdFilterARM<6, false, false, false>;
+  m_filterHor[3][0][1] = simdFilterARM<6, false, false, true>;
+  m_filterHor[3][1][0] = simdFilterARM<6, false, true,  false>;
+  m_filterHor[3][1][1] = simdFilterARM<6, false, true,  true>;
+
+  m_filterVer[0][0][0] = simdFilterARM<8, true, false, false>;
+  m_filterVer[0][0][1] = simdFilterARM<8, true, false, true>;
+  m_filterVer[0][1][0] = simdFilterARM<8, true, true,  false>;
+  m_filterVer[0][1][1] = simdFilterARM<8, true, true,  true>;
+
+  m_filterVer[1][0][0] = simdFilterARM<4, true, false, false>;
+  m_filterVer[1][0][1] = simdFilterARM<4, true, false, true>;
+  m_filterVer[1][1][0] = simdFilterARM<4, true, true,  false>;
+  m_filterVer[1][1][1] = simdFilterARM<4, true, true,  true>;
+
+  m_filterVer[3][0][0] = simdFilterARM<6, true, false, false>;
+  m_filterVer[3][0][1] = simdFilterARM<6, true, false, true>;
+  m_filterVer[3][1][0] = simdFilterARM<6, true, true,  false>;
+  m_filterVer[3][1][1] = simdFilterARM<6, true, true,  true>;
+
 }
 
 } // namespace vvenc
diff --git a/source/Lib/EncoderLib/EncCfg.cpp b/source/Lib/EncoderLib/EncCfg.cpp
index a4f20183e..58f053064 100644
--- a/source/Lib/EncoderLib/EncCfg.cpp
+++ b/source/Lib/EncoderLib/EncCfg.cpp
@@ -83,15 +83,16 @@ static unsigned getMaxTlVal( unsigned perTlVal )
 
 void VVEncCfg::xInitCfgMembers()
 {
-  m_stageParallelProc = m_numThreads > 0 && m_maxParallelFrames > 0;
-  m_log2GopSize       = floorLog2( m_GOPSize );
-  m_maxTLayer         = m_picReordering && m_GOPSize > 1 ? vvenc::ceilLog2( m_GOPSize ) : 0;
-  m_bimCtuSize        = m_CTUSize;
-  m_MaxQT[0]          =
-  m_MaxQT[1]          =
-  m_MaxQT[2]          = m_CTUSize;
-  m_rateCap           = m_RCMaxBitrate > 0 && m_RCMaxBitrate < INT32_MAX && m_RCTargetBitrate == 0;
-  m_reuseCuResults    = ( m_IntraPeriod > 1 && getMaxTlVal( m_maxMTTDepth ) > 1 ) || m_maxMTTDepthI > ( m_IntraPeriod == 1 ? 1 : 2 );
+  m_stageParallelProc   = m_numThreads > 0 && m_maxParallelFrames > 0;
+  m_log2GopSize         = floorLog2( m_GOPSize );
+  m_maxTLayer           = m_picReordering && m_GOPSize > 1 ? vvenc::ceilLog2( m_GOPSize ) : 0;
+  m_bimCtuSize          = m_CTUSize;
+  m_MaxQT[0]            =
+  m_MaxQT[1]            =
+  m_MaxQT[2]            = m_CTUSize;
+  m_rateCap             = m_RCMaxBitrate > 0 && m_RCMaxBitrate < INT32_MAX && m_RCTargetBitrate == 0;
+  m_reuseCuResults      = ( m_IntraPeriod > 1 && getMaxTlVal( m_maxMTTDepth ) > 1 ) || m_maxMTTDepthI > ( m_IntraPeriod == 1 ? 1 : 2 );
+  m_splitCostThrParamId = getMaxTlVal( m_maxMTTDepth );
 
   m_mergeRdCandQuotaRegular = std::min( NUM_MRG_SATD_CAND, std::max( ( int ) m_maxNumMergeCand - 2, 1 ) );
 
   //                           0    1    2    3    4
diff --git a/source/Lib/EncoderLib/EncCfg.h b/source/Lib/EncoderLib/EncCfg.h
index 8fc1cef8f..3425e24f6 100644
--- a/source/Lib/EncoderLib/EncCfg.h
+++ b/source/Lib/EncoderLib/EncCfg.h
@@ -91,6 +91,7 @@ struct VVEncCfg : public vvenc_config
   int       m_mergeRdCandQuotaCiip;
   int       m_mergeRdCandQuotaGpm;
   bool      m_reuseCuResults;
+  int       m_splitCostThrParamId;
   vvencFG   m_fg;
 
 private:
diff --git a/source/Lib/EncoderLib/EncCu.cpp b/source/Lib/EncoderLib/EncCu.cpp
index 6dfc37531..37668d227 100644
--- a/source/Lib/EncoderLib/EncCu.cpp
+++ b/source/Lib/EncoderLib/EncCu.cpp
@@ -75,6 +75,26 @@ const MergeIdxPair EncCu::m_GeoModeTest[GEO_MAX_NUM_CANDS] = { MergeIdxPair{0, 1
   MergeIdxPair{3, 4}, MergeIdxPair{4, 0}, MergeIdxPair{4, 1}, MergeIdxPair{4, 2}, MergeIdxPair{4, 3}, MergeIdxPair{0, 5},
   MergeIdxPair{1, 5}, MergeIdxPair{2, 5}, MergeIdxPair{3, 5}, MergeIdxPair{4, 5}, MergeIdxPair{5, 0}, MergeIdxPair{5, 1},
   MergeIdxPair{5, 2}, MergeIdxPair{5, 3}, MergeIdxPair{5, 4} };
+
+// Shape of coefSquareCUs (2 x 5 x 2 x 2 x 2):
+// preset (0: faster, 1: fast/medium) x CU size x non-split pred mode x split type x coefficient
+const double EncCu::coefSquareCUs[2][5][2][2][2] = {
+{{{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{-1.00000000, -1.00000000, }, { 0.06213828,  0.00611228, }, }, {{-1.00000000, -1.00000000, }, { 0.06943756,  0.00320762, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{-1.00000000, -1.00000000, }, { 0.10833051,  0.00053144, }, }, {{-1.00000000, -1.00000000, }, { 0.08304352,  0.00142876, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+},
+{{{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+ {{{ 0.06852235,  0.00388054, }, { 0.09236045,  0.00084528, }, }, {{ 0.06955832,  0.00289679, }, { 0.09598522,  0.00096187, }, }, },
+ {{{ 0.07268085,  0.00302796, }, { 0.09323753,  0.00050996, }, }, {{ 0.06123618,  0.00471601, }, { 0.09253389,  0.00046826, }, }, },
+ {{{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, {{-1.00000000, -1.00000000, }, {-1.00000000, -1.00000000, }, }, },
+},
+};
+
 // ====================================================================================================================
 EncCu::EncCu()
   : m_CtxCache ( nullptr )
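
Each (a, b) pair above feeds the split-skip test in xCheckModeSplitInternal further down: the best non-split cost is discounted by factor = 1 + b·exp(a·QP), and the split is only searched if the predicted split cost undercuts the non-split cost. A sketch of one lookup and decision (preset id 1, a 32x32 luma CU whose best mode is inter, QT split; the cost and bit values are made up for illustration):

#include <cmath>
#include <cstdio>

int main()
{
  // coefSquareCUs[1][2][0][1]: preset 1, szInd = log2(32) - 3 = 2, inter best, QT split.
  const double a = 0.09236045, b = 0.00084528;
  const int    qp = 32;
  const double bestNsCost = 100000.0, splitBits = 2048.0;   // illustrative units

  const double factor        = 1.0 + b * std::exp( a * qp );
  const double predSplitCost = bestNsCost / factor + splitBits;
  std::printf( "factor=%.4f predicted=%.1f -> %s\n", factor, predSplitCost,
               predSplitCost >= bestNsCost ? "skip split" : "test split" );
}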
@@ -1006,52 +1026,80 @@ void EncCu::xCheckModeSplit(CodingStructure *&tempCS, CodingStructure *&bestCS,
 void EncCu::xCheckModeSplitInternal(CodingStructure *&tempCS, CodingStructure *&bestCS, Partitioner &partitioner, const EncTestMode& encTestMode, const ModeType modeTypeParent, bool& skipInterPass )
 {
-  const int qp = encTestMode.qp;
-  const int oldPrevQp = tempCS->prevQP[partitioner.chType];
-  const auto oldMotionLut = tempCS->motionLut;
-  const ReshapeData& reshapeData = tempCS->picture->reshapeData;
-
-  const PartSplit split = getPartSplit( encTestMode );
-  const ModeType modeTypeChild = partitioner.modeType;
+  const int          qp           = encTestMode.qp;
+  const int          oldPrevQp    = tempCS->prevQP[partitioner.chType];
+  const auto         oldMotionLut = tempCS->motionLut;
+  const ReshapeData& reshapeData  = tempCS->picture->reshapeData;
+
+  const PartSplit split         = getPartSplit( encTestMode );
+  const ModeType  modeTypeChild = partitioner.modeType;
 
-  CHECK( split == CU_DONT_SPLIT, "No proper split provided!" );
+  CHECK( !( split == CU_QUAD_SPLIT || split == CU_HORZ_SPLIT || split == CU_VERT_SPLIT
+         || split == CU_TRIH_SPLIT || split == CU_TRIV_SPLIT ), "invalid split type" );
 
   tempCS->initStructData( qp );
 
-  m_CABACEstimator->getCtx() = m_CurrCtx->start;
+  m_CABACEstimator->getCtx()    = m_CurrCtx->start;
 
-  const uint16_t split_ctx_size = Ctx::SplitFlag.size() + Ctx::SplitQtFlag.size() + Ctx::SplitHvFlag.size() + Ctx::Split12Flag.size() + Ctx::ModeConsFlag.size();
-  const TempCtx ctxSplitFlags( m_CtxCache, SubCtx(CtxSet(Ctx::SplitFlag(), split_ctx_size), m_CABACEstimator->getCtx()));
+  const uint16_t split_ctx_size = Ctx::SplitFlag.size() + Ctx::SplitQtFlag.size() + Ctx::SplitHvFlag.size() + Ctx::Split12Flag.size() + Ctx::ModeConsFlag.size();
+  const TempCtx  ctxSplitFlags  ( m_CtxCache, SubCtx( CtxSet( Ctx::SplitFlag(), split_ctx_size ), m_CABACEstimator->getCtx() ) );
 
   m_CABACEstimator->resetBits();
 
-  m_CABACEstimator->split_cu_mode( split, *tempCS, partitioner );
-  partitioner.modeType = modeTypeParent;
+  m_CABACEstimator->split_cu_mode  ( split, *tempCS, partitioner );
+  partitioner.modeType             = modeTypeParent;
   m_CABACEstimator->mode_constraint( split, *tempCS, partitioner, modeTypeChild );
-  partitioner.modeType = modeTypeChild;
+  partitioner.modeType             = modeTypeChild;
 
-  const int64_t splitBits = m_CABACEstimator->getEstFracBits();
+  const int64_t splitBits          = m_CABACEstimator->getEstFracBits();
 
-  int numChild = 3;
-  if( split == CU_VERT_SPLIT || split == CU_HORZ_SPLIT ) numChild--;
-  else if( split == CU_QUAD_SPLIT ) numChild++;
+  const bool chromaNotSplit = modeTypeParent == MODE_TYPE_ALL && modeTypeChild == MODE_TYPE_INTRA;
+  const bool isChromaTooBig = isChromaEnabled( tempCS->pps->pcv->chrFormat ) && tempCS->area.Y().maxDim() > tempCS->sps->getMaxTbSize();
+  bool       skipSplitTest  = chromaNotSplit && isChromaTooBig;
 
-  int64_t approxBits = m_pcEncCfg->m_qtbttSpeedUp > 0 ? numChild << SCALE_BITS : 0;
+  if( !skipSplitTest )
+  {
+    double a = -1, b = -1;
+    const unsigned w = partitioner.currArea().lwidth();
+    const unsigned h = partitioner.currArea().lheight();
+    const bool contextCond = w == h && tempCS->slice->sliceType == VVENC_B_SLICE && isLuma( partitioner.chType ) && m_pcEncCfg->m_splitCostThrParamId >= 0 && m_pcEncCfg->m_splitCostThrParamId <= 1;
 
-  const double factor = ( tempCS->currQP[partitioner.chType] > 30 ? 1.1 : 1.075 )
-                      + ( m_pcEncCfg->m_qtbttSpeedUp > 0 ? 0.01 : 0.0 )
-                      + ( ( m_pcEncCfg->m_qtbttSpeedUp > 0 && isChroma( partitioner.chType ) ) ? 0.2 : 0.0 );
+    if( contextCond )
+    {
+      uint8_t nsPredInd = m_modeCtrl.comprCUCtx->bestNsPredMode.type == ETM_INTRA;
+      uint8_t szInd     = getLog2( w ) - 3;
+      uint8_t splitInd  = split == CU_QUAD_SPLIT ? 1 : 0;
+      a = coefSquareCUs[m_pcEncCfg->m_splitCostThrParamId][szInd][nsPredInd][splitInd][0];
+      b = coefSquareCUs[m_pcEncCfg->m_splitCostThrParamId][szInd][nsPredInd][splitInd][1];
+    }
 
-  const double cost = m_cRdCost.calcRdCost( uint64_t( splitBits + approxBits + ( ( bestCS->fracBits ) / factor ) ), Distortion( bestCS->dist / factor ) ) + bestCS->costDbOffset / factor;
-
-  const bool chromaNotSplit = modeTypeParent == MODE_TYPE_ALL && modeTypeChild == MODE_TYPE_INTRA ? true : false;
-  const bool isChromaTooBig = isChromaEnabled( tempCS->pps->pcv->chrFormat ) && std::max( tempCS->area.Y().width, tempCS->area.Y().height ) > tempCS->sps->getMaxTbSize();
+    if( a > -1 && b > -1 )
+    {
+      const double bestNsCost    = m_modeCtrl.comprCUCtx->bestCostBeforeSplit == MAX_DOUBLE ? -1 : m_modeCtrl.comprCUCtx->bestCostBeforeSplit;
+      const double factor        = 1.0 + b * exp( a * qp );
+      const double predSplitCost = bestNsCost / factor + splitBits;
+      skipSplitTest = bestNsCost >= 0 && predSplitCost >= bestNsCost;
+    }
+    else
+    {
+      int numChild = 3;
+      if( split == CU_VERT_SPLIT || split == CU_HORZ_SPLIT ) numChild--;
+      else if( split == CU_QUAD_SPLIT ) numChild++;
 
-  if( cost > bestCS->cost + bestCS->costDbOffset // speedup
-      || ( chromaNotSplit && isChromaTooBig ) // TODO: proper fix, for now inhibit chroma TU split that we cannot handle, resulting in missing chroma encoding!
-    )
+      int64_t approxBits = m_pcEncCfg->m_qtbttSpeedUp > 0 ? numChild << SCALE_BITS : 0;
+
+      const double factor = ( tempCS->currQP[partitioner.chType] > 30 ? 1.1 : 1.075 )
+                          + ( m_pcEncCfg->m_qtbttSpeedUp > 0 ? 0.01 : 0.0 )
+                          + ( ( m_pcEncCfg->m_qtbttSpeedUp > 0 && isChroma( partitioner.chType ) ) ? 0.2 : 0.0 );
+
+      const double baseCost = bestCS->cost + bestCS->costDbOffset;
+      const double predCost = baseCost / factor + splitBits + approxBits;
+      skipSplitTest = predCost >= baseCost;
+    }
+  }
+
+  if( skipSplitTest )
   {
     m_CABACEstimator->getCtx() = SubCtx( CtxSet( Ctx::SplitFlag(), split_ctx_size ), ctxSplitFlags );
-    // DTRACE( g_trace_ctx, D_TMP, "%d exit split %f %f %f\n", g_trace_ctx->getChannelCounter(D_TMP), cost, bestCS->cost, bestCS->costDbOffset );
     xCheckBestMode( tempCS, bestCS, partitioner, encTestMode );
     return;
   }
@@ -1069,22 +1117,19 @@ void EncCu::xCheckModeSplitInternal(CodingStructure *&tempCS, CodingStructure *&
     }
   }
 
-  CHECK(!(split == CU_QUAD_SPLIT || split == CU_HORZ_SPLIT || split == CU_VERT_SPLIT
-      || split == CU_TRIH_SPLIT || split == CU_TRIV_SPLIT), "invalid split type");
-
   partitioner.splitCurrArea( split, *tempCS );
   bool qgEnableChildren = partitioner.currQgEnable(); // QG possible at children level
 
   m_CurrCtx++;
 
   AffineMVInfo tmpMVInfo;
-  bool isAffMVInfoSaved = m_cInterSearch.m_AffineProfList->savePrevAffMVInfo(0, tmpMVInfo );
+  bool isAffMVInfoSaved = m_cInterSearch.m_AffineProfList->savePrevAffMVInfo( 0, tmpMVInfo );
 
   BlkUniMvInfo tmpUniMvInfo;
   bool isUniMvInfoSaved = false;
-  if (!tempCS->slice->isIntra())
+  if( !tempCS->slice->isIntra() )
   {
-    m_cInterSearch.m_BlkUniMvInfoBuffer->savePrevUniMvInfo(tempCS->area.Y(), tmpUniMvInfo, isUniMvInfoSaved);
+    m_cInterSearch.m_BlkUniMvInfoBuffer->savePrevUniMvInfo( tempCS->area.Y(), tmpUniMvInfo, isUniMvInfoSaved );
   }
 
   DeriveCtx deriveCtx = m_CABACEstimator->getDeriveCtx();
diff --git a/source/Lib/EncoderLib/EncCu.h b/source/Lib/EncoderLib/EncCu.h
index ca72dabf6..d0827c92c 100644
--- a/source/Lib/EncoderLib/EncCu.h
+++ b/source/Lib/EncoderLib/EncCu.h
@@ -304,6 +304,8 @@ class EncCu
                m_subPuMvOffset[MRG_MAX_NUM_CANDS];
   Distortion   m_uiSadBestForQPA;
 
+  static const double coefSquareCUs[2][5][2][2][2];
+
 public:
   EncCu();
   virtual ~EncCu();
diff --git a/source/Lib/EncoderLib/EncModeCtrl.cpp b/source/Lib/EncoderLib/EncModeCtrl.cpp
index 2bf3c2faa..ad39e002f 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.cpp
+++ b/source/Lib/EncoderLib/EncModeCtrl.cpp
@@ -1099,10 +1099,8 @@ void EncModeCtrl::beforeSplit( Partitioner& partitioner )
   CodedCUInfo &relatedCU = getBlkInfo( partitioner.currArea() );
   const CodingUnit& bestCU = *cuECtx.bestCU;
 
-  if (m_pcEncCfg->m_fastTTSplit)
-  {
-    cuECtx.bestCostBeforeSplit = cuECtx.bestCS->cost;
-  }
+  cuECtx.bestNsPredMode      = cuECtx.bestMode;
+  cuECtx.bestCostBeforeSplit = cuECtx.bestCS->cost;
 
   setFromCs( *cuECtx.bestCS, cuECtx.bestMode, partitioner );
 
@@ -1158,14 +1156,6 @@ bool EncModeCtrl::useModeResult( const EncTestMode& encTestmode, CodingStructure
   {
     cuECtx.bestCostVertSplit = tempCS->cost;
   }
-  else if( encTestmode.type == ETM_SPLIT_TT_H )
-  {
-    cuECtx.bestCostTriHorzSplit = tempCS->cost;
-  }
-  else if( encTestmode.type == ETM_SPLIT_TT_V )
-  {
-    cuECtx.bestCostTriVertSplit = tempCS->cost;
-  }
   else if( !isModeSplit( encTestmode ) && isModeInter( encTestmode ) && tempCS->cus.size() == 1 )
   {
     cuECtx.nonSkipWasTested |= !tempCS->cus.front()->skip;
@@ -1240,12 +1230,6 @@ bool EncModeCtrl::useModeResult( const EncTestMode& encTestmode, CodingStructure
       cuECtx.bestTU   = cuECtx.bestCU->firstTU;
       cuECtx.bestMode = encTestmode;
 
-      if( isModeInter( encTestmode ) )
-      {
-        //Here we take the best cost of both inter modes. We are assuming only the inter modes (and all of them) have come before the intra modes!!!
-        cuECtx.bestInterCost = cuECtx.bestCS->cost;
-      }
-
       return true;
     }
     else
diff --git a/source/Lib/EncoderLib/EncModeCtrl.h b/source/Lib/EncoderLib/EncModeCtrl.h
index e0c8b110d..2cdc6247b 100644
--- a/source/Lib/EncoderLib/EncModeCtrl.h
+++ b/source/Lib/EncoderLib/EncModeCtrl.h
@@ -174,13 +174,9 @@ struct ComprCUCtx
     , bestCU               ( nullptr )
     , bestTU               ( nullptr )
     , bestMode             ()
-    , bestInterCost        ( MAX_DOUBLE )
-    , bestCostBeforeSplit  ( MAX_DOUBLE )
+    , bestCostBeforeSplit  (MAX_DOUBLE)
     , bestCostVertSplit    (MAX_DOUBLE)
     , bestCostHorzSplit    (MAX_DOUBLE)
-    , bestCostTriVertSplit (MAX_DOUBLE)
-    , bestCostTriHorzSplit (MAX_DOUBLE)
-    , bestCostImv          (MAX_DOUBLE *.5)
     , bestCostNoImv        (MAX_DOUBLE *.5)
     , grad_horVal          (0)
     , grad_verVal          (0)
@@ -199,11 +195,11 @@ struct ComprCUCtx
     , doVerChromaSplit     (false)
     , doQtChromaSplit      (false)
     , isBestNoSplitSkip    (false)
-    , skipSecondMTSPass    (false)
     , intraWasTested       (false)
     , relatedCuIsValid     (false)
     , isIntra              (false)
     , nonSkipWasTested     (false)
+    , bestNsPredMode       (EncTestMode())
   {
   }
 
@@ -213,13 +209,9 @@ struct ComprCUCtx
   CodingUnit*    bestCU;
   TransformUnit* bestTU;
   EncTestMode    bestMode;
-  double         bestInterCost;
   double         bestCostBeforeSplit;
   double         bestCostVertSplit;
   double         bestCostHorzSplit;
-  double         bestCostTriVertSplit;
-  double         bestCostTriHorzSplit;
-  double         bestCostImv;
   double         bestCostNoImv;
   double         grad_horVal;
   double         grad_verVal;
@@ -239,11 +231,11 @@ struct ComprCUCtx
   bool           doVerChromaSplit;
   bool           doQtChromaSplit;
   bool           isBestNoSplitSkip;
-  bool           skipSecondMTSPass;
   bool           intraWasTested;
   bool           relatedCuIsValid;
   bool           isIntra;
   bool           nonSkipWasTested;
+  EncTestMode    bestNsPredMode;
 };
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/test/vvenc_unit_test/vvenc_unit_test.cpp b/test/vvenc_unit_test/vvenc_unit_test.cpp
index 0d9ce6de3..59ac93bc9 100644
--- a/test/vvenc_unit_test/vvenc_unit_test.cpp
+++ b/test/vvenc_unit_test/vvenc_unit_test.cpp
@@ -282,7 +282,7 @@ static bool test_TCoeffOps()
 
 int main()
 {
-  unsigned seed = time( NULL );
+  unsigned seed = ( unsigned ) time( NULL );
   srand( seed );
 
   bool passed = test_TCoeffOps();