diff --git a/AUTHORS b/AUTHORS
index abae401c1ef..36b2b6f7942 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -46,6 +46,7 @@ Bryan Cross (crossbr)
 candirufish
 Chess13234
 Chris Cain (ceebo)
+Ciekce
 clefrks
 Clemens L. (rn5f107s2)
 Cody Ho (aesrentai)
diff --git a/src/engine.cpp b/src/engine.cpp
index 164f228ad09..c4c0918082a 100644
--- a/src/engine.cpp
+++ b/src/engine.cpp
@@ -54,6 +54,7 @@ Engine::Engine(std::string path) :
       NN::NetworkMedium({EvalFileDefaultNameMedium, "None", ""}, NN::EmbeddedNNUEType::MEDIUM),
       NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) {
     pos.set(StartFEN, false, &states->back());
+    capSq = SQ_NONE;
 }
 
 std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960) {
@@ -62,9 +63,10 @@ std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960
     return Benchmark::perft(fen, depth, isChess960);
 }
 
-void Engine::go(const Search::LimitsType& limits) {
+void Engine::go(Search::LimitsType& limits) {
     assert(limits.perft == 0);
     verify_networks();
+    limits.capSq = capSq;
 
     threads.start_thinking(options, pos, states, limits);
 }
@@ -103,6 +105,7 @@ void Engine::set_position(const std::string& fen, const std::vector<std::string>
     states = StateListPtr(new std::deque<StateInfo>(1));
     pos.set(fen, options["UCI_Chess960"], &states->back());
 
+    capSq = SQ_NONE;
     for (const auto& move : moves)
     {
         auto m = UCIEngine::to_move(pos, move);
@@ -112,6 +115,11 @@ void Engine::set_position(const std::string& fen, const std::vector<std::string>
 
         states->emplace_back();
         pos.do_move(m, states->back());
+
+        // Remember the destination of this move when its second dirty piece left the board
+        // (to[1] == SQ_NONE, presumably a capture); otherwise forget any earlier square.
+        const DirtyPiece& dp = states->back().dirtyPiece;
+        capSq = (dp.dirty_num > 1 && dp.to[1] == SQ_NONE) ? m.to_sq() : SQ_NONE;
     }
 }
 
@@ -180,4 +188,4 @@ std::string Engine::visualize() const {
     return ss.str();
 }
 
-}
\ No newline at end of file
+}
diff --git a/src/engine.h b/src/engine.h
index 753ec4899f4..fc6bda97a43 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -20,24 +20,26 @@
 #define ENGINE_H_INCLUDED
 
 #include <cstddef>
+#include <cstdint>
 #include <functional>
 #include <optional>
 #include <string>
 #include <string_view>
 #include <utility>
 #include <vector>
-#include <cstdint>
 
 #include "nnue/network.h"
 #include "position.h"
 #include "search.h"
+#include "syzygy/tbprobe.h"  // for Stockfish::Depth
 #include "thread.h"
 #include "tt.h"
 #include "ucioption.h"
-#include "syzygy/tbprobe.h"  // for Stockfish::Depth
 
 namespace Stockfish {
 
+enum Square : int;
+
 class Engine {
    public:
     using InfoShort = Search::InfoShort;
@@ -50,7 +52,7 @@ class Engine {
     std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960);
 
-    // non blocking call to start searching
+    // non blocking call to start searching; overwrites limits.capSq with the engine's capSq
-    void go(const Search::LimitsType&);
+    void go(Search::LimitsType&);
     // non blocking call to stop searching
     void stop();
 
@@ -93,6 +95,7 @@ class Engine {
 
     Position     pos;
     StateListPtr states;
+    Square       capSq;
 
     OptionsMap           options;
     ThreadPool           threads;
@@ -105,4 +108,4 @@ class Engine {
 }  // namespace Stockfish
 
 
-#endif  // #ifndef ENGINE_H_INCLUDED
\ No newline at end of file
+#endif  // #ifndef ENGINE_H_INCLUDED
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index d2dfeed5830..e4e8d9fcc80 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -67,26 +67,30 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     bool psqtOnly   = std::abs(simpleEval) > PsqtOnlyThreshold;
     int  nnueComplexity;
     int  v;
-      
+
+    //Value nnue = smallNet
+    //             ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
+    //             : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
+
     Value nnue;
 
     if (smallNet) 
-        nnue = networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly); 
+        nnue = networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly);
     else {
         if (Eval::mediumNetOn) 
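-            nnue = networks.medium.evaluate(pos, nullptr, true, &nnueComplexity, false); //funktioniert Cache? Nö &caches.medium
+            nnue = networks.medium.evaluate(pos, nullptr, true, &nnueComplexity, false);  // Does the cache work here? No, not with &caches.medium. NOTE(review): confirm a nullptr cache is still accepted.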
         else 
             nnue = networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
-    }    
-    const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,
-                                int pawnCountMul, int npmConstant, int evalDiv,
-                                int shufflingConstant, int shufflingDiv) {
+    } 
+    const auto adjustEval = [&](int optDiv, int nnueDiv, int pawnCountConstant, int pawnCountMul,
+                                int npmConstant, int evalDiv, int shufflingConstant,
+                                int shufflingDiv) {
 
         // Blend optimism and eval with nnue complexity and material imbalance
         optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / optDiv;
         nnue -= nnue * (nnueComplexity * 5 / 3) / nnueDiv;
 
-        int npm = pos.non_pawn_material() / npmDiv;
+        int npm = pos.non_pawn_material() / 64;
         v       = (nnue * (npm + pawnCountConstant + pawnCountMul * pos.count<PAWN>())
              + optimism * (npmConstant + npm))
           / evalDiv;
@@ -97,11 +101,11 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     };
 
     if (!smallNet)
-        adjustEval(524, 32395, 66, 942, 11, 139, 1058, 178, 204);
+        adjustEval(524, 32395, 942, 11, 139, 1058, 178, 204);
     else if (psqtOnly)
-        adjustEval(517, 32857, 65, 908, 7, 155, 1006, 224, 238);
+        adjustEval(517, 32857, 908, 7, 155, 1006, 224, 238);
     else
-        adjustEval(515, 32793, 63, 944, 9, 140, 1067, 206, 206);
+        adjustEval(515, 32793, 944, 9, 140, 1067, 206, 206);
 
     // SFnps Begin //
     if((NNUE::RandomEval) || (NNUE::WaitMs))
@@ -133,11 +137,11 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
 // Trace scores are from white's point of view
 std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
 
-    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>();
-
     if (pos.checkers())
         return "Final evaluation: none (in check)";
 
+    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
+
     std::stringstream ss;
     ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
     ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n';
diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp
index f7235c5d776..f5ac7b9f427 100644
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -259,15 +259,20 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
         exit(EXIT_FAILURE);
     }
 
-    sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
+    // network[i] is a pointer-like handle, so measure the layer stack it points to.
+    size_t size = sizeof(*featureTransformer) + sizeof(*network[0]) * LayerStacks;
+    sync_cout << "info string NNUE evaluation using " << evalfilePath << " ("
+              << size / (1024 * 1024) << "MiB, (" << featureTransformer->InputDimensions << ", "
+              << network[0]->TransformedFeatureDimensions << ", " << network[0]->FC_0_OUTPUTS
+              << ", " << network[0]->FC_1_OUTPUTS << ", 1))" << sync_endl;
 }
 
 
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::hint_common_access(const Position&                         pos,
                                                     AccumulatorCaches::Cache<FTDimensions>* cache,
-                                                    bool psqtOnl) const {
-    featureTransformer->hint_common_access(pos, cache, psqtOnl);
+                                                    bool psqtOnly) const {
+    featureTransformer->hint_common_access(pos, cache, psqtOnly);
 }
 
 template<typename Arch, typename Transformer>
diff --git a/src/nnue/network.h b/src/nnue/network.h
index a4ccb2a1af5..18c5835e664 100644
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@@ -63,7 +63,7 @@ class Network {
 
     void hint_common_access(const Position&                         pos,
                             AccumulatorCaches::Cache<FTDimensions>* cache,
-                            bool                                    psqtOnl) const;
+                            bool                                    psqtOnly) const;
 
     void          verify(std::string evalfilePath) const;
     NnueEvalTrace trace_evaluate(const Position&                         pos,
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index b8dc96f4fab..abfd2cc8ea5 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -50,33 +50,36 @@ struct alignas(CacheLineSize) Accumulator {
 // is commonly referred to as "Finny Tables".
 struct AccumulatorCaches {
 
+    template<typename Networks>
+    AccumulatorCaches(const Networks& networks) {
+        clear(networks);
+    }
+
     template<IndexType Size>
     struct alignas(CacheLineSize) Cache {
 
         struct alignas(CacheLineSize) Entry {
-            BiasType       accumulation[COLOR_NB][Size];
-            PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
-            Bitboard       byColorBB[COLOR_NB][COLOR_NB];
-            Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+            BiasType       accumulation[Size];
+            PSQTWeightType psqtAccumulation[PSQTBuckets];
+            Bitboard       byColorBB[COLOR_NB];
+            Bitboard       byTypeBB[PIECE_TYPE_NB];
+            bool           psqtOnly;
 
             // To initialize a refresh entry, we set all its bitboards empty,
             // so we put the biases in the accumulation, without any weights on top
             void clear(const BiasType* biases) {
 
-                std::memset(byColorBB, 0, sizeof(byColorBB));
-                std::memset(byTypeBB, 0, sizeof(byTypeBB));
-
-                std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType));
-                std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType));
-
-                std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation));
+                std::memcpy(accumulation, biases, sizeof(accumulation));
+                std::memset((uint8_t*) this + offsetof(Entry, psqtAccumulation), 0,
+                            sizeof(Entry) - offsetof(Entry, psqtAccumulation));
             }
         };
 
         template<typename Network>
         void clear(const Network& network) {
-            for (auto& entry : entries)
-                entry.clear(network.featureTransformer->biases);
+            for (auto& entries1D : entries)
+                for (auto& entry : entries1D)
+                    entry.clear(network.featureTransformer->biases);
         }
 
         void clear(const BiasType* biases) {
@@ -84,20 +87,23 @@ struct AccumulatorCaches {
                 entry.clear(biases);
         }
 
-        Entry& operator[](Square sq) { return entries[sq]; }
+        std::array<Entry, COLOR_NB>& operator[](Square sq) { return entries[sq]; }
 
-        std::array<Entry, SQUARE_NB> entries;
+        std::array<std::array<Entry, COLOR_NB>, SQUARE_NB> entries;
     };
 
     template<typename Networks>
     void clear(const Networks& networks) {
         big.clear(networks.big);
+        small.clear(networks.small);
     }
 
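-    // When adding a new cache for a network, i.e. the smallnet
-    // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh.
-    //Cache<TransformedFeatureDimensionsMedium> medium;
+    // One refresh cache per network size. clear() above resets only big and small.
+    // NOTE(review): medium is not reset by clear(); confirm it is never read, or reset it
+    // there as well (e.g. medium.clear(networks.medium)) before passing it to a network.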
-    Cache<TransformedFeatureDimensionsBig> big;
+    Cache<TransformedFeatureDimensionsBig>   big;
+    Cache<TransformedFeatureDimensionsMedium> medium;
+    Cache<TransformedFeatureDimensionsSmall> small;
 };
 
 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 88f0e4031a4..4647ecd066d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -404,19 +404,25 @@ class FeatureTransformer {
         return {st, next};
     }
 
-    // NOTE: The parameter states_to_update is an array of position states, ending with nullptr.
+    // NOTE: The parameter states_to_update is an array of position states.
     //       All states must be sequential, that is states_to_update[i] must either be reachable
-    //       by repeatedly applying ->previous from states_to_update[i+1] or
-    //       states_to_update[i] == nullptr.
+    //       by repeatedly applying ->previous from states_to_update[i+1].
     //       computed_st must be reachable by repeatedly applying ->previous on
-    //       states_to_update[0], if not nullptr.
+    //       states_to_update[0].
     template<Color Perspective, size_t N>
     void update_accumulator_incremental(const Position& pos,
                                         StateInfo*      computed_st,
                                         StateInfo*      states_to_update[N],
                                         bool            psqtOnly) const {
         static_assert(N > 0);
-        assert(states_to_update[N - 1] == nullptr);
+        assert([&]() {
+            for (size_t i = 0; i < N; ++i)
+            {
+                if (states_to_update[i] == nullptr)
+                    return false;
+            }
+            return true;
+        }());
 
 #ifdef VECTOR
         // Gcc-10.2 unnecessarily spills AVX2 registers if this array
@@ -425,11 +431,7 @@ class FeatureTransformer {
         psqt_vec_t psqt[NumPsqtRegs];
 #endif
 
-        if (states_to_update[0] == nullptr)
-            return;
-
         // Update incrementally going back through states_to_update.
-
         // Gather all features to be updated.
         const Square ksq = pos.square<KING>(Perspective);
 
@@ -437,28 +439,18 @@ class FeatureTransformer {
         // That might depend on the feature set and generally relies on the
         // feature set's update cost calculation to be correct and never allow
         // updates with more added/removed features than MaxActiveDimensions.
-        FeatureSet::IndexList removed[N - 1], added[N - 1];
+        FeatureSet::IndexList removed[N], added[N];
 
+        for (int i = N - 1; i >= 0; --i)
         {
-            int i =
-              N
-              - 2;  // Last potential state to update. Skip last element because it must be nullptr.
-            while (states_to_update[i] == nullptr)
-                --i;
+            (states_to_update[i]->*accPtr).computed[Perspective]     = !psqtOnly;
+            (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true;
 
-            StateInfo* st2 = states_to_update[i];
+            const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
 
-            for (; i >= 0; --i)
-            {
-                (states_to_update[i]->*accPtr).computed[Perspective]     = !psqtOnly;
-                (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true;
-
-                const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
-
-                for (; st2 != end_state; st2 = st2->previous)
-                    FeatureSet::append_changed_indices<Perspective>(ksq, st2->dirtyPiece,
-                                                                    removed[i], added[i]);
-            }
+            for (StateInfo* st2 = states_to_update[i]; st2 != end_state; st2 = st2->previous)
+                FeatureSet::append_changed_indices<Perspective>(ksq, st2->dirtyPiece, removed[i],
+                                                                added[i]);
         }
 
         StateInfo* st = computed_st;
@@ -466,8 +458,7 @@ class FeatureTransformer {
-        // Now update the accumulators listed in states_to_update[], where the last element is a sentinel.
+        // Now update the accumulators listed in states_to_update[]; all N entries are non-null.
 #ifdef VECTOR
 
-        if (states_to_update[1] == nullptr && (removed[0].size() == 1 || removed[0].size() == 2)
-            && added[0].size() == 1)
+        if (N == 1 && (removed[0].size() == 1 || removed[0].size() == 2) && added[0].size() == 1)
         {
             assert(states_to_update[0]);
 
@@ -541,7 +532,7 @@ class FeatureTransformer {
                     for (IndexType k = 0; k < NumRegs; ++k)
                         acc[k] = vec_load(&accTileIn[k]);
 
-                    for (IndexType i = 0; states_to_update[i]; ++i)
+                    for (IndexType i = 0; i < N; ++i)
                     {
                         // Difference calculation for the deactivated features
                         for (const auto index : removed[i])
@@ -578,7 +569,7 @@ class FeatureTransformer {
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
                     psqt[k] = vec_load_psqt(&accTilePsqtIn[k]);
 
-                for (IndexType i = 0; states_to_update[i]; ++i)
+                for (IndexType i = 0; i < N; ++i)
                 {
                     // Difference calculation for the deactivated features
                     for (const auto index : removed[i])
@@ -608,7 +599,7 @@ class FeatureTransformer {
             }
         }
 #else
-        for (IndexType i = 0; states_to_update[i]; ++i)
+        for (IndexType i = 0; i < N; ++i)
         {
             if (!psqtOnly)
                 std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective],
@@ -656,200 +647,85 @@ class FeatureTransformer {
 
     template<Color Perspective>
     void update_accumulator_refresh_cache(const Position&                           pos,
-                                          AccumulatorCaches::Cache<HalfDimensions>* cache) const {
+                                          AccumulatorCaches::Cache<HalfDimensions>* cache,
+                                          bool psqtOnly) const {
         assert(cache != nullptr);
 
-        Square ksq = pos.square<KING>(Perspective);
-
-        auto& entry = (*cache)[ksq];
-
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = true;
-        accumulator.computedPSQT[Perspective] = true;
-
+        Square                ksq   = pos.square<KING>(Perspective);
+        auto&                 entry = (*cache)[ksq][Perspective];
         FeatureSet::IndexList removed, added;
-        for (Color c : {WHITE, BLACK})
-        {
-            for (PieceType pt = PAWN; pt <= KING; ++pt)
-            {
-                const Piece    piece = make_piece(c, pt);
-                const Bitboard oldBB =
-                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
-                const Bitboard newBB    = pos.pieces(c, pt);
-                Bitboard       toRemove = oldBB & ~newBB;
-                Bitboard       toAdd    = newBB & ~oldBB;
-
-                while (toRemove)
-                {
-                    Square sq = pop_lsb(toRemove);
-                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
-                }
-                while (toAdd)
-                {
-                    Square sq = pop_lsb(toAdd);
-                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
-                }
-            }
-        }
-
-#ifdef VECTOR
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
 
-        for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
+        if (entry.psqtOnly && !psqtOnly)
         {
-            auto entryTile =
-              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
-            for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = entryTile[k];
-
-            for (int i = 0; i < int(added.size()); ++i)
-            {
-                IndexType       index  = added[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
-
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
-            }
-            for (int i = 0; i < int(removed.size()); ++i)
-            {
-                IndexType       index  = removed[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
-
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_sub_16(acc[k], column[k]);
-            }
-
-            for (IndexType k = 0; k < NumRegs; k++)
-                vec_store(&entryTile[k], acc[k]);
+            entry.clear(biases);
+            FeatureSet::append_active_indices<Perspective>(pos, added);
         }
-
-        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
+        else
         {
-            auto entryTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]);
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = entryTilePsqt[k];
-
-            for (int i = 0; i < int(added.size()); ++i)
-            {
-                IndexType       index  = added[i];
-                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
-                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
-            }
-            for (int i = 0; i < int(removed.size()); ++i)
+            for (Color c : {WHITE, BLACK})
             {
-                IndexType       index  = removed[i];
-                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
-                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+                for (PieceType pt = PAWN; pt <= KING; ++pt)
+                {
+                    const Piece    piece    = make_piece(c, pt);
+                    const Bitboard oldBB    = entry.byColorBB[c] & entry.byTypeBB[pt];
+                    const Bitboard newBB    = pos.pieces(c, pt);
+                    Bitboard       toRemove = oldBB & ~newBB;
+                    Bitboard       toAdd    = newBB & ~oldBB;
 
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
+                    while (toRemove)
+                    {
+                        Square sq = pop_lsb(toRemove);
+                        removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
+                    while (toAdd)
+                    {
+                        Square sq = pop_lsb(toAdd);
+                        added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
+                }
             }
-
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&entryTilePsqt[k], psqt[k]);
-        }
-
-#else
-
-        for (const auto index : added)
-        {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] += weights[offset + j];
-
-            for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
-        }
-        for (const auto index : removed)
-        {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] -= weights[offset + j];
-
-            for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
         }
 
-#endif
-
-        // The accumulator of the refresh entry has been updated.
-        // Now copy its content to the actual accumulator we were refreshing
-
-        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
-                    sizeof(int32_t) * PSQTBuckets);
-
-        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
-                    sizeof(BiasType) * HalfDimensions);
-
-        for (Color c : {WHITE, BLACK})
-            entry.byColorBB[Perspective][c] = pos.pieces(c);
-
-        for (PieceType pt = PAWN; pt <= KING; ++pt)
-            entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
-    }
-
-    template<Color Perspective>
-    void
-    update_accumulator_refresh(const Position&                                            pos,
-                               [[maybe_unused]] AccumulatorCaches::Cache<HalfDimensions>* cache,
-                               bool psqtOnly) const {
-
-        // When we are refreshing the accumulator of the big net,
-        // redirect to the version of refresh that uses the refresh table.
-        // Using the cache for the small net is not beneficial.
-        if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig)
-        {
-            update_accumulator_refresh_cache<Perspective>(pos, cache);
-            return;
-        }
-
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
-
-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
         auto& accumulator                     = pos.state()->*accPtr;
         accumulator.computed[Perspective]     = !psqtOnly;
         accumulator.computedPSQT[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
 
 #ifdef VECTOR
+        vec_t      acc[NumRegs];
+        psqt_vec_t psqt[NumPsqtRegs];
+
         if (!psqtOnly)
             for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
             {
-                auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
+                auto entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * TileHeight]);
                 for (IndexType k = 0; k < NumRegs; ++k)
-                    acc[k] = biasesTile[k];
+                    acc[k] = entryTile[k];
 
-                int i = 0;
-                for (; i < int(active.size()) - 1; i += 2)
+                int i0 = 0;
+                for (; i0 < int(std::min(removed.size(), added.size())); ++i0)
                 {
-                    IndexType       index0  = active[i];
-                    IndexType       index1  = active[i + 1];
-                    const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                    const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                    auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                    auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
+                    IndexType       indexR  = removed[i0];
+                    const IndexType offsetR = HalfDimensions * indexR + j * TileHeight;
+                    auto            columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
+                    IndexType       indexA  = added[i0];
+                    const IndexType offsetA = HalfDimensions * indexA + j * TileHeight;
+                    auto            columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
 
                     for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
+                        acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), columnA[k]);
                 }
-                for (; i < int(active.size()); ++i)
+                for (int i = i0; i < int(removed.size()); ++i)
                 {
-                    IndexType       index  = active[i];
+                    IndexType       index  = removed[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_sub_16(acc[k], column[k]);
+                }
+                for (int i = i0; i < int(added.size()); ++i)
+                {
+                    IndexType       index  = added[i];
                     const IndexType offset = HalfDimensions * index + j * TileHeight;
                     auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
@@ -857,34 +733,29 @@ class FeatureTransformer {
                         acc[k] = vec_add_16(acc[k], column[k]);
                 }
 
-                auto accTile =
-                  reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-                for (unsigned k = 0; k < NumRegs; k++)
-                    vec_store(&accTile[k], acc[k]);
+                for (IndexType k = 0; k < NumRegs; k++)
+                    vec_store(&entryTile[k], acc[k]);
             }
 
         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
+            auto entryTilePsqt =
+              reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
+                psqt[k] = entryTilePsqt[k];
 
-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
+                IndexType       index  = removed[i];
+                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
 
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
+                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
             }
-            for (; i < int(active.size()); ++i)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index  = active[i];
+                IndexType       index  = added[i];
                 const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
                 auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
 
@@ -892,34 +763,56 @@ class FeatureTransformer {
                     psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
             }
 
-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
+                vec_store_psqt(&entryTilePsqt[k], psqt[k]);
         }
 
 #else
-        if (!psqtOnly)
-            std::memcpy(accumulator.accumulation[Perspective], biases,
-                        HalfDimensions * sizeof(BiasType));
 
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
+        for (const auto index : removed)
+        {
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[j] -= weights[offset + j];
+            }
 
-        for (const auto index : active)
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+                entry.psqtAccumulation[k] -= psqtWeights[index * PSQTBuckets + k];
+        }
+        for (const auto index : added)
         {
             if (!psqtOnly)
             {
                 const IndexType offset = HalfDimensions * index;
                 for (IndexType j = 0; j < HalfDimensions; ++j)
-                    accumulator.accumulation[Perspective][j] += weights[offset + j];
+                    entry.accumulation[j] += weights[offset + j];
             }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                  psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k];
         }
+
 #endif
+
+        // The accumulator of the refresh entry has been updated.
+        // Now copy its content to the accumulator being refreshed (only the PSQT part if psqtOnly).
+
+        if (!psqtOnly)
+            std::memcpy(accumulator.accumulation[Perspective], entry.accumulation,
+                        sizeof(BiasType) * HalfDimensions);
+
+        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation,
+                    sizeof(int32_t) * PSQTBuckets);
+
+        for (Color c : {WHITE, BLACK})
+            entry.byColorBB[c] = pos.pieces(c);
+
+        for (PieceType pt = PAWN; pt <= KING; ++pt)
+            entry.byTypeBB[pt] = pos.pieces(pt);
+
+        entry.psqtOnly = psqtOnly;
     }
 
     template<Color Perspective>
@@ -943,12 +836,12 @@ class FeatureTransformer {
             || (psqtOnly && (oldest_st->*accPtr).computedPSQT[Perspective]))
         {
             // Only update current position accumulator to minimize work.
-            StateInfo* states_to_update[2] = {pos.state(), nullptr};
-            update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update,
+            StateInfo* states_to_update[1] = {pos.state()};
+            update_accumulator_incremental<Perspective, 1>(pos, oldest_st, states_to_update,
                                                            psqtOnly);
         }
         else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
     }
 
     template<Color Perspective>
@@ -969,14 +862,23 @@ class FeatureTransformer {
             //     1. for the current position
             //     2. the next accumulator after the computed one
             // The heuristic may change in the future.
-            StateInfo* states_to_update[3] = {next, next == pos.state() ? nullptr : pos.state(),
-                                              nullptr};
+            if (next == pos.state())
+            {
+                StateInfo* states_to_update[1] = {next};
 
-            update_accumulator_incremental<Perspective, 3>(pos, oldest_st, states_to_update,
-                                                           psqtOnly);
+                update_accumulator_incremental<Perspective, 1>(pos, oldest_st, states_to_update,
+                                                               psqtOnly);
+            }
+            else
+            {
+                StateInfo* states_to_update[2] = {next, pos.state()};
+
+                update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update,
+                                                               psqtOnly);
+            }
         }
         else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
     }
 
     template<IndexType Size>
diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp
index 18abc8d6a17..859dab97990 100644
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -48,7 +48,7 @@ void hint_common_parent_position(const Position&    pos,
 
     int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
     if (simpleEvalAbs > Eval::SmallNetThreshold)
-        networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold);
+        networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold);
     else if (Stockfish::Eval::mediumNetOn)
         networks.medium.hint_common_access(pos, nullptr, false);  //funktioniert Cache? Nein &caches.medium,
     else    
diff --git a/src/search.cpp b/src/search.cpp
index 0413c656e6e..41d72d08ac2 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,8 +54,8 @@ using namespace Search;
 
 namespace {
 
-static constexpr double EvalLevel[10] = {1.043, 1.017, 0.952, 1.009, 0.971,
-                                         1.002, 0.992, 0.947, 1.046, 1.001};
+static constexpr double EvalLevel[10] = {0.981, 0.956, 0.895, 0.949, 0.913,
+                                         0.942, 0.933, 0.890, 0.984, 0.941};
 
 // Futility margin
 Value futility_margin(Depth d, bool noTtCutNode, bool improving, bool oppWorsening) {
@@ -137,19 +137,16 @@ Search::Worker::Worker(SharedState&                    sharedState,
     // Unpack the SharedState struct into member variables
     thread_idx(thread_id),
     manager(std::move(sm)),
-    refreshTable(),
     options(sharedState.options),
     threads(sharedState.threads),
     tt(sharedState.tt),
-    networks(sharedState.networks) {
+    networks(sharedState.networks),
+    refreshTable(networks) {
     clear();
 }
 
 void Search::Worker::start_searching() {
 
-    // Initialize accumulator refresh entries
-    refreshTable.clear(networks);
-
     // Non-main threads go directly to iterative_deepening()
     if (!is_mainthread())
     {
@@ -459,9 +456,10 @@ void Search::Worker::iterative_deepening() {
             double reduction = (1.48 + mainThread->previousTimeReduction) / (2.17 * timeReduction);
             double bestMoveInstability = 1 + 1.88 * totBestMoveChanges / threads.size();
             int    el                  = std::clamp((bestValue + 750) / 150, 0, 9);
+            double recapture           = limits.capSq == rootMoves[0].pv[0].to_sq() ? 0.955 : 1.005;
 
             double totalTime = mainThread->tm.optimum() * fallingEval * reduction
-                             * bestMoveInstability * EvalLevel[el];
+                             * bestMoveInstability * EvalLevel[el] * recapture;
 
             // Cap used time in case of a single legal move for a better viewer experience
             if (rootMoves.size() == 1)
@@ -518,6 +516,8 @@ void Search::Worker::clear() {
 
     for (size_t i = 1; i < reductions.size(); ++i)
         reductions[i] = int((20.14 + std::log(size_t(options["Threads"])) / 2) * std::log(i));
+
+    refreshTable.clear(networks);
 }
 
 
@@ -979,20 +979,22 @@ Value Search::Worker::search(
 
             if (capture || givesCheck)
             {
+                Piece capturedPiece = pos.piece_on(move.to_sq());
+                int   captHist =
+                  thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)];
+
                 // Futility pruning for captures (~2 Elo)
                 if (!givesCheck && lmrDepth < 7 && !ss->inCheck)
                 {
-                    Piece capturedPiece = pos.piece_on(move.to_sq());
-                    Value futilityValue =
-                      ss->staticEval + 285 + 277 * lmrDepth + PieceValue[capturedPiece]
-                      + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)]
-                          / 7;
+                    Value futilityValue = ss->staticEval + 285 + 277 * lmrDepth
+                                        + PieceValue[capturedPiece] + captHist / 7;
                     if (futilityValue <= alpha)
                         continue;
                 }
 
                 // SEE based pruning for captures and checks (~11 Elo)
-                if (!pos.see_ge(move, -203 * depth))
+                int seeHist = std::clamp(captHist / 32, -199 * depth, 199 * depth);
+                if (!pos.see_ge(move, -203 * depth - seeHist))
                     continue;
             }
             else
@@ -1079,7 +1081,12 @@ Value Search::Worker::search(
                 // we assume this expected cut-node is not singular (multiple moves fail high),
                 // and we can prune the whole subtree by returning a softbound.
                 else if (singularBeta >= beta)
+                {
+                    if (!ttCapture)
+                        update_quiet_stats(pos, ss, *this, ttMove, -stat_malus(depth));
+
                     return singularBeta;
+                }
 
                 // Negative extensions
                 // If other moves failed high over (ttValue - margin) without the ttMove on a reduced search,
@@ -1130,6 +1137,9 @@ Value Search::Worker::search(
         if (ss->ttPv)
             r -= 1 + (ttValue > alpha) + (tte->depth() >= depth);
 
+        else if (cutNode && move != ttMove && move != ss->killers[0])
+            r++;
+
         // Increase reduction for cut nodes (~4 Elo)
         if (cutNode)
             r += 2 - (tte->depth() >= depth && ss->ttPv);
diff --git a/src/search.h b/src/search.h
index 0fd778b47e6..444e3b8bb1d 100644
--- a/src/search.h
+++ b/src/search.h
@@ -109,8 +109,7 @@ struct RootMove {
 using RootMoves = std::vector<RootMove>;
 
 
-// LimitsType struct stores information sent by GUI about available time to
-// search the current move, maximum depth/time, or if we are in analysis mode.
+// LimitsType struct stores information sent by the caller about the analysis required.
 struct LimitsType {
 
     // Init explicitly due to broken value-initialization of non POD in MSVC
@@ -128,6 +127,7 @@ struct LimitsType {
     int                      movestogo, depth, mate, perft, infinite;
     uint64_t                 nodes;
     bool                     ponderMode;
+    Square                   capSq;
 };
 
 
@@ -302,15 +302,14 @@ class Worker {
 
     Tablebases::Config tbConfig;
 
-    // Used by NNUE
-
-    Eval::NNUE::AccumulatorCaches refreshTable;
-
     const OptionsMap&           options;
     ThreadPool&                 threads;
     TranspositionTable&         tt;
     const Eval::NNUE::Networks& networks;
 
+    // Used by NNUE. Declared after networks because its initializer reads networks.
+    Eval::NNUE::AccumulatorCaches refreshTable;
+
     friend class Stockfish::ThreadPool;
     friend class SearchManager;
 };