From 886ed90ec3599cdf0dc4e7d07b0543a27028c6c0 Mon Sep 17 00:00:00 2001
From: xoto10 <23479932+xoto10@users.noreply.github.com>
Date: Sun, 28 Apr 2024 16:27:40 +0100
Subject: [PATCH 01/12] Use less time on recaptures

Credit for the idea goes to peregrine on Discord.

Passed STC 10+0.1:
https://tests.stockfishchess.org/tests/view/662652623fe04ce4cefc48cf
LLR: 2.95 (-2.94,2.94) <0.00,2.00>
Total: 75712 W: 19793 L: 19423 D: 36496
Ptnml(0-2): 258, 8487, 20023, 8803, 285

Passed LTC 60+0.6:
https://tests.stockfishchess.org/tests/view/6627495e3fe04ce4cefc59b6
LLR: 2.94 (-2.94,2.94) <0.50,2.50>
Total: 49788 W: 12743 L: 12404 D: 24641
Ptnml(0-2): 29, 5141, 14215, 5480, 29

The code was updated slightly and tested for non-regression against the
original code at STC:

LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 41952 W: 10912 L: 10698 D: 20342
Ptnml(0-2): 133, 4825, 10835, 5061, 122
https://tests.stockfishchess.org/tests/view/662d84f56115ff6764c7e438

closes https://github.com/official-stockfish/Stockfish/pull/5189

Bench: 1836777
---
 src/engine.cpp | 12 ++++++++++--
 src/engine.h   | 11 +++++++----
 src/search.cpp |  7 ++++---
 src/search.h   |  4 ++--
 4 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/src/engine.cpp b/src/engine.cpp
index 4625e00a816..72a37ce9b0b 100644
--- a/src/engine.cpp
+++ b/src/engine.cpp
@@ -53,6 +53,7 @@ Engine::Engine(std::string path) :
       NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG),
       NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) {
     pos.set(StartFEN, false, &states->back());
+    capSq = SQ_NONE;
 }
 
 std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960) {
@@ -61,9 +62,10 @@ std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960
     return Benchmark::perft(fen, depth, isChess960);
 }
 
-void Engine::go(const Search::LimitsType& limits) {
+void Engine::go(Search::LimitsType& limits) {
     assert(limits.perft == 0);
     verify_networks();
+    limits.capSq = capSq;
 
     threads.start_thinking(options, pos, states, limits);
 }
@@ -102,6 +104,7 @@ void Engine::set_position(const std::string& fen, const std::vector<std::string>
     states = StateListPtr(new std::deque<StateInfo>(1));
     pos.set(fen, options["UCI_Chess960"], &states->back());
 
+    capSq = SQ_NONE;
     for (const auto& move : moves)
     {
         auto m = UCIEngine::to_move(pos, move);
@@ -111,6 +114,11 @@ void Engine::set_position(const std::string& fen, const std::vector<std::string>
 
         states->emplace_back();
         pos.do_move(m, states->back());
+
+        capSq          = SQ_NONE;
+        DirtyPiece& dp = states->back().dirtyPiece;
+        if (dp.dirty_num > 1 && dp.to[1] == SQ_NONE)
+            capSq = m.to_sq();
     }
 }
 
@@ -172,4 +180,4 @@ std::string Engine::visualize() const {
     return ss.str();
 }
 
-}
\ No newline at end of file
+}
diff --git a/src/engine.h b/src/engine.h
index 041f5678585..64a814cb4aa 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -20,24 +20,26 @@
 #define ENGINE_H_INCLUDED
 
 #include <cstddef>
+#include <cstdint>
 #include <functional>
 #include <optional>
 #include <string>
 #include <string_view>
 #include <utility>
 #include <vector>
-#include <cstdint>
 
 #include "nnue/network.h"
 #include "position.h"
 #include "search.h"
+#include "syzygy/tbprobe.h"  // for Stockfish::Depth
 #include "thread.h"
 #include "tt.h"
 #include "ucioption.h"
-#include "syzygy/tbprobe.h"  // for Stockfish::Depth
 
 namespace Stockfish {
 
+enum Square : int;
+
 class Engine {
    public:
     using InfoShort = Search::InfoShort;
@@ -50,7 +52,7 @@ class Engine {
     std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960);
 
     // non blocking call to start searching
-    void go(const Search::LimitsType&);
+    void go(Search::LimitsType&);
     // non blocking call to stop searching
     void stop();
 
@@ -92,6 +94,7 @@ class Engine {
 
     Position     pos;
     StateListPtr states;
+    Square       capSq;
 
     OptionsMap           options;
     ThreadPool           threads;
@@ -104,4 +107,4 @@ class Engine {
 }  // namespace Stockfish
 
 
-#endif  // #ifndef ENGINE_H_INCLUDED
\ No newline at end of file
+#endif  // #ifndef ENGINE_H_INCLUDED
diff --git a/src/search.cpp b/src/search.cpp
index 893daab20e6..396e5aa06c8 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,8 +54,8 @@ using namespace Search;
 
 namespace {
 
-static constexpr double EvalLevel[10] = {1.043, 1.017, 0.952, 1.009, 0.971,
-                                         1.002, 0.992, 0.947, 1.046, 1.001};
+static constexpr double EvalLevel[10] = {0.981, 0.956, 0.895, 0.949, 0.913,
+                                         0.942, 0.933, 0.890, 0.984, 0.941};
 
 // Futility margin
 Value futility_margin(Depth d, bool noTtCutNode, bool improving, bool oppWorsening) {
@@ -446,9 +446,10 @@ void Search::Worker::iterative_deepening() {
             double reduction = (1.48 + mainThread->previousTimeReduction) / (2.17 * timeReduction);
             double bestMoveInstability = 1 + 1.88 * totBestMoveChanges / threads.size();
             int    el                  = std::clamp((bestValue + 750) / 150, 0, 9);
+            double recapture           = limits.capSq == rootMoves[0].pv[0].to_sq() ? 0.955 : 1.005;
 
             double totalTime = mainThread->tm.optimum() * fallingEval * reduction
-                             * bestMoveInstability * EvalLevel[el];
+                             * bestMoveInstability * EvalLevel[el] * recapture;
 
             // Cap used time in case of a single legal move for a better viewer experience
             if (rootMoves.size() == 1)
diff --git a/src/search.h b/src/search.h
index 0fd778b47e6..9b3528c8741 100644
--- a/src/search.h
+++ b/src/search.h
@@ -109,8 +109,7 @@ struct RootMove {
 using RootMoves = std::vector<RootMove>;
 
 
-// LimitsType struct stores information sent by GUI about available time to
-// search the current move, maximum depth/time, or if we are in analysis mode.
+// LimitsType struct stores information sent by the caller about the analysis required.
 struct LimitsType {
 
     // Init explicitly due to broken value-initialization of non POD in MSVC
@@ -128,6 +127,7 @@ struct LimitsType {
     int                      movestogo, depth, mate, perft, infinite;
     uint64_t                 nodes;
     bool                     ponderMode;
+    Square                   capSq;
 };
 
 

From 3502c8ae426506453ca64e87e48d962b327c2356 Mon Sep 17 00:00:00 2001
From: Disservin <disservin.social@gmail.com>
Date: Thu, 25 Apr 2024 19:20:57 +0200
Subject: [PATCH 02/12] Fix missing initialization of AccumulatorCaches in
 Eval::trace

Add a constructor to `AccumulatorCaches` instead of just calling
`clear(networks)` to prevent similar issues from appearing in the
future.

fixes https://github.com/official-stockfish/Stockfish/issues/5190

closes https://github.com/official-stockfish/Stockfish/pull/5191

No functional change
---
 src/evaluate.cpp            | 2 +-
 src/nnue/nnue_accumulator.h | 5 +++++
 src/search.cpp              | 4 ++--
 src/search.h                | 7 +++----
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index f5746ca5199..6e101e7830a 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -99,7 +99,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
 // Trace scores are from white's point of view
 std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
 
-    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>();
+    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
 
     if (pos.checkers())
         return "Final evaluation: none (in check)";
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 8d73dbef5ad..f65385688de 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -50,6 +50,11 @@ struct alignas(CacheLineSize) Accumulator {
 // is commonly referred to as "Finny Tables".
 struct AccumulatorCaches {
 
+    template<typename Networks>
+    AccumulatorCaches(const Networks& networks) {
+        clear(networks);
+    }
+
     template<IndexType Size>
     struct alignas(CacheLineSize) Cache {
 
diff --git a/src/search.cpp b/src/search.cpp
index 396e5aa06c8..11373707b34 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -137,11 +137,11 @@ Search::Worker::Worker(SharedState&                    sharedState,
     // Unpack the SharedState struct into member variables
     thread_idx(thread_id),
     manager(std::move(sm)),
-    refreshTable(),
     options(sharedState.options),
     threads(sharedState.threads),
     tt(sharedState.tt),
-    networks(sharedState.networks) {
+    networks(sharedState.networks),
+    refreshTable(networks) {
     clear();
 }
 
diff --git a/src/search.h b/src/search.h
index 9b3528c8741..444e3b8bb1d 100644
--- a/src/search.h
+++ b/src/search.h
@@ -302,15 +302,14 @@ class Worker {
 
     Tablebases::Config tbConfig;
 
-    // Used by NNUE
-
-    Eval::NNUE::AccumulatorCaches refreshTable;
-
     const OptionsMap&           options;
     ThreadPool&                 threads;
     TranspositionTable&         tt;
     const Eval::NNUE::Networks& networks;
 
+    // Used by NNUE
+    Eval::NNUE::AccumulatorCaches refreshTable;
+
     friend class Stockfish::ThreadPool;
     friend class SearchManager;
 };

From bc45cbc820a53a9fc405c06ca67bd7be3970344e Mon Sep 17 00:00:00 2001
From: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Date: Sat, 27 Apr 2024 18:09:45 +0200
Subject: [PATCH 03/12] Output some basic info about the used networks

Adds the size in memory as well as the layer sizes to the output, as in:

info string NNUE evaluation using nn-ae6a388e4a1a.nnue (132MiB, (22528, 3072, 15, 32, 1))
info string NNUE evaluation using nn-baff1ede1f90.nnue (6MiB, (22528, 128, 15, 32, 1))

For example, the size in MiB is useful to keep the fishtest memory sizes up-to-date,
while the L1-L3 sizes give a useful hint about the architecture used.

closes https://github.com/official-stockfish/Stockfish/pull/5193

No functional change
---
 src/nnue/network.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp
index 656ad97a1e3..42320bae1ab 100644
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -252,7 +252,11 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
         exit(EXIT_FAILURE);
     }
 
-    sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
+    size_t size = sizeof(*featureTransformer) + sizeof(*network) * LayerStacks;
+    sync_cout << "info string NNUE evaluation using " << evalfilePath << " ("
+              << size / (1024 * 1024) << "MiB, (" << featureTransformer->InputDimensions << ", "
+              << network[0]->TransformedFeatureDimensions << ", " << network[0]->FC_0_OUTPUTS
+              << ", " << network[0]->FC_1_OUTPUTS << ", 1))" << sync_endl;
 }
 
 

From 940a3a7383f48cea7aacbbe335671aa0d3ead1ae Mon Sep 17 00:00:00 2001
From: mstembera <m_stembera@yahoo.com>
Date: Thu, 25 Apr 2024 18:20:08 -0700
Subject: [PATCH 04/12] Cache small net w/ psqtOnly support

Caching the small net in the same way as the big net allows them to
share the same code path and completely removes
update_accumulator_refresh().

STC:
https://tests.stockfishchess.org/tests/view/662bfb5ed46f72253dcfed85
LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 151712 W: 39252 L: 39158 D: 73302
Ptnml(0-2): 565, 17474, 39683, 17570, 564

closes https://github.com/official-stockfish/Stockfish/pull/5194

Bench: 1836777
---
 src/evaluate.cpp                    |   2 +-
 src/nnue/network.cpp                |   4 +-
 src/nnue/network.h                  |   2 +-
 src/nnue/nnue_accumulator.h         |   6 +-
 src/nnue/nnue_feature_transformer.h | 263 ++++++++--------------------
 src/nnue/nnue_misc.cpp              |   2 +-
 6 files changed, 86 insertions(+), 193 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 6e101e7830a..345925f6b2a 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -60,7 +60,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     int  nnueComplexity;
     int  v;
 
-    Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly)
+    Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
                           : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
 
     const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,
diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp
index 42320bae1ab..2eca18bd15d 100644
--- a/src/nnue/network.cpp
+++ b/src/nnue/network.cpp
@@ -263,8 +263,8 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::hint_common_access(const Position&                         pos,
                                                     AccumulatorCaches::Cache<FTDimensions>* cache,
-                                                    bool psqtOnl) const {
-    featureTransformer->hint_common_access(pos, cache, psqtOnl);
+                                                    bool psqtOnly) const {
+    featureTransformer->hint_common_access(pos, cache, psqtOnly);
 }
 
 template<typename Arch, typename Transformer>
diff --git a/src/nnue/network.h b/src/nnue/network.h
index df59732d955..053b7d19c82 100644
--- a/src/nnue/network.h
+++ b/src/nnue/network.h
@@ -62,7 +62,7 @@ class Network {
 
     void hint_common_access(const Position&                         pos,
                             AccumulatorCaches::Cache<FTDimensions>* cache,
-                            bool                                    psqtOnl) const;
+                            bool                                    psqtOnly) const;
 
     void          verify(std::string evalfilePath) const;
     NnueEvalTrace trace_evaluate(const Position&                         pos,
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index f65385688de..dd313958fe6 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -63,6 +63,7 @@ struct AccumulatorCaches {
             PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
             Bitboard       byColorBB[COLOR_NB][COLOR_NB];
             Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+            bool           psqtOnly;
 
             // To initialize a refresh entry, we set all its bitboards empty,
             // so we put the biases in the accumulation, without any weights on top
@@ -70,6 +71,7 @@ struct AccumulatorCaches {
 
                 std::memset(byColorBB, 0, sizeof(byColorBB));
                 std::memset(byTypeBB, 0, sizeof(byTypeBB));
+                psqtOnly = false;
 
                 std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType));
                 std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType));
@@ -97,11 +99,11 @@ struct AccumulatorCaches {
     template<typename Networks>
     void clear(const Networks& networks) {
         big.clear(networks.big);
+        small.clear(networks.small);
     }
 
-    // When adding a new cache for a network, i.e. the smallnet
-    // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh.
     Cache<TransformedFeatureDimensionsBig> big;
+    Cache<TransformedFeatureDimensionsSmall> small;
 };
 
 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 88f0e4031a4..60957ebeb77 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -656,75 +656,84 @@ class FeatureTransformer {
 
     template<Color Perspective>
     void update_accumulator_refresh_cache(const Position&                           pos,
-                                          AccumulatorCaches::Cache<HalfDimensions>* cache) const {
+                                          AccumulatorCaches::Cache<HalfDimensions>* cache,
+                                          bool psqtOnly) const {
         assert(cache != nullptr);
 
         Square ksq = pos.square<KING>(Perspective);
-
         auto& entry = (*cache)[ksq];
-
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = true;
-        accumulator.computedPSQT[Perspective] = true;
-
         FeatureSet::IndexList removed, added;
-        for (Color c : {WHITE, BLACK})
+
+        if (entry.psqtOnly && !psqtOnly)
         {
-            for (PieceType pt = PAWN; pt <= KING; ++pt)
+            entry.clear(biases);
+            FeatureSet::append_active_indices<Perspective>(pos, added);
+        }
+        else
+        {
+            for (Color c : {WHITE, BLACK})
             {
-                const Piece    piece = make_piece(c, pt);
-                const Bitboard oldBB =
-                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
-                const Bitboard newBB    = pos.pieces(c, pt);
-                Bitboard       toRemove = oldBB & ~newBB;
-                Bitboard       toAdd    = newBB & ~oldBB;
-
-                while (toRemove)
-                {
-                    Square sq = pop_lsb(toRemove);
-                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
-                }
-                while (toAdd)
+                for (PieceType pt = PAWN; pt <= KING; ++pt)
                 {
-                    Square sq = pop_lsb(toAdd);
-                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    const Piece    piece = make_piece(c, pt);
+                    const Bitboard oldBB =
+                      entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                    const Bitboard newBB    = pos.pieces(c, pt);
+                    Bitboard       toRemove = oldBB & ~newBB;
+                    Bitboard       toAdd    = newBB & ~oldBB;
+
+                    while (toRemove)
+                    {
+                        Square sq = pop_lsb(toRemove);
+                        removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
+                    while (toAdd)
+                    {
+                        Square sq = pop_lsb(toAdd);
+                        added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
                 }
             }
         }
 
+        auto& accumulator                     = pos.state()->*accPtr;
+        accumulator.computed[Perspective]     = !psqtOnly;
+        accumulator.computedPSQT[Perspective] = true;
+
 #ifdef VECTOR
         vec_t      acc[NumRegs];
         psqt_vec_t psqt[NumPsqtRegs];
 
-        for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
-        {
-            auto entryTile =
-              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
-            for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = entryTile[k];
-
-            for (int i = 0; i < int(added.size()); ++i)
+        if (!psqtOnly)
+            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
             {
-                IndexType       index  = added[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                auto entryTile =
+                  reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
+                for (IndexType k = 0; k < NumRegs; ++k)
+                    acc[k] = entryTile[k];
 
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
-            }
-            for (int i = 0; i < int(removed.size()); ++i)
-            {
-                IndexType       index  = removed[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                for (int i = 0; i < int(added.size()); ++i)
+                {
+                    IndexType       index  = added[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_sub_16(acc[k], column[k]);
-            }
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_add_16(acc[k], column[k]);
+                }
+                for (int i = 0; i < int(removed.size()); ++i)
+                {
+                    IndexType       index  = removed[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
-            for (IndexType k = 0; k < NumRegs; k++)
-                vec_store(&entryTile[k], acc[k]);
-        }
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_sub_16(acc[k], column[k]);
+                }
+
+                for (IndexType k = 0; k < NumRegs; k++)
+                    vec_store(&entryTile[k], acc[k]);
+            }
 
         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
@@ -760,18 +769,24 @@ class FeatureTransformer {
 
         for (const auto index : added)
         {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] += weights[offset + j];
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[Perspective][j] += weights[offset + j];
+            }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
                 entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
         }
         for (const auto index : removed)
         {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] -= weights[offset + j];
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[Perspective][j] -= weights[offset + j];
+            }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
                 entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
@@ -782,144 +797,20 @@ class FeatureTransformer {
         // The accumulator of the refresh entry has been updated.
         // Now copy its content to the actual accumulator we were refreshing
 
+        if (!psqtOnly)
+            std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+                        sizeof(BiasType) * HalfDimensions);
+
         std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
                     sizeof(int32_t) * PSQTBuckets);
 
-        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
-                    sizeof(BiasType) * HalfDimensions);
-
         for (Color c : {WHITE, BLACK})
             entry.byColorBB[Perspective][c] = pos.pieces(c);
 
         for (PieceType pt = PAWN; pt <= KING; ++pt)
             entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
-    }
-
-    template<Color Perspective>
-    void
-    update_accumulator_refresh(const Position&                                            pos,
-                               [[maybe_unused]] AccumulatorCaches::Cache<HalfDimensions>* cache,
-                               bool psqtOnly) const {
-
-        // When we are refreshing the accumulator of the big net,
-        // redirect to the version of refresh that uses the refresh table.
-        // Using the cache for the small net is not beneficial.
-        if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig)
-        {
-            update_accumulator_refresh_cache<Perspective>(pos, cache);
-            return;
-        }
 
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
-
-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = !psqtOnly;
-        accumulator.computedPSQT[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
-
-#ifdef VECTOR
-        if (!psqtOnly)
-            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
-            {
-                auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
-                for (IndexType k = 0; k < NumRegs; ++k)
-                    acc[k] = biasesTile[k];
-
-                int i = 0;
-                for (; i < int(active.size()) - 1; i += 2)
-                {
-                    IndexType       index0  = active[i];
-                    IndexType       index1  = active[i + 1];
-                    const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                    const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                    auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                    auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
-
-                    for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
-                }
-                for (; i < int(active.size()); ++i)
-                {
-                    IndexType       index  = active[i];
-                    const IndexType offset = HalfDimensions * index + j * TileHeight;
-                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
-
-                    for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], column[k]);
-                }
-
-                auto accTile =
-                  reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-                for (unsigned k = 0; k < NumRegs; k++)
-                    vec_store(&accTile[k], acc[k]);
-            }
-
-        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
-        {
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
-
-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
-            {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
-            }
-            for (; i < int(active.size()); ++i)
-            {
-                IndexType       index  = active[i];
-                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
-                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
-            }
-
-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
-        }
-
-#else
-        if (!psqtOnly)
-            std::memcpy(accumulator.accumulation[Perspective], biases,
-                        HalfDimensions * sizeof(BiasType));
-
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
-
-        for (const auto index : active)
-        {
-            if (!psqtOnly)
-            {
-                const IndexType offset = HalfDimensions * index;
-                for (IndexType j = 0; j < HalfDimensions; ++j)
-                    accumulator.accumulation[Perspective][j] += weights[offset + j];
-            }
-
-            for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                  psqtWeights[index * PSQTBuckets + k];
-        }
-#endif
+        entry.psqtOnly = psqtOnly;
     }
 
     template<Color Perspective>
@@ -948,7 +839,7 @@ class FeatureTransformer {
                                                            psqtOnly);
         }
         else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
     }
 
     template<Color Perspective>
@@ -976,7 +867,7 @@ class FeatureTransformer {
                                                            psqtOnly);
         }
         else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
     }
 
     template<IndexType Size>
diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp
index 51838fefa44..e92dcc71086 100644
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -48,7 +48,7 @@ void hint_common_parent_position(const Position&    pos,
 
     int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
     if (simpleEvalAbs > Eval::SmallNetThreshold)
-        networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold);
+        networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold);
     else
         networks.big.hint_common_access(pos, &caches.big, false);
 }

From a129c0695be921acfbb3f5c966eef756d0b6f843 Mon Sep 17 00:00:00 2001
From: mstembera <m_stembera@yahoo.com>
Date: Sun, 28 Apr 2024 10:28:25 -0700
Subject: [PATCH 05/12] Combine remove and add in
 update_accumulator_refresh_cache()

Combine remove and add in update_accumulator_refresh_cache().
Move remove before add to match other parts of the code.

STC:
https://tests.stockfishchess.org/tests/view/662d96dc6115ff6764c7f4ca
LLR: 2.95 (-2.94,2.94) <0.00,2.00>
Total: 364032 W: 94421 L: 93624 D: 175987
Ptnml(0-2): 1261, 41983, 94811, 42620, 1341

closes https://github.com/official-stockfish/Stockfish/pull/5194

Bench: 1836777
---
 src/evaluate.cpp                    |  5 +--
 src/nnue/nnue_accumulator.h         |  2 +-
 src/nnue/nnue_feature_transformer.h | 53 ++++++++++++++++++-----------
 src/nnue/nnue_misc.cpp              |  3 +-
 4 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 345925f6b2a..fe6b83aa111 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -60,8 +60,9 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     int  nnueComplexity;
     int  v;
 
-    Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
-                          : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
+    Value nnue = smallNet
+                 ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
+                 : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
 
     const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,
                                 int pawnCountMul, int npmConstant, int evalDiv,
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index dd313958fe6..a2b3b98988e 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -102,7 +102,7 @@ struct AccumulatorCaches {
         small.clear(networks.small);
     }
 
-    Cache<TransformedFeatureDimensionsBig> big;
+    Cache<TransformedFeatureDimensionsBig>   big;
     Cache<TransformedFeatureDimensionsSmall> small;
 };
 
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 60957ebeb77..6b3f78a9a4b 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -660,8 +660,8 @@ class FeatureTransformer {
                                           bool psqtOnly) const {
         assert(cache != nullptr);
 
-        Square ksq = pos.square<KING>(Perspective);
-        auto& entry = (*cache)[ksq];
+        Square                ksq   = pos.square<KING>(Perspective);
+        auto&                 entry = (*cache)[ksq];
         FeatureSet::IndexList removed, added;
 
         if (entry.psqtOnly && !psqtOnly)
@@ -712,16 +712,20 @@ class FeatureTransformer {
                 for (IndexType k = 0; k < NumRegs; ++k)
                     acc[k] = entryTile[k];
 
-                for (int i = 0; i < int(added.size()); ++i)
+                int i0 = 0;
+                for (; i0 < int(std::min(removed.size(), added.size())); ++i0)
                 {
-                    IndexType       index  = added[i];
-                    const IndexType offset = HalfDimensions * index + j * TileHeight;
-                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                    IndexType       indexR  = removed[i0];
+                    const IndexType offsetR = HalfDimensions * indexR + j * TileHeight;
+                    auto            columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
+                    IndexType       indexA  = added[i0];
+                    const IndexType offsetA = HalfDimensions * indexA + j * TileHeight;
+                    auto            columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
 
                     for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], column[k]);
+                        acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), columnA[k]);
                 }
-                for (int i = 0; i < int(removed.size()); ++i)
+                for (int i = i0; i < int(removed.size()); ++i)
                 {
                     IndexType       index  = removed[i];
                     const IndexType offset = HalfDimensions * index + j * TileHeight;
@@ -730,6 +734,15 @@ class FeatureTransformer {
                     for (unsigned k = 0; k < NumRegs; ++k)
                         acc[k] = vec_sub_16(acc[k], column[k]);
                 }
+                for (int i = i0; i < int(added.size()); ++i)
+                {
+                    IndexType       index  = added[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_add_16(acc[k], column[k]);
+                }
 
                 for (IndexType k = 0; k < NumRegs; k++)
                     vec_store(&entryTile[k], acc[k]);
@@ -742,23 +755,23 @@ class FeatureTransformer {
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
                 psqt[k] = entryTilePsqt[k];
 
-            for (int i = 0; i < int(added.size()); ++i)
+            for (int i = 0; i < int(removed.size()); ++i)
             {
-                IndexType       index  = added[i];
+                IndexType       index  = removed[i];
                 const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
                 auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
 
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
             }
-            for (int i = 0; i < int(removed.size()); ++i)
+            for (int i = 0; i < int(added.size()); ++i)
             {
-                IndexType       index  = removed[i];
+                IndexType       index  = added[i];
                 const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
                 auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
 
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
+                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
             }
 
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
@@ -767,29 +780,29 @@ class FeatureTransformer {
 
 #else
 
-        for (const auto index : added)
+        for (const auto index : removed)
         {
             if (!psqtOnly)
             {
                 const IndexType offset = HalfDimensions * index;
                 for (IndexType j = 0; j < HalfDimensions; ++j)
-                    entry.accumulation[Perspective][j] += weights[offset + j];
+                    entry.accumulation[Perspective][j] -= weights[offset + j];
             }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
         }
-        for (const auto index : removed)
+        for (const auto index : added)
         {
             if (!psqtOnly)
             {
                 const IndexType offset = HalfDimensions * index;
                 for (IndexType j = 0; j < HalfDimensions; ++j)
-                    entry.accumulation[Perspective][j] -= weights[offset + j];
+                    entry.accumulation[Perspective][j] += weights[offset + j];
             }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
         }
 
 #endif
diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp
index e92dcc71086..21685d0f2a3 100644
--- a/src/nnue/nnue_misc.cpp
+++ b/src/nnue/nnue_misc.cpp
@@ -48,7 +48,8 @@ void hint_common_parent_position(const Position&    pos,
 
     int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
     if (simpleEvalAbs > Eval::SmallNetThreshold)
-        networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold);
+        networks.small.hint_common_access(pos, &caches.small,
+                                          simpleEvalAbs > Eval::PsqtOnlyThreshold);
     else
         networks.big.hint_common_access(pos, &caches.big, false);
 }

From 834e8ff619b212baf402c3922f8fde9af979cd0c Mon Sep 17 00:00:00 2001
From: cj5716 <125858804+cj5716@users.noreply.github.com>
Date: Sun, 28 Apr 2024 08:53:28 +0800
Subject: [PATCH 06/12] Penalise the TT move in multicut

Passed STC:
LLR: 2.99 (-2.94,2.94) <0.00,2.00>
Total: 185504 W: 48079 L: 47533 D: 89892
Ptnml(0-2): 716, 21866, 46988, 22520, 662
https://tests.stockfishchess.org/tests/view/662d9e1d6115ff6764c7f83d

Passed LTC:
LLR: 2.94 (-2.94,2.94) <0.50,2.50>
Total: 75612 W: 19351 L: 18948 D: 37313
Ptnml(0-2): 46, 8363, 20592, 8752, 53
https://tests.stockfishchess.org/tests/view/662dc9dc6115ff6764c80fea

closes https://github.com/official-stockfish/Stockfish/pull/5195

Bench: 1415435
---
 src/search.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/search.cpp b/src/search.cpp
index 11373707b34..ad59b35a545 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1067,7 +1067,12 @@ Value Search::Worker::search(
                 // we assume this expected cut-node is not singular (multiple moves fail high),
                 // and we can prune the whole subtree by returning a softbound.
                 else if (singularBeta >= beta)
+                {
+                    if (!ttCapture)
+                        update_quiet_stats(pos, ss, *this, ttMove, -stat_malus(depth));
+
                     return singularBeta;
+                }
 
                 // Negative extensions
                 // If other moves failed high over (ttValue - margin) without the ttMove on a reduced search,

From 48a3b7c0ee7d32441a5a4519c85bd1e93e467f6e Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Sun, 28 Apr 2024 16:04:28 +0200
Subject: [PATCH 07/12] Simplify non-pawn material divisor to a constant

Passed STC:
https://tests.stockfishchess.org/tests/view/662942603fe04ce4cefc7aba
LLR: 2.93 (-2.94,2.94) <-1.75,0.25>
Total: 272832 W: 70456 L: 70497 D: 131879
Ptnml(0-2): 1020, 32619, 69154, 32628, 995

Passed LTC:
https://tests.stockfishchess.org/tests/view/662dfe3b6115ff6764c829eb
LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 100254 W: 25446 L: 25303 D: 49505
Ptnml(0-2): 121, 11292, 27166, 11419, 129

closes https://github.com/official-stockfish/Stockfish/pull/5198

Bench: 1544645
---
 src/evaluate.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index fe6b83aa111..1d41f3a266b 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -64,14 +64,14 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
                  ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
                  : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
 
-    const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,
-                                int pawnCountMul, int npmConstant, int evalDiv,
-                                int shufflingConstant, int shufflingDiv) {
+    const auto adjustEval = [&](int optDiv, int nnueDiv, int pawnCountConstant, int pawnCountMul,
+                                int npmConstant, int evalDiv, int shufflingConstant,
+                                int shufflingDiv) {
         // Blend optimism and eval with nnue complexity and material imbalance
         optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / optDiv;
         nnue -= nnue * (nnueComplexity * 5 / 3) / nnueDiv;
 
-        int npm = pos.non_pawn_material() / npmDiv;
+        int npm = pos.non_pawn_material() / 64;
         v       = (nnue * (npm + pawnCountConstant + pawnCountMul * pos.count<PAWN>())
              + optimism * (npmConstant + npm))
           / evalDiv;
@@ -82,11 +82,11 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     };
 
     if (!smallNet)
-        adjustEval(524, 32395, 66, 942, 11, 139, 1058, 178, 204);
+        adjustEval(524, 32395, 942, 11, 139, 1058, 178, 204);
     else if (psqtOnly)
-        adjustEval(517, 32857, 65, 908, 7, 155, 1006, 224, 238);
+        adjustEval(517, 32857, 908, 7, 155, 1006, 224, 238);
     else
-        adjustEval(515, 32793, 63, 944, 9, 140, 1067, 206, 206);
+        adjustEval(515, 32793, 944, 9, 140, 1067, 206, 206);
 
     // Guarantee evaluation does not hit the tablebase range
     v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);

From 0fe64286457549d2f80cd7792088375aaa9bee55 Mon Sep 17 00:00:00 2001
From: Stefan Geschwentner <stgeschwentner@gmail.com>
Date: Sun, 28 Apr 2024 16:53:47 +0200
Subject: [PATCH 08/12] More reduction at cut nodes which are not a former PV
 node

But the tt move and first killer are excluded.

This idea is based on the following LMR condition tuning
https://tests.stockfishchess.org/tests/view/66228bed3fe04ce4cefc0c71 by
using only the two largest terms P[0] and P[1].

Passed STC:
LLR: 2.93 (-2.94,2.94) <0.00,2.00>
Total: 173248 W: 45091 L: 44565 D: 83592
Ptnml(0-2): 693, 20534, 43673, 21002, 722
https://tests.stockfishchess.org/tests/view/6629603b3fe04ce4cefc7d37

Passed LTC:
LLR: 2.94 (-2.94,2.94) <0.50,2.50>
Total: 722394 W: 183231 L: 181487 D: 357676
Ptnml(0-2): 462, 80650, 197252, 82348, 485
https://tests.stockfishchess.org/tests/view/662cbe45d46f72253dcff7bf

closes https://github.com/official-stockfish/Stockfish/pull/5199

Bench: 1619613
---
 src/search.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/search.cpp b/src/search.cpp
index ad59b35a545..3718c37813b 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -1123,6 +1123,9 @@ Value Search::Worker::search(
         if (ss->ttPv)
             r -= 1 + (ttValue > alpha) + (tte->depth() >= depth);
 
+        else if (cutNode && move != ttMove && move != ss->killers[0])
+            r++;
+
         // Increase reduction for cut nodes (~4 Elo)
         if (cutNode)
             r += 2 - (tte->depth() >= depth && ss->ttPv);

From 5d720325596699ceba2743776cb39f9cea1754f5 Mon Sep 17 00:00:00 2001
From: Dubslow <bunslow@gmail.com>
Date: Sat, 20 Apr 2024 00:29:01 -0500
Subject: [PATCH 09/12] Use capture history to better judge which sacrifices to
 explore

This idea has been bouncing around a while. @Vizvezdenec tried it a
couple years ago in Stockfish without results, but its recent arrival in
Ethereal inspired him and thence me to try it afresh in Stockfish.

(Also factor out the code now shared with futility pruning for
captures.)

STC:
https://tests.stockfishchess.org/tests/view/662355bc3fe04ce4cefc18ac
LLR: 2.92 (-2.94,2.94) <0.00,2.00>
Total: 45760 W: 11970 L: 11640 D: 22150
Ptnml(0-2): 124, 5371, 11625, 5571, 189

LTC:
https://tests.stockfishchess.org/tests/view/662dda396115ff6764c817c9
LLR: 2.94 (-2.94,2.94) <0.50,2.50>
Total: 243828 W: 62042 L: 61287 D: 120499
Ptnml(0-2): 211, 27202, 66329, 27965, 207

closes https://github.com/official-stockfish/Stockfish/pull/5200

Bench: 1480008
---
 src/search.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/search.cpp b/src/search.cpp
index 3718c37813b..e4f170be61d 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -967,20 +967,22 @@ Value Search::Worker::search(
 
             if (capture || givesCheck)
             {
+                Piece capturedPiece = pos.piece_on(move.to_sq());
+                int   captHist =
+                  thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)];
+
                 // Futility pruning for captures (~2 Elo)
                 if (!givesCheck && lmrDepth < 7 && !ss->inCheck)
                 {
-                    Piece capturedPiece = pos.piece_on(move.to_sq());
-                    Value futilityValue =
-                      ss->staticEval + 285 + 277 * lmrDepth + PieceValue[capturedPiece]
-                      + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)]
-                          / 7;
+                    Value futilityValue = ss->staticEval + 285 + 277 * lmrDepth
+                                        + PieceValue[capturedPiece] + captHist / 7;
                     if (futilityValue <= alpha)
                         continue;
                 }
 
                 // SEE based pruning for captures and checks (~11 Elo)
-                if (!pos.see_ge(move, -203 * depth))
+                int seeHist = std::clamp(captHist / 32, -199 * depth, 199 * depth);
+                if (!pos.see_ge(move, -203 * depth - seeHist))
                     continue;
             }
             else

From eb20de36c05b4101af37b2bf3783c570a47bb1cc Mon Sep 17 00:00:00 2001
From: Ciekce <44617491+Ciekce@users.noreply.github.com>
Date: Mon, 29 Apr 2024 01:45:56 +0100
Subject: [PATCH 10/12] Avoid unnecessary creation of accumulator cache

Saves a (currently) 800 KB allocation and deallocation when running
`eval`. This is not particularly significant and has zero impact on
play, but the allocation is not necessary either.

closes https://github.com/official-stockfish/Stockfish/pull/5201

No functional change
---
 AUTHORS          | 1 +
 src/evaluate.cpp | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index abae401c1ef..36b2b6f7942 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -46,6 +46,7 @@ Bryan Cross (crossbr)
 candirufish
 Chess13234
 Chris Cain (ceebo)
+Ciekce
 clefrks
 Clemens L. (rn5f107s2)
 Cody Ho (aesrentai)
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 1d41f3a266b..e3aa249ca41 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -100,11 +100,11 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
 // Trace scores are from white's point of view
 std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
 
-    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
-
     if (pos.checkers())
         return "Final evaluation: none (in check)";
 
+    auto caches = std::make_unique<Eval::NNUE::AccumulatorCaches>(networks);
+
     std::stringstream ss;
     ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
     ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n';

From 6a9b8a0c7b913b9d4c4474bae7804184d20e8c4a Mon Sep 17 00:00:00 2001
From: cj5716 <125858804+cj5716@users.noreply.github.com>
Date: Sun, 28 Apr 2024 16:33:59 +0800
Subject: [PATCH 11/12] Optimise NNUE Accumulator updates

Passed STC:
https://tests.stockfishchess.org/tests/view/662e3c6a5e9274400985a741
LLR: 2.94 (-2.94,2.94) <0.00,2.00>
Total: 86176 W: 22284 L: 21905 D: 41987
Ptnml(0-2): 254, 9572, 23051, 9963, 248

closes https://github.com/official-stockfish/Stockfish/pull/5202

No functional change
---
 src/nnue/nnue_feature_transformer.h | 76 ++++++++++++++---------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 6b3f78a9a4b..402a47a815d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -404,19 +404,25 @@ class FeatureTransformer {
         return {st, next};
     }
 
-    // NOTE: The parameter states_to_update is an array of position states, ending with nullptr.
+    // NOTE: The parameter states_to_update is an array of position states.
     //       All states must be sequential, that is states_to_update[i] must either be reachable
-    //       by repeatedly applying ->previous from states_to_update[i+1] or
-    //       states_to_update[i] == nullptr.
+    //       by repeatedly applying ->previous from states_to_update[i+1].
     //       computed_st must be reachable by repeatedly applying ->previous on
-    //       states_to_update[0], if not nullptr.
+    //       states_to_update[0].
     template<Color Perspective, size_t N>
     void update_accumulator_incremental(const Position& pos,
                                         StateInfo*      computed_st,
                                         StateInfo*      states_to_update[N],
                                         bool            psqtOnly) const {
         static_assert(N > 0);
-        assert(states_to_update[N - 1] == nullptr);
+        assert([&]() {
+            for (size_t i = 0; i < N; ++i)
+            {
+                if (states_to_update[i] == nullptr)
+                    return false;
+            }
+            return true;
+        }());
 
 #ifdef VECTOR
         // Gcc-10.2 unnecessarily spills AVX2 registers if this array
@@ -425,11 +431,7 @@ class FeatureTransformer {
         psqt_vec_t psqt[NumPsqtRegs];
 #endif
 
-        if (states_to_update[0] == nullptr)
-            return;
-
         // Update incrementally going back through states_to_update.
-
         // Gather all features to be updated.
         const Square ksq = pos.square<KING>(Perspective);
 
@@ -437,28 +439,18 @@ class FeatureTransformer {
         // That might depend on the feature set and generally relies on the
         // feature set's update cost calculation to be correct and never allow
         // updates with more added/removed features than MaxActiveDimensions.
-        FeatureSet::IndexList removed[N - 1], added[N - 1];
+        FeatureSet::IndexList removed[N], added[N];
 
+        for (int i = N - 1; i >= 0; --i)
         {
-            int i =
-              N
-              - 2;  // Last potential state to update. Skip last element because it must be nullptr.
-            while (states_to_update[i] == nullptr)
-                --i;
-
-            StateInfo* st2 = states_to_update[i];
-
-            for (; i >= 0; --i)
-            {
-                (states_to_update[i]->*accPtr).computed[Perspective]     = !psqtOnly;
-                (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true;
+            (states_to_update[i]->*accPtr).computed[Perspective]     = !psqtOnly;
+            (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true;
 
-                const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
+            const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
 
-                for (; st2 != end_state; st2 = st2->previous)
-                    FeatureSet::append_changed_indices<Perspective>(ksq, st2->dirtyPiece,
-                                                                    removed[i], added[i]);
-            }
+            for (StateInfo* st2 = states_to_update[i]; st2 != end_state; st2 = st2->previous)
+                FeatureSet::append_changed_indices<Perspective>(ksq, st2->dirtyPiece, removed[i],
+                                                                added[i]);
         }
 
         StateInfo* st = computed_st;
@@ -466,8 +458,7 @@ class FeatureTransformer {
         // Now update the accumulators listed in states_to_update[], where the last element is a sentinel.
 #ifdef VECTOR
 
-        if (states_to_update[1] == nullptr && (removed[0].size() == 1 || removed[0].size() == 2)
-            && added[0].size() == 1)
+        if (N == 1 && (removed[0].size() == 1 || removed[0].size() == 2) && added[0].size() == 1)
         {
             assert(states_to_update[0]);
 
@@ -541,7 +532,7 @@ class FeatureTransformer {
                     for (IndexType k = 0; k < NumRegs; ++k)
                         acc[k] = vec_load(&accTileIn[k]);
 
-                    for (IndexType i = 0; states_to_update[i]; ++i)
+                    for (IndexType i = 0; i < N; ++i)
                     {
                         // Difference calculation for the deactivated features
                         for (const auto index : removed[i])
@@ -578,7 +569,7 @@ class FeatureTransformer {
                 for (std::size_t k = 0; k < NumPsqtRegs; ++k)
                     psqt[k] = vec_load_psqt(&accTilePsqtIn[k]);
 
-                for (IndexType i = 0; states_to_update[i]; ++i)
+                for (IndexType i = 0; i < N; ++i)
                 {
                     // Difference calculation for the deactivated features
                     for (const auto index : removed[i])
@@ -608,7 +599,7 @@ class FeatureTransformer {
             }
         }
 #else
-        for (IndexType i = 0; states_to_update[i]; ++i)
+        for (IndexType i = 0; i < N; ++i)
         {
             if (!psqtOnly)
                 std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective],
@@ -847,8 +838,8 @@ class FeatureTransformer {
             || (psqtOnly && (oldest_st->*accPtr).computedPSQT[Perspective]))
         {
             // Only update current position accumulator to minimize work.
-            StateInfo* states_to_update[2] = {pos.state(), nullptr};
-            update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update,
+            StateInfo* states_to_update[1] = {pos.state()};
+            update_accumulator_incremental<Perspective, 1>(pos, oldest_st, states_to_update,
                                                            psqtOnly);
         }
         else
@@ -873,11 +864,20 @@ class FeatureTransformer {
             //     1. for the current position
             //     2. the next accumulator after the computed one
             // The heuristic may change in the future.
-            StateInfo* states_to_update[3] = {next, next == pos.state() ? nullptr : pos.state(),
-                                              nullptr};
+            if (next == pos.state())
+            {
+                StateInfo* states_to_update[1] = {next};
 
-            update_accumulator_incremental<Perspective, 3>(pos, oldest_st, states_to_update,
-                                                           psqtOnly);
+                update_accumulator_incremental<Perspective, 1>(pos, oldest_st, states_to_update,
+                                                               psqtOnly);
+            }
+            else
+            {
+                StateInfo* states_to_update[2] = {next, pos.state()};
+
+                update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update,
+                                                               psqtOnly);
+            }
         }
         else
             update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);

From be142337d843ef3afc675e27628ab8e896c32cce Mon Sep 17 00:00:00 2001
From: mstembera <m_stembera@yahoo.com>
Date: Mon, 29 Apr 2024 20:37:54 -0700
Subject: [PATCH 12/12] Accumulator cache bugfix and cleanup

STC:
https://tests.stockfishchess.org/tests/view/663068913a05f1bf7a511dc2
LLR: 2.98 (-2.94,2.94) <-1.75,0.25>
Total: 70304 W: 18211 L: 18026 D: 34067
Ptnml(0-2): 232, 7966, 18582, 8129, 243

1) Fixes a bug introduced in
   https://github.com/official-stockfish/Stockfish/pull/5194. Only one
   psqtOnly flag was used for two perspectives which was causing
   wrong entries to be cleared and marked.
2) The finny caches should be cleared like histories and not at the
   start of every search.

closes https://github.com/official-stockfish/Stockfish/pull/5203

No functional change
---
 src/nnue/nnue_accumulator.h         | 28 ++++++++++++---------------
 src/nnue/nnue_feature_transformer.h | 30 ++++++++++++++---------------
 src/search.cpp                      |  5 ++---
 3 files changed, 28 insertions(+), 35 deletions(-)

diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index a2b3b98988e..179feba553e 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -59,31 +59,27 @@ struct AccumulatorCaches {
     struct alignas(CacheLineSize) Cache {
 
         struct alignas(CacheLineSize) Entry {
-            BiasType       accumulation[COLOR_NB][Size];
-            PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
-            Bitboard       byColorBB[COLOR_NB][COLOR_NB];
-            Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+            BiasType       accumulation[Size];
+            PSQTWeightType psqtAccumulation[PSQTBuckets];
+            Bitboard       byColorBB[COLOR_NB];
+            Bitboard       byTypeBB[PIECE_TYPE_NB];
             bool           psqtOnly;
 
             // To initialize a refresh entry, we set all its bitboards empty,
             // so we put the biases in the accumulation, without any weights on top
             void clear(const BiasType* biases) {
 
-                std::memset(byColorBB, 0, sizeof(byColorBB));
-                std::memset(byTypeBB, 0, sizeof(byTypeBB));
-                psqtOnly = false;
-
-                std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType));
-                std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType));
-
-                std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation));
+                std::memcpy(accumulation, biases, sizeof(accumulation));
+                std::memset((uint8_t*) this + offsetof(Entry, psqtAccumulation), 0,
+                            sizeof(Entry) - offsetof(Entry, psqtAccumulation));
             }
         };
 
         template<typename Network>
         void clear(const Network& network) {
-            for (auto& entry : entries)
-                entry.clear(network.featureTransformer->biases);
+            for (auto& entries1D : entries)
+                for (auto& entry : entries1D)
+                    entry.clear(network.featureTransformer->biases);
         }
 
         void clear(const BiasType* biases) {
@@ -91,9 +87,9 @@ struct AccumulatorCaches {
                 entry.clear(biases);
         }
 
-        Entry& operator[](Square sq) { return entries[sq]; }
+        std::array<Entry, COLOR_NB>& operator[](Square sq) { return entries[sq]; }
 
-        std::array<Entry, SQUARE_NB> entries;
+        std::array<std::array<Entry, COLOR_NB>, SQUARE_NB> entries;
     };
 
     template<typename Networks>
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 402a47a815d..4647ecd066d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -652,7 +652,7 @@ class FeatureTransformer {
         assert(cache != nullptr);
 
         Square                ksq   = pos.square<KING>(Perspective);
-        auto&                 entry = (*cache)[ksq];
+        auto&                 entry = (*cache)[ksq][Perspective];
         FeatureSet::IndexList removed, added;
 
         if (entry.psqtOnly && !psqtOnly)
@@ -666,9 +666,8 @@ class FeatureTransformer {
             {
                 for (PieceType pt = PAWN; pt <= KING; ++pt)
                 {
-                    const Piece    piece = make_piece(c, pt);
-                    const Bitboard oldBB =
-                      entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                    const Piece    piece    = make_piece(c, pt);
+                    const Bitboard oldBB    = entry.byColorBB[c] & entry.byTypeBB[pt];
                     const Bitboard newBB    = pos.pieces(c, pt);
                     Bitboard       toRemove = oldBB & ~newBB;
                     Bitboard       toAdd    = newBB & ~oldBB;
@@ -698,8 +697,7 @@ class FeatureTransformer {
         if (!psqtOnly)
             for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
             {
-                auto entryTile =
-                  reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
+                auto entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * TileHeight]);
                 for (IndexType k = 0; k < NumRegs; ++k)
                     acc[k] = entryTile[k];
 
@@ -741,8 +739,8 @@ class FeatureTransformer {
 
         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
-            auto entryTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]);
+            auto entryTilePsqt =
+              reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * PsqtTileHeight]);
             for (std::size_t k = 0; k < NumPsqtRegs; ++k)
                 psqt[k] = entryTilePsqt[k];
 
@@ -777,11 +775,11 @@ class FeatureTransformer {
             {
                 const IndexType offset = HalfDimensions * index;
                 for (IndexType j = 0; j < HalfDimensions; ++j)
-                    entry.accumulation[Perspective][j] -= weights[offset + j];
+                    entry.accumulation[j] -= weights[offset + j];
             }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[k] -= psqtWeights[index * PSQTBuckets + k];
         }
         for (const auto index : added)
         {
@@ -789,11 +787,11 @@ class FeatureTransformer {
             {
                 const IndexType offset = HalfDimensions * index;
                 for (IndexType j = 0; j < HalfDimensions; ++j)
-                    entry.accumulation[Perspective][j] += weights[offset + j];
+                    entry.accumulation[j] += weights[offset + j];
             }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
+                entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k];
         }
 
 #endif
@@ -802,17 +800,17 @@ class FeatureTransformer {
         // Now copy its content to the actual accumulator we were refreshing
 
         if (!psqtOnly)
-            std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+            std::memcpy(accumulator.accumulation[Perspective], entry.accumulation,
                         sizeof(BiasType) * HalfDimensions);
 
-        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
+        std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation,
                     sizeof(int32_t) * PSQTBuckets);
 
         for (Color c : {WHITE, BLACK})
-            entry.byColorBB[Perspective][c] = pos.pieces(c);
+            entry.byColorBB[c] = pos.pieces(c);
 
         for (PieceType pt = PAWN; pt <= KING; ++pt)
-            entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
+            entry.byTypeBB[pt] = pos.pieces(pt);
 
         entry.psqtOnly = psqtOnly;
     }
diff --git a/src/search.cpp b/src/search.cpp
index e4f170be61d..b8e515f0267 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -147,9 +147,6 @@ Search::Worker::Worker(SharedState&                    sharedState,
 
 void Search::Worker::start_searching() {
 
-    // Initialize accumulator refresh entries
-    refreshTable.clear(networks);
-
     // Non-main threads go directly to iterative_deepening()
     if (!is_mainthread())
     {
@@ -506,6 +503,8 @@ void Search::Worker::clear() {
 
     for (size_t i = 1; i < reductions.size(); ++i)
         reductions[i] = int((20.14 + std::log(size_t(options["Threads"])) / 2) * std::log(i));
+
+    refreshTable.clear(networks);
 }