From 886ed90ec3599cdf0dc4e7d07b0543a27028c6c0 Mon Sep 17 00:00:00 2001 From: xoto10 <23479932+xoto10@users.noreply.github.com> Date: Sun, 28 Apr 2024 16:27:40 +0100 Subject: [PATCH 01/12] Use less time on recaptures Credit for the idea goes to peregrine on discord. Passed STC 10+0.1: https://tests.stockfishchess.org/tests/view/662652623fe04ce4cefc48cf LLR: 2.95 (-2.94,2.94) <0.00,2.00> Total: 75712 W: 19793 L: 19423 D: 36496 Ptnml(0-2): 258, 8487, 20023, 8803, 285 Passed LTC 60+0.6: https://tests.stockfishchess.org/tests/view/6627495e3fe04ce4cefc59b6 LLR: 2.94 (-2.94,2.94) <0.50,2.50> Total: 49788 W: 12743 L: 12404 D: 24641 Ptnml(0-2): 29, 5141, 14215, 5480, 29 The code was updated slightly and tested for non-regression against the original code at STC: LLR: 2.94 (-2.94,2.94) <-1.75,0.25> Total: 41952 W: 10912 L: 10698 D: 20342 Ptnml(0-2): 133, 4825, 10835, 5061, 122 https://tests.stockfishchess.org/tests/view/662d84f56115ff6764c7e438 closes https://github.com/official-stockfish/Stockfish/pull/5189 Bench: 1836777 --- src/engine.cpp | 12 ++++++++++-- src/engine.h | 11 +++++++---- src/search.cpp | 7 ++++--- src/search.h | 4 ++-- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index 4625e00a816..72a37ce9b0b 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -53,6 +53,7 @@ Engine::Engine(std::string path) : NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG), NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))) { pos.set(StartFEN, false, &states->back()); + capSq = SQ_NONE; } std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960) { @@ -61,9 +62,10 @@ std::uint64_t Engine::perft(const std::string& fen, Depth depth, bool isChess960 return Benchmark::perft(fen, depth, isChess960); } -void Engine::go(const Search::LimitsType& limits) { +void Engine::go(Search::LimitsType& limits) { assert(limits.perft == 0); verify_networks(); + limits.capSq = capSq; threads.start_thinking(options, pos, states, limits); } @@ -102,6 +104,7 @@ void Engine::set_position(const std::string& fen, const std::vector states = StateListPtr(new std::deque(1)); pos.set(fen, options["UCI_Chess960"], &states->back()); + capSq = SQ_NONE; for (const auto& move : moves) { auto m = UCIEngine::to_move(pos, move); @@ -111,6 +114,11 @@ void Engine::set_position(const std::string& fen, const std::vector states->emplace_back(); pos.do_move(m, states->back()); + + capSq = SQ_NONE; + DirtyPiece& dp = states->back().dirtyPiece; + if (dp.dirty_num > 1 && dp.to[1] == SQ_NONE) + capSq = m.to_sq(); } } @@ -172,4 +180,4 @@ std::string Engine::visualize() const { return ss.str(); } -} \ No newline at end of file +} diff --git a/src/engine.h b/src/engine.h index 041f5678585..64a814cb4aa 100644 --- a/src/engine.h +++ b/src/engine.h @@ -20,24 +20,26 @@ #define ENGINE_H_INCLUDED #include +#include #include #include #include #include #include #include -#include #include "nnue/network.h" #include "position.h" #include "search.h" +#include "syzygy/tbprobe.h" // for Stockfish::Depth #include "thread.h" #include "tt.h" #include "ucioption.h" -#include "syzygy/tbprobe.h" // for Stockfish::Depth namespace Stockfish { +enum Square : int; + class Engine { public: using InfoShort = Search::InfoShort; @@ -50,7 +52,7 @@ class Engine { std::uint64_t perft(const std::string& fen, Depth depth, bool isChess960); // non blocking call to start searching - void go(const Search::LimitsType&); + void go(Search::LimitsType&); // non blocking call to stop searching void stop(); @@ -92,6 +94,7 @@ class Engine { Position pos; StateListPtr states; + Square capSq; OptionsMap options; ThreadPool threads; @@ -104,4 +107,4 @@ class Engine { } // namespace Stockfish -#endif // #ifndef ENGINE_H_INCLUDED \ No newline at end of file +#endif // #ifndef ENGINE_H_INCLUDED diff --git a/src/search.cpp b/src/search.cpp index 893daab20e6..396e5aa06c8 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -54,8 +54,8 @@ using namespace Search; namespace { -static constexpr double EvalLevel[10] = {1.043, 1.017, 0.952, 1.009, 0.971, - 1.002, 0.992, 0.947, 1.046, 1.001}; +static constexpr double EvalLevel[10] = {0.981, 0.956, 0.895, 0.949, 0.913, + 0.942, 0.933, 0.890, 0.984, 0.941}; // Futility margin Value futility_margin(Depth d, bool noTtCutNode, bool improving, bool oppWorsening) { @@ -446,9 +446,10 @@ void Search::Worker::iterative_deepening() { double reduction = (1.48 + mainThread->previousTimeReduction) / (2.17 * timeReduction); double bestMoveInstability = 1 + 1.88 * totBestMoveChanges / threads.size(); int el = std::clamp((bestValue + 750) / 150, 0, 9); + double recapture = limits.capSq == rootMoves[0].pv[0].to_sq() ? 0.955 : 1.005; double totalTime = mainThread->tm.optimum() * fallingEval * reduction - * bestMoveInstability * EvalLevel[el]; + * bestMoveInstability * EvalLevel[el] * recapture; // Cap used time in case of a single legal move for a better viewer experience if (rootMoves.size() == 1) diff --git a/src/search.h b/src/search.h index 0fd778b47e6..9b3528c8741 100644 --- a/src/search.h +++ b/src/search.h @@ -109,8 +109,7 @@ struct RootMove { using RootMoves = std::vector; -// LimitsType struct stores information sent by GUI about available time to -// search the current move, maximum depth/time, or if we are in analysis mode. +// LimitsType struct stores information sent by the caller about the analysis required. struct LimitsType { // Init explicitly due to broken value-initialization of non POD in MSVC @@ -128,6 +127,7 @@ struct LimitsType { int movestogo, depth, mate, perft, infinite; uint64_t nodes; bool ponderMode; + Square capSq; }; From 3502c8ae426506453ca64e87e48d962b327c2356 Mon Sep 17 00:00:00 2001 From: Disservin Date: Thu, 25 Apr 2024 19:20:57 +0200 Subject: [PATCH 02/12] Fix missing initialization of AccumulatorCaches in Eval::trace Add a constructor to `AccumulatorCaches` instead of just calling `clear(networks)` to prevent similar issues from appearing in the future. fixes https://github.com/official-stockfish/Stockfish/issues/5190 closes https://github.com/official-stockfish/Stockfish/pull/5191 No functional change --- src/evaluate.cpp | 2 +- src/nnue/nnue_accumulator.h | 5 +++++ src/search.cpp | 4 ++-- src/search.h | 7 +++---- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index f5746ca5199..6e101e7830a 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -99,7 +99,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, // Trace scores are from white's point of view std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) { - auto caches = std::make_unique(); + auto caches = std::make_unique(networks); if (pos.checkers()) return "Final evaluation: none (in check)"; diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index 8d73dbef5ad..f65385688de 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -50,6 +50,11 @@ struct alignas(CacheLineSize) Accumulator { // is commonly referred to as "Finny Tables". struct AccumulatorCaches { + template + AccumulatorCaches(const Networks& networks) { + clear(networks); + } + template struct alignas(CacheLineSize) Cache { diff --git a/src/search.cpp b/src/search.cpp index 396e5aa06c8..11373707b34 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -137,11 +137,11 @@ Search::Worker::Worker(SharedState& sharedState, // Unpack the SharedState struct into member variables thread_idx(thread_id), manager(std::move(sm)), - refreshTable(), options(sharedState.options), threads(sharedState.threads), tt(sharedState.tt), - networks(sharedState.networks) { + networks(sharedState.networks), + refreshTable(networks) { clear(); } diff --git a/src/search.h b/src/search.h index 9b3528c8741..444e3b8bb1d 100644 --- a/src/search.h +++ b/src/search.h @@ -302,15 +302,14 @@ class Worker { Tablebases::Config tbConfig; - // Used by NNUE - - Eval::NNUE::AccumulatorCaches refreshTable; - const OptionsMap& options; ThreadPool& threads; TranspositionTable& tt; const Eval::NNUE::Networks& networks; + // Used by NNUE + Eval::NNUE::AccumulatorCaches refreshTable; + friend class Stockfish::ThreadPool; friend class SearchManager; }; From bc45cbc820a53a9fc405c06ca67bd7be3970344e Mon Sep 17 00:00:00 2001 From: Joost VandeVondele Date: Sat, 27 Apr 2024 18:09:45 +0200 Subject: [PATCH 03/12] Output some basic info about the used networks Adds size in memory as well as layer sizes as in info string NNUE evaluation using nn-ae6a388e4a1a.nnue (132MiB, (22528, 3072, 15, 32, 1)) info string NNUE evaluation using nn-baff1ede1f90.nnue (6MiB, (22528, 128, 15, 32, 1)) For example, the size in MiB is useful to keep the fishtest memory sizes up-to-date, the L1-L3 sizes give a useful hint about the architecture used. closes https://github.com/official-stockfish/Stockfish/pull/5193 No functional change --- src/nnue/network.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index 656ad97a1e3..42320bae1ab 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -252,7 +252,11 @@ void Network::verify(std::string evalfilePath) const { exit(EXIT_FAILURE); } - sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl; + size_t size = sizeof(*featureTransformer) + sizeof(*network) * LayerStacks; + sync_cout << "info string NNUE evaluation using " << evalfilePath << " (" + << size / (1024 * 1024) << "MiB, (" << featureTransformer->InputDimensions << ", " + << network[0]->TransformedFeatureDimensions << ", " << network[0]->FC_0_OUTPUTS + << ", " << network[0]->FC_1_OUTPUTS << ", 1))" << sync_endl; } From 940a3a7383f48cea7aacbbe335671aa0d3ead1ae Mon Sep 17 00:00:00 2001 From: mstembera Date: Thu, 25 Apr 2024 18:20:08 -0700 Subject: [PATCH 04/12] Cache small net w/ psqtOnly support Caching the small net in the same way as the big net allows them to share the same code path and completely removes update_accumulator_refresh(). STC: https://tests.stockfishchess.org/tests/view/662bfb5ed46f72253dcfed85 LLR: 2.94 (-2.94,2.94) <-1.75,0.25> Total: 151712 W: 39252 L: 39158 D: 73302 Ptnml(0-2): 565, 17474, 39683, 17570, 564 closes https://github.com/official-stockfish/Stockfish/pull/5194 Bench: 1836777 --- src/evaluate.cpp | 2 +- src/nnue/network.cpp | 4 +- src/nnue/network.h | 2 +- src/nnue/nnue_accumulator.h | 6 +- src/nnue/nnue_feature_transformer.h | 263 ++++++++-------------------- src/nnue/nnue_misc.cpp | 2 +- 6 files changed, 86 insertions(+), 193 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 6e101e7830a..345925f6b2a 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -60,7 +60,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, int nnueComplexity; int v; - Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly) + Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly) : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false); const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant, diff --git a/src/nnue/network.cpp b/src/nnue/network.cpp index 42320bae1ab..2eca18bd15d 100644 --- a/src/nnue/network.cpp +++ b/src/nnue/network.cpp @@ -263,8 +263,8 @@ void Network::verify(std::string evalfilePath) const { template void Network::hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache, - bool psqtOnl) const { - featureTransformer->hint_common_access(pos, cache, psqtOnl); + bool psqtOnly) const { + featureTransformer->hint_common_access(pos, cache, psqtOnly); } template diff --git a/src/nnue/network.h b/src/nnue/network.h index df59732d955..053b7d19c82 100644 --- a/src/nnue/network.h +++ b/src/nnue/network.h @@ -62,7 +62,7 @@ class Network { void hint_common_access(const Position& pos, AccumulatorCaches::Cache* cache, - bool psqtOnl) const; + bool psqtOnly) const; void verify(std::string evalfilePath) const; NnueEvalTrace trace_evaluate(const Position& pos, diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index f65385688de..dd313958fe6 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -63,6 +63,7 @@ struct AccumulatorCaches { PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets]; Bitboard byColorBB[COLOR_NB][COLOR_NB]; Bitboard byTypeBB[COLOR_NB][PIECE_TYPE_NB]; + bool psqtOnly; // To initialize a refresh entry, we set all its bitboards empty, // so we put the biases in the accumulation, without any weights on top @@ -70,6 +71,7 @@ struct AccumulatorCaches { std::memset(byColorBB, 0, sizeof(byColorBB)); std::memset(byTypeBB, 0, sizeof(byTypeBB)); + psqtOnly = false; std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType)); std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType)); @@ -97,11 +99,11 @@ struct AccumulatorCaches { template void clear(const Networks& networks) { big.clear(networks.big); + small.clear(networks.small); } - // When adding a new cache for a network, i.e. the smallnet - // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh. Cache big; + Cache small; }; } // namespace Stockfish::Eval::NNUE diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 88f0e4031a4..60957ebeb77 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -656,75 +656,84 @@ class FeatureTransformer { template void update_accumulator_refresh_cache(const Position& pos, - AccumulatorCaches::Cache* cache) const { + AccumulatorCaches::Cache* cache, + bool psqtOnly) const { assert(cache != nullptr); Square ksq = pos.square(Perspective); - auto& entry = (*cache)[ksq]; - - auto& accumulator = pos.state()->*accPtr; - accumulator.computed[Perspective] = true; - accumulator.computedPSQT[Perspective] = true; - FeatureSet::IndexList removed, added; - for (Color c : {WHITE, BLACK}) + + if (entry.psqtOnly && !psqtOnly) { - for (PieceType pt = PAWN; pt <= KING; ++pt) + entry.clear(biases); + FeatureSet::append_active_indices(pos, added); + } + else + { + for (Color c : {WHITE, BLACK}) { - const Piece piece = make_piece(c, pt); - const Bitboard oldBB = - entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt]; - const Bitboard newBB = pos.pieces(c, pt); - Bitboard toRemove = oldBB & ~newBB; - Bitboard toAdd = newBB & ~oldBB; - - while (toRemove) - { - Square sq = pop_lsb(toRemove); - removed.push_back(FeatureSet::make_index(sq, piece, ksq)); - } - while (toAdd) + for (PieceType pt = PAWN; pt <= KING; ++pt) { - Square sq = pop_lsb(toAdd); - added.push_back(FeatureSet::make_index(sq, piece, ksq)); + const Piece piece = make_piece(c, pt); + const Bitboard oldBB = + entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt]; + const Bitboard newBB = pos.pieces(c, pt); + Bitboard toRemove = oldBB & ~newBB; + Bitboard toAdd = newBB & ~oldBB; + + while (toRemove) + { + Square sq = pop_lsb(toRemove); + removed.push_back(FeatureSet::make_index(sq, piece, ksq)); + } + while (toAdd) + { + Square sq = pop_lsb(toAdd); + added.push_back(FeatureSet::make_index(sq, piece, ksq)); + } } } } + auto& accumulator = pos.state()->*accPtr; + accumulator.computed[Perspective] = !psqtOnly; + accumulator.computedPSQT[Perspective] = true; + #ifdef VECTOR vec_t acc[NumRegs]; psqt_vec_t psqt[NumPsqtRegs]; - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto entryTile = - reinterpret_cast(&entry.accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = entryTile[k]; - - for (int i = 0; i < int(added.size()); ++i) + if (!psqtOnly) + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { - IndexType index = added[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); + auto entryTile = + reinterpret_cast(&entry.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = entryTile[k]; - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - for (int i = 0; i < int(removed.size()); ++i) - { - IndexType index = removed[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); + for (int i = 0; i < int(added.size()); ++i) + { + IndexType index = added[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); - } + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + for (int i = 0; i < int(removed.size()); ++i) + { + IndexType index = removed[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; k++) - vec_store(&entryTile[k], acc[k]); - } + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + + for (IndexType k = 0; k < NumRegs; k++) + vec_store(&entryTile[k], acc[k]); + } for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) { @@ -760,18 +769,24 @@ class FeatureTransformer { for (const auto index : added) { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] += weights[offset + j]; + if (!psqtOnly) + { + const IndexType offset = HalfDimensions * index; + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[Perspective][j] += weights[offset + j]; + } for (std::size_t k = 0; k < PSQTBuckets; ++k) entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } for (const auto index : removed) { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] -= weights[offset + j]; + if (!psqtOnly) + { + const IndexType offset = HalfDimensions * index; + for (IndexType j = 0; j < HalfDimensions; ++j) + entry.accumulation[Perspective][j] -= weights[offset + j]; + } for (std::size_t k = 0; k < PSQTBuckets; ++k) entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; @@ -782,144 +797,20 @@ class FeatureTransformer { // The accumulator of the refresh entry has been updated. // Now copy its content to the actual accumulator we were refreshing + if (!psqtOnly) + std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective], + sizeof(BiasType) * HalfDimensions); + std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective], sizeof(int32_t) * PSQTBuckets); - std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective], - sizeof(BiasType) * HalfDimensions); - for (Color c : {WHITE, BLACK}) entry.byColorBB[Perspective][c] = pos.pieces(c); for (PieceType pt = PAWN; pt <= KING; ++pt) entry.byTypeBB[Perspective][pt] = pos.pieces(pt); - } - - template - void - update_accumulator_refresh(const Position& pos, - [[maybe_unused]] AccumulatorCaches::Cache* cache, - bool psqtOnly) const { - - // When we are refreshing the accumulator of the big net, - // redirect to the version of refresh that uses the refresh table. - // Using the cache for the small net is not beneficial. - if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig) - { - update_accumulator_refresh_cache(pos, cache); - return; - } -#ifdef VECTOR - // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; -#endif - - // Refresh the accumulator - // Could be extracted to a separate function because it's done in 2 places, - // but it's unclear if compilers would correctly handle register allocation. - auto& accumulator = pos.state()->*accPtr; - accumulator.computed[Perspective] = !psqtOnly; - accumulator.computedPSQT[Perspective] = true; - FeatureSet::IndexList active; - FeatureSet::append_active_indices(pos, active); - -#ifdef VECTOR - if (!psqtOnly) - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto biasesTile = reinterpret_cast(&biases[j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasesTile[k]; - - int i = 0; - for (; i < int(active.size()) - 1; i += 2) - { - IndexType index0 = active[i]; - IndexType index1 = active[i + 1]; - const IndexType offset0 = HalfDimensions * index0 + j * TileHeight; - const IndexType offset1 = HalfDimensions * index1 + j * TileHeight; - auto column0 = reinterpret_cast(&weights[offset0]); - auto column1 = reinterpret_cast(&weights[offset1]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k])); - } - for (; i < int(active.size()); ++i) - { - IndexType index = active[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - - auto accTile = - reinterpret_cast(&accumulator.accumulation[Perspective][j * TileHeight]); - for (unsigned k = 0; k < NumRegs; k++) - vec_store(&accTile[k], acc[k]); - } - - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) - { - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_zero_psqt(); - - int i = 0; - for (; i < int(active.size()) - 1; i += 2) - { - IndexType index0 = active[i]; - IndexType index1 = active[i + 1]; - const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight; - const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight; - auto columnPsqt0 = reinterpret_cast(&psqtWeights[offset0]); - auto columnPsqt1 = reinterpret_cast(&psqtWeights[offset1]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = - vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k])); - } - for (; i < int(active.size()); ++i) - { - IndexType index = active[i]; - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); - } - - auto accTilePsqt = reinterpret_cast( - &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqt[k], psqt[k]); - } - -#else - if (!psqtOnly) - std::memcpy(accumulator.accumulation[Perspective], biases, - HalfDimensions * sizeof(BiasType)); - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[Perspective][k] = 0; - - for (const auto index : active) - { - if (!psqtOnly) - { - const IndexType offset = HalfDimensions * index; - for (IndexType j = 0; j < HalfDimensions; ++j) - accumulator.accumulation[Perspective][j] += weights[offset + j]; - } - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[Perspective][k] += - psqtWeights[index * PSQTBuckets + k]; - } -#endif + entry.psqtOnly = psqtOnly; } template @@ -948,7 +839,7 @@ class FeatureTransformer { psqtOnly); } else - update_accumulator_refresh(pos, cache, psqtOnly); + update_accumulator_refresh_cache(pos, cache, psqtOnly); } template @@ -976,7 +867,7 @@ class FeatureTransformer { psqtOnly); } else - update_accumulator_refresh(pos, cache, psqtOnly); + update_accumulator_refresh_cache(pos, cache, psqtOnly); } template diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp index 51838fefa44..e92dcc71086 100644 --- a/src/nnue/nnue_misc.cpp +++ b/src/nnue/nnue_misc.cpp @@ -48,7 +48,7 @@ void hint_common_parent_position(const Position& pos, int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move())); if (simpleEvalAbs > Eval::SmallNetThreshold) - networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold); + networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold); else networks.big.hint_common_access(pos, &caches.big, false); } From a129c0695be921acfbb3f5c966eef756d0b6f843 Mon Sep 17 00:00:00 2001 From: mstembera Date: Sun, 28 Apr 2024 10:28:25 -0700 Subject: [PATCH 05/12] Combine remove and add in update_accumulator_refresh_cache() Combine remove and add in update_accumulator_refresh_cache(). Move remove before add to match other parts of the code. STC: https://tests.stockfishchess.org/tests/view/662d96dc6115ff6764c7f4ca LLR: 2.95 (-2.94,2.94) <0.00,2.00> Total: 364032 W: 94421 L: 93624 D: 175987 Ptnml(0-2): 1261, 41983, 94811, 42620, 1341 closes https://github.com/official-stockfish/Stockfish/pull/5194 Bench: 1836777 --- src/evaluate.cpp | 5 +-- src/nnue/nnue_accumulator.h | 2 +- src/nnue/nnue_feature_transformer.h | 53 ++++++++++++++++++----------- src/nnue/nnue_misc.cpp | 3 +- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 345925f6b2a..fe6b83aa111 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -60,8 +60,9 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, int nnueComplexity; int v; - Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly) - : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false); + Value nnue = smallNet + ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly) + : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false); const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant, int pawnCountMul, int npmConstant, int evalDiv, diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index dd313958fe6..a2b3b98988e 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -102,7 +102,7 @@ struct AccumulatorCaches { small.clear(networks.small); } - Cache big; + Cache big; Cache small; }; diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 60957ebeb77..6b3f78a9a4b 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -660,8 +660,8 @@ class FeatureTransformer { bool psqtOnly) const { assert(cache != nullptr); - Square ksq = pos.square(Perspective); - auto& entry = (*cache)[ksq]; + Square ksq = pos.square(Perspective); + auto& entry = (*cache)[ksq]; FeatureSet::IndexList removed, added; if (entry.psqtOnly && !psqtOnly) @@ -712,16 +712,20 @@ class FeatureTransformer { for (IndexType k = 0; k < NumRegs; ++k) acc[k] = entryTile[k]; - for (int i = 0; i < int(added.size()); ++i) + int i0 = 0; + for (; i0 < int(std::min(removed.size(), added.size())); ++i0) { - IndexType index = added[i]; - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); + IndexType indexR = removed[i0]; + const IndexType offsetR = HalfDimensions * indexR + j * TileHeight; + auto columnR = reinterpret_cast(&weights[offsetR]); + IndexType indexA = added[i0]; + const IndexType offsetA = HalfDimensions * indexA + j * TileHeight; + auto columnA = reinterpret_cast(&weights[offsetA]); for (unsigned k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); + acc[k] = vec_add_16(vec_sub_16(acc[k], columnR[k]), columnA[k]); } - for (int i = 0; i < int(removed.size()); ++i) + for (int i = i0; i < int(removed.size()); ++i) { IndexType index = removed[i]; const IndexType offset = HalfDimensions * index + j * TileHeight; @@ -730,6 +734,15 @@ class FeatureTransformer { for (unsigned k = 0; k < NumRegs; ++k) acc[k] = vec_sub_16(acc[k], column[k]); } + for (int i = i0; i < int(added.size()); ++i) + { + IndexType index = added[i]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } for (IndexType k = 0; k < NumRegs; k++) vec_store(&entryTile[k], acc[k]); @@ -742,23 +755,23 @@ class FeatureTransformer { for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = entryTilePsqt[k]; - for (int i = 0; i < int(added.size()); ++i) + for (int i = 0; i < int(removed.size()); ++i) { - IndexType index = added[i]; + IndexType index = removed[i]; const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); } - for (int i = 0; i < int(removed.size()); ++i) + for (int i = 0; i < int(added.size()); ++i) { - IndexType index = removed[i]; + IndexType index = added[i]; const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); } for (std::size_t k = 0; k < NumPsqtRegs; ++k) @@ -767,29 +780,29 @@ class FeatureTransformer { #else - for (const auto index : added) + for (const auto index : removed) { if (!psqtOnly) { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] += weights[offset + j]; + entry.accumulation[Perspective][j] -= weights[offset + j]; } for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; } - for (const auto index : removed) + for (const auto index : added) { if (!psqtOnly) { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] -= weights[offset + j]; + entry.accumulation[Perspective][j] += weights[offset + j]; } for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; + entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } #endif diff --git a/src/nnue/nnue_misc.cpp b/src/nnue/nnue_misc.cpp index e92dcc71086..21685d0f2a3 100644 --- a/src/nnue/nnue_misc.cpp +++ b/src/nnue/nnue_misc.cpp @@ -48,7 +48,8 @@ void hint_common_parent_position(const Position& pos, int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move())); if (simpleEvalAbs > Eval::SmallNetThreshold) - networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold); + networks.small.hint_common_access(pos, &caches.small, + simpleEvalAbs > Eval::PsqtOnlyThreshold); else networks.big.hint_common_access(pos, &caches.big, false); } From 834e8ff619b212baf402c3922f8fde9af979cd0c Mon Sep 17 00:00:00 2001 From: cj5716 <125858804+cj5716@users.noreply.github.com> Date: Sun, 28 Apr 2024 08:53:28 +0800 Subject: [PATCH 06/12] Penalise the TT move in multicut Passed STC: LLR: 2.99 (-2.94,2.94) <0.00,2.00> Total: 185504 W: 48079 L: 47533 D: 89892 Ptnml(0-2): 716, 21866, 46988, 22520, 662 https://tests.stockfishchess.org/tests/view/662d9e1d6115ff6764c7f83d Passed LTC: LLR: 2.94 (-2.94,2.94) <0.50,2.50> Total: 75612 W: 19351 L: 18948 D: 37313 Ptnml(0-2): 46, 8363, 20592, 8752, 53 https://tests.stockfishchess.org/tests/view/662dc9dc6115ff6764c80fea closes https://github.com/official-stockfish/Stockfish/pull/5195 Bench: 1415435 --- src/search.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/search.cpp b/src/search.cpp index 11373707b34..ad59b35a545 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1067,7 +1067,12 @@ Value Search::Worker::search( // we assume this expected cut-node is not singular (multiple moves fail high), // and we can prune the whole subtree by returning a softbound. else if (singularBeta >= beta) + { + if (!ttCapture) + update_quiet_stats(pos, ss, *this, ttMove, -stat_malus(depth)); + return singularBeta; + } // Negative extensions // If other moves failed high over (ttValue - margin) without the ttMove on a reduced search, From 48a3b7c0ee7d32441a5a4519c85bd1e93e467f6e Mon Sep 17 00:00:00 2001 From: Stefan Geschwentner Date: Sun, 28 Apr 2024 16:04:28 +0200 Subject: [PATCH 07/12] Simplify non-pawn material divisor to a constant Passed STC: https://tests.stockfishchess.org/tests/view/662942603fe04ce4cefc7aba LLR: 2.93 (-2.94,2.94) <-1.75,0.25> Total: 272832 W: 70456 L: 70497 D: 131879 Ptnml(0-2): 1020, 32619, 69154, 32628, 995 Passed LTC: https://tests.stockfishchess.org/tests/view/662dfe3b6115ff6764c829eb LLR: 2.94 (-2.94,2.94) <-1.75,0.25> Total: 100254 W: 25446 L: 25303 D: 49505 Ptnml(0-2): 121, 11292, 27166, 11419, 129 closes https://github.com/official-stockfish/Stockfish/pull/5198 Bench: 1544645 --- src/evaluate.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index fe6b83aa111..1d41f3a266b 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -64,14 +64,14 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly) : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false); - const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant, - int pawnCountMul, int npmConstant, int evalDiv, - int shufflingConstant, int shufflingDiv) { + const auto adjustEval = [&](int optDiv, int nnueDiv, int pawnCountConstant, int pawnCountMul, + int npmConstant, int evalDiv, int shufflingConstant, + int shufflingDiv) { // Blend optimism and eval with nnue complexity and material imbalance optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / optDiv; nnue -= nnue * (nnueComplexity * 5 / 3) / nnueDiv; - int npm = pos.non_pawn_material() / npmDiv; + int npm = pos.non_pawn_material() / 64; v = (nnue * (npm + pawnCountConstant + pawnCountMul * pos.count()) + optimism * (npmConstant + npm)) / evalDiv; @@ -82,11 +82,11 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, }; if (!smallNet) - adjustEval(524, 32395, 66, 942, 11, 139, 1058, 178, 204); + adjustEval(524, 32395, 942, 11, 139, 1058, 178, 204); else if (psqtOnly) - adjustEval(517, 32857, 65, 908, 7, 155, 1006, 224, 238); + adjustEval(517, 32857, 908, 7, 155, 1006, 224, 238); else - adjustEval(515, 32793, 63, 944, 9, 140, 1067, 206, 206); + adjustEval(515, 32793, 944, 9, 140, 1067, 206, 206); // Guarantee evaluation does not hit the tablebase range v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1); From 0fe64286457549d2f80cd7792088375aaa9bee55 Mon Sep 17 00:00:00 2001 From: Stefan Geschwentner Date: Sun, 28 Apr 2024 16:53:47 +0200 Subject: [PATCH 08/12] More reduction at cut nodes which are not a former PV node But the tt move and first killer are excluded. This idea is based on following LMR condition tuning https://tests.stockfishchess.org/tests/view/66228bed3fe04ce4cefc0c71 by using only the two largest terms P[0] and P[1]. Passed STC: LLR: 2.93 (-2.94,2.94) <0.00,2.00> Total: 173248 W: 45091 L: 44565 D: 83592 Ptnml(0-2): 693, 20534, 43673, 21002, 722 https://tests.stockfishchess.org/tests/view/6629603b3fe04ce4cefc7d37 Passed LTC: LLR: 2.94 (-2.94,2.94) <0.50,2.50> Total: 722394 W: 183231 L: 181487 D: 357676 Ptnml(0-2): 462, 80650, 197252, 82348, 485 https://tests.stockfishchess.org/tests/view/662cbe45d46f72253dcff7bf closes https://github.com/official-stockfish/Stockfish/pull/5199 Bench: 1619613 --- src/search.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/search.cpp b/src/search.cpp index ad59b35a545..3718c37813b 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1123,6 +1123,9 @@ Value Search::Worker::search( if (ss->ttPv) r -= 1 + (ttValue > alpha) + (tte->depth() >= depth); + else if (cutNode && move != ttMove && move != ss->killers[0]) + r++; + // Increase reduction for cut nodes (~4 Elo) if (cutNode) r += 2 - (tte->depth() >= depth && ss->ttPv); From 5d720325596699ceba2743776cb39f9cea1754f5 Mon Sep 17 00:00:00 2001 From: Dubslow Date: Sat, 20 Apr 2024 00:29:01 -0500 Subject: [PATCH 09/12] Use capture history to better judge which sacrifices to explore This idea has been bouncing around a while. @Vizvezdenec tried it a couple years ago in Stockfish without results, but its recent arrival in Ethereal inspired him and thence me to try it afresh in Stockfish. (Also factor out the now-common code with futpruning for captures.) STC: https://tests.stockfishchess.org/tests/view/662355bc3fe04ce4cefc18ac LLR: 2.92 (-2.94,2.94) <0.00,2.00> Total: 45760 W: 11970 L: 11640 D: 22150 Ptnml(0-2): 124, 5371, 11625, 5571, 189 LTC: https://tests.stockfishchess.org/tests/view/662dda396115ff6764c817c9 LLR: 2.94 (-2.94,2.94) <0.50,2.50> Total: 243828 W: 62042 L: 61287 D: 120499 Ptnml(0-2): 211, 27202, 66329, 27965, 207 closes https://github.com/official-stockfish/Stockfish/pull/5200 Bench: 1480008 --- src/search.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index 3718c37813b..e4f170be61d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -967,20 +967,22 @@ Value Search::Worker::search( if (capture || givesCheck) { + Piece capturedPiece = pos.piece_on(move.to_sq()); + int captHist = + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)]; + // Futility pruning for captures (~2 Elo) if (!givesCheck && lmrDepth < 7 && !ss->inCheck) { - Piece capturedPiece = pos.piece_on(move.to_sq()); - Value futilityValue = - ss->staticEval + 285 + 277 * lmrDepth + PieceValue[capturedPiece] - + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)] - / 7; + Value futilityValue = ss->staticEval + 285 + 277 * lmrDepth + + PieceValue[capturedPiece] + captHist / 7; if (futilityValue <= alpha) continue; } // SEE based pruning for captures and checks (~11 Elo) - if (!pos.see_ge(move, -203 * depth)) + int seeHist = std::clamp(captHist / 32, -199 * depth, 199 * depth); + if (!pos.see_ge(move, -203 * depth - seeHist)) continue; } else From eb20de36c05b4101af37b2bf3783c570a47bb1cc Mon Sep 17 00:00:00 2001 From: Ciekce <44617491+Ciekce@users.noreply.github.com> Date: Mon, 29 Apr 2024 01:45:56 +0100 Subject: [PATCH 10/12] Avoid unnecessary creation of accumulator cache Saves a (currently) 800 KB allocation and deallocation when running `eval`, not particularly significant and zero impact on play but not necessary either. closes https://github.com/official-stockfish/Stockfish/pull/5201 No functional change --- AUTHORS | 1 + src/evaluate.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/AUTHORS b/AUTHORS index abae401c1ef..36b2b6f7942 100644 --- a/AUTHORS +++ b/AUTHORS @@ -46,6 +46,7 @@ Bryan Cross (crossbr) candirufish Chess13234 Chris Cain (ceebo) +Ciekce clefrks Clemens L. (rn5f107s2) Cody Ho (aesrentai) diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 1d41f3a266b..e3aa249ca41 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -100,11 +100,11 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks, // Trace scores are from white's point of view std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) { - auto caches = std::make_unique(networks); - if (pos.checkers()) return "Final evaluation: none (in check)"; + auto caches = std::make_unique(networks); + std::stringstream ss; ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2); ss << '\n' << NNUE::trace(pos, networks, *caches) << '\n'; From 6a9b8a0c7b913b9d4c4474bae7804184d20e8c4a Mon Sep 17 00:00:00 2001 From: cj5716 <125858804+cj5716@users.noreply.github.com> Date: Sun, 28 Apr 2024 16:33:59 +0800 Subject: [PATCH 11/12] Optimise NNUE Accumulator updates Passed STC: https://tests.stockfishchess.org/tests/view/662e3c6a5e9274400985a741 LLR: 2.94 (-2.94,2.94) <0.00,2.00> Total: 86176 W: 22284 L: 21905 D: 41987 Ptnml(0-2): 254, 9572, 23051, 9963, 248 closes https://github.com/official-stockfish/Stockfish/pull/5202 No functional change --- src/nnue/nnue_feature_transformer.h | 76 ++++++++++++++--------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 6b3f78a9a4b..402a47a815d 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -404,19 +404,25 @@ class FeatureTransformer { return {st, next}; } - // NOTE: The parameter states_to_update is an array of position states, ending with nullptr. + // NOTE: The parameter states_to_update is an array of position states. // All states must be sequential, that is states_to_update[i] must either be reachable - // by repeatedly applying ->previous from states_to_update[i+1] or - // states_to_update[i] == nullptr. + // by repeatedly applying ->previous from states_to_update[i+1]. // computed_st must be reachable by repeatedly applying ->previous on - // states_to_update[0], if not nullptr. + // states_to_update[0]. template void update_accumulator_incremental(const Position& pos, StateInfo* computed_st, StateInfo* states_to_update[N], bool psqtOnly) const { static_assert(N > 0); - assert(states_to_update[N - 1] == nullptr); + assert([&]() { + for (size_t i = 0; i < N; ++i) + { + if (states_to_update[i] == nullptr) + return false; + } + return true; + }()); #ifdef VECTOR // Gcc-10.2 unnecessarily spills AVX2 registers if this array @@ -425,11 +431,7 @@ class FeatureTransformer { psqt_vec_t psqt[NumPsqtRegs]; #endif - if (states_to_update[0] == nullptr) - return; - // Update incrementally going back through states_to_update. - // Gather all features to be updated. const Square ksq = pos.square(Perspective); @@ -437,28 +439,18 @@ class FeatureTransformer { // That might depend on the feature set and generally relies on the // feature set's update cost calculation to be correct and never allow // updates with more added/removed features than MaxActiveDimensions. - FeatureSet::IndexList removed[N - 1], added[N - 1]; + FeatureSet::IndexList removed[N], added[N]; + for (int i = N - 1; i >= 0; --i) { - int i = - N - - 2; // Last potential state to update. Skip last element because it must be nullptr. - while (states_to_update[i] == nullptr) - --i; - - StateInfo* st2 = states_to_update[i]; - - for (; i >= 0; --i) - { - (states_to_update[i]->*accPtr).computed[Perspective] = !psqtOnly; - (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true; + (states_to_update[i]->*accPtr).computed[Perspective] = !psqtOnly; + (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true; - const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1]; + const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1]; - for (; st2 != end_state; st2 = st2->previous) - FeatureSet::append_changed_indices(ksq, st2->dirtyPiece, - removed[i], added[i]); - } + for (StateInfo* st2 = states_to_update[i]; st2 != end_state; st2 = st2->previous) + FeatureSet::append_changed_indices(ksq, st2->dirtyPiece, removed[i], + added[i]); } StateInfo* st = computed_st; @@ -466,8 +458,7 @@ class FeatureTransformer { // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. #ifdef VECTOR - if (states_to_update[1] == nullptr && (removed[0].size() == 1 || removed[0].size() == 2) - && added[0].size() == 1) + if (N == 1 && (removed[0].size() == 1 || removed[0].size() == 2) && added[0].size() == 1) { assert(states_to_update[0]); @@ -541,7 +532,7 @@ class FeatureTransformer { for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_load(&accTileIn[k]); - for (IndexType i = 0; states_to_update[i]; ++i) + for (IndexType i = 0; i < N; ++i) { // Difference calculation for the deactivated features for (const auto index : removed[i]) @@ -578,7 +569,7 @@ class FeatureTransformer { for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = vec_load_psqt(&accTilePsqtIn[k]); - for (IndexType i = 0; states_to_update[i]; ++i) + for (IndexType i = 0; i < N; ++i) { // Difference calculation for the deactivated features for (const auto index : removed[i]) @@ -608,7 +599,7 @@ class FeatureTransformer { } } #else - for (IndexType i = 0; states_to_update[i]; ++i) + for (IndexType i = 0; i < N; ++i) { if (!psqtOnly) std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective], @@ -847,8 +838,8 @@ class FeatureTransformer { || (psqtOnly && (oldest_st->*accPtr).computedPSQT[Perspective])) { // Only update current position accumulator to minimize work. - StateInfo* states_to_update[2] = {pos.state(), nullptr}; - update_accumulator_incremental(pos, oldest_st, states_to_update, + StateInfo* states_to_update[1] = {pos.state()}; + update_accumulator_incremental(pos, oldest_st, states_to_update, psqtOnly); } else @@ -873,11 +864,20 @@ class FeatureTransformer { // 1. for the current position // 2. the next accumulator after the computed one // The heuristic may change in the future. - StateInfo* states_to_update[3] = {next, next == pos.state() ? nullptr : pos.state(), - nullptr}; + if (next == pos.state()) + { + StateInfo* states_to_update[1] = {next}; - update_accumulator_incremental(pos, oldest_st, states_to_update, - psqtOnly); + update_accumulator_incremental(pos, oldest_st, states_to_update, + psqtOnly); + } + else + { + StateInfo* states_to_update[2] = {next, pos.state()}; + + update_accumulator_incremental(pos, oldest_st, states_to_update, + psqtOnly); + } } else update_accumulator_refresh_cache(pos, cache, psqtOnly); From be142337d843ef3afc675e27628ab8e896c32cce Mon Sep 17 00:00:00 2001 From: mstembera Date: Mon, 29 Apr 2024 20:37:54 -0700 Subject: [PATCH 12/12] Accumulator cache bugfix and cleanup STC: https://tests.stockfishchess.org/tests/view/663068913a05f1bf7a511dc2 LLR: 2.98 (-2.94,2.94) <-1.75,0.25> Total: 70304 W: 18211 L: 18026 D: 34067 Ptnml(0-2): 232, 7966, 18582, 8129, 243 1) Fixes a bug introduced in https://github.com/official-stockfish/Stockfish/pull/5194. Only one psqtOnly flag was used for two perspectives which was causing wrong entries to be cleared and marked. 2) The finny caches should be cleared like histories and not at the start of every search. closes https://github.com/official-stockfish/Stockfish/pull/5203 No functional change --- src/nnue/nnue_accumulator.h | 28 ++++++++++++--------------- src/nnue/nnue_feature_transformer.h | 30 ++++++++++++++--------------- src/search.cpp | 5 ++--- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h index a2b3b98988e..179feba553e 100644 --- a/src/nnue/nnue_accumulator.h +++ b/src/nnue/nnue_accumulator.h @@ -59,31 +59,27 @@ struct AccumulatorCaches { struct alignas(CacheLineSize) Cache { struct alignas(CacheLineSize) Entry { - BiasType accumulation[COLOR_NB][Size]; - PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets]; - Bitboard byColorBB[COLOR_NB][COLOR_NB]; - Bitboard byTypeBB[COLOR_NB][PIECE_TYPE_NB]; + BiasType accumulation[Size]; + PSQTWeightType psqtAccumulation[PSQTBuckets]; + Bitboard byColorBB[COLOR_NB]; + Bitboard byTypeBB[PIECE_TYPE_NB]; bool psqtOnly; // To initialize a refresh entry, we set all its bitboards empty, // so we put the biases in the accumulation, without any weights on top void clear(const BiasType* biases) { - std::memset(byColorBB, 0, sizeof(byColorBB)); - std::memset(byTypeBB, 0, sizeof(byTypeBB)); - psqtOnly = false; - - std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType)); - std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType)); - - std::memset(psqtAccumulation, 0, sizeof(psqtAccumulation)); + std::memcpy(accumulation, biases, sizeof(accumulation)); + std::memset((uint8_t*) this + offsetof(Entry, psqtAccumulation), 0, + sizeof(Entry) - offsetof(Entry, psqtAccumulation)); } }; template void clear(const Network& network) { - for (auto& entry : entries) - entry.clear(network.featureTransformer->biases); + for (auto& entries1D : entries) + for (auto& entry : entries1D) + entry.clear(network.featureTransformer->biases); } void clear(const BiasType* biases) { @@ -91,9 +87,9 @@ struct AccumulatorCaches { entry.clear(biases); } - Entry& operator[](Square sq) { return entries[sq]; } + std::array& operator[](Square sq) { return entries[sq]; } - std::array entries; + std::array, SQUARE_NB> entries; }; template diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 402a47a815d..4647ecd066d 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -652,7 +652,7 @@ class FeatureTransformer { assert(cache != nullptr); Square ksq = pos.square(Perspective); - auto& entry = (*cache)[ksq]; + auto& entry = (*cache)[ksq][Perspective]; FeatureSet::IndexList removed, added; if (entry.psqtOnly && !psqtOnly) @@ -666,9 +666,8 @@ class FeatureTransformer { { for (PieceType pt = PAWN; pt <= KING; ++pt) { - const Piece piece = make_piece(c, pt); - const Bitboard oldBB = - entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt]; + const Piece piece = make_piece(c, pt); + const Bitboard oldBB = entry.byColorBB[c] & entry.byTypeBB[pt]; const Bitboard newBB = pos.pieces(c, pt); Bitboard toRemove = oldBB & ~newBB; Bitboard toAdd = newBB & ~oldBB; @@ -698,8 +697,7 @@ class FeatureTransformer { if (!psqtOnly) for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { - auto entryTile = - reinterpret_cast(&entry.accumulation[Perspective][j * TileHeight]); + auto entryTile = reinterpret_cast(&entry.accumulation[j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) acc[k] = entryTile[k]; @@ -741,8 +739,8 @@ class FeatureTransformer { for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) { - auto entryTilePsqt = reinterpret_cast( - &entry.psqtAccumulation[Perspective][j * PsqtTileHeight]); + auto entryTilePsqt = + reinterpret_cast(&entry.psqtAccumulation[j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = entryTilePsqt[k]; @@ -777,11 +775,11 @@ class FeatureTransformer { { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] -= weights[offset + j]; + entry.accumulation[j] -= weights[offset + j]; } for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; + entry.psqtAccumulation[k] -= psqtWeights[index * PSQTBuckets + k]; } for (const auto index : added) { @@ -789,11 +787,11 @@ class FeatureTransformer { { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - entry.accumulation[Perspective][j] += weights[offset + j]; + entry.accumulation[j] += weights[offset + j]; } for (std::size_t k = 0; k < PSQTBuckets; ++k) - entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + entry.psqtAccumulation[k] += psqtWeights[index * PSQTBuckets + k]; } #endif @@ -802,17 +800,17 @@ class FeatureTransformer { // Now copy its content to the actual accumulator we were refreshing if (!psqtOnly) - std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective], + std::memcpy(accumulator.accumulation[Perspective], entry.accumulation, sizeof(BiasType) * HalfDimensions); - std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective], + std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation, sizeof(int32_t) * PSQTBuckets); for (Color c : {WHITE, BLACK}) - entry.byColorBB[Perspective][c] = pos.pieces(c); + entry.byColorBB[c] = pos.pieces(c); for (PieceType pt = PAWN; pt <= KING; ++pt) - entry.byTypeBB[Perspective][pt] = pos.pieces(pt); + entry.byTypeBB[pt] = pos.pieces(pt); entry.psqtOnly = psqtOnly; } diff --git a/src/search.cpp b/src/search.cpp index e4f170be61d..b8e515f0267 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -147,9 +147,6 @@ Search::Worker::Worker(SharedState& sharedState, void Search::Worker::start_searching() { - // Initialize accumulator refresh entries - refreshTable.clear(networks); - // Non-main threads go directly to iterative_deepening() if (!is_mainthread()) { @@ -506,6 +503,8 @@ void Search::Worker::clear() { for (size_t i = 1; i < reductions.size(); ++i) reductions[i] = int((20.14 + std::log(size_t(options["Threads"])) / 2) * std::log(i)); + + refreshTable.clear(networks); }