From 33cf47fc5ce4acc7030afdb32245c6f9d96fd2ba Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 23 Oct 2021 00:15:02 +0300 Subject: [PATCH 01/12] reduce cache size by storing compressed policy --- src/mcts/node.cc | 47 ------------------- src/mcts/node.h | 7 +-- src/mcts/search.cc | 45 +++++------------- src/mcts/search.h | 15 +----- src/mcts/stoppers/stoppers.cc | 3 +- src/neural/cache.cc | 55 ++++++++++++++-------- src/neural/cache.h | 13 +++--- src/selfplay/game.cc | 3 +- src/trainingdata/trainingdata.cc | 22 ++++----- src/trainingdata/trainingdata.h | 3 +- src/utils/pfloat16.h | 79 ++++++++++++++++++++++++++++++++ 11 files changed, 153 insertions(+), 139 deletions(-) create mode 100644 src/utils/pfloat16.h diff --git a/src/mcts/node.cc b/src/mcts/node.cc index 9b71cff091..60235caf99 100644 --- a/src/mcts/node.cc +++ b/src/mcts/node.cc @@ -131,53 +131,6 @@ Move Edge::GetMove(bool as_opponent) const { return m; } -// Policy priors (P) are stored in a compressed 16-bit format. -// -// Source values are 32-bit floats: -// * bit 31 is sign (zero means positive) -// * bit 30 is sign of exponent (zero means nonpositive) -// * bits 29..23 are value bits of exponent -// * bits 22..0 are significand bits (plus a "virtual" always-on bit: s ∈ [1,2)) -// The number is then sign * 2^exponent * significand, usually. -// See https://www.h-schmidt.net/FloatConverter/IEEE754.html for details. -// -// In compressed 16-bit value we store bits 27..12: -// * bit 31 is always off as values are always >= 0 -// * bit 30 is always off as values are always < 2 -// * bits 29..28 are only off for values < 4.6566e-10, assume they are always on -// * bits 11..0 are for higher precision, they are dropped leaving only 11 bits -// of precision -// -// When converting to compressed format, bit 11 is added to in order to make it -// a rounding rather than truncation. -// -// Out of 65556 possible values, 2047 are outside of [0,1] interval (they are in -// interval (1,2)). This is fine because the values in [0,1] are skewed towards -// 0, which is also exactly how the components of policy tend to behave (since -// they add up to 1). - -// If the two assumed-on exponent bits (3<<28) are in fact off, the input is -// rounded up to the smallest value with them on. We accomplish this by -// subtracting the two bits from the input and checking for a negative result -// (the subtraction works despite crossing from exponent to significand). This -// is combined with the round-to-nearest addition (1<<11) into one op. -void Edge::SetP(float p) { - assert(0.0f <= p && p <= 1.0f); - constexpr int32_t roundings = (1 << 11) - (3 << 28); - int32_t tmp; - std::memcpy(&tmp, &p, sizeof(float)); - tmp += roundings; - p_ = (tmp < 0) ? 0 : static_cast(tmp >> 12); -} - -float Edge::GetP() const { - // Reshift into place and set the assumed-set exponent bits. 
- uint32_t tmp = (static_cast(p_) << 12) | (3 << 28); - float ret; - std::memcpy(&ret, &tmp, sizeof(uint32_t)); - return ret; -} - std::string Edge::DebugString() const { std::ostringstream oss; oss << "Move: " << move_.as_string() << " p_: " << p_ << " GetP: " << GetP(); diff --git a/src/mcts/node.h b/src/mcts/node.h index 8fffea6a9a..c7964fb55a 100644 --- a/src/mcts/node.h +++ b/src/mcts/node.h @@ -36,10 +36,10 @@ #include "chess/board.h" #include "chess/callbacks.h" #include "chess/position.h" -#include "neural/cache.h" #include "neural/encoder.h" #include "proto/net.pb.h" #include "utils/mutex.h" +#include "utils/pfloat16.h" namespace lczero { @@ -92,8 +92,9 @@ class Edge { // Returns or sets value of Move policy prior returned from the neural net // (but can be changed by adding Dirichlet noise). Must be in [0,1]. - float GetP() const; - void SetP(float val); + float GetP() const { return Pfloat16ToFloat(p_); } + void SetP(float val) { p_ = FloatToPfloat16(val); } + void SetPCompressed(uint16_t p) { p_ = p; } // Debug information about the edge. std::string DebugString() const; diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 1936096185..0beb71e5f3 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1963,11 +1963,11 @@ bool SearchWorker::AddNodeToComputation(Node* node, bool add_if_cached, moves.emplace_back(edge.GetMove().as_nn_index(transform)); } } else { - // Cache pseudolegal moves. A bit of a waste, but faster. - const auto& pseudolegal_moves = - history_.Last().GetBoard().GeneratePseudolegalMoves(); - moves.reserve(pseudolegal_moves.size()); - for (auto iter = pseudolegal_moves.begin(), end = pseudolegal_moves.end(); + // Cache legal moves. + const auto& legal_moves = + history_.Last().GetBoard().GenerateLegalMoves(); + moves.reserve(legal_moves.size()); + for (auto iter = legal_moves.begin(), end = legal_moves.end(); iter != end; ++iter) { moves.emplace_back(iter->as_nn_index(transform)); } @@ -2098,7 +2098,9 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) { // 4. Run NN computation. // ~~~~~~~~~~~~~~~~~~~~~~ -void SearchWorker::RunNNComputation() { computation_->ComputeBlocking(); } +void SearchWorker::RunNNComputation() { + computation_->ComputeBlocking(params_.GetPolicySoftmaxTemp()); +} // 5. Retrieve NN computations (and terminal values) into nodes. // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2130,34 +2132,11 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process, node_to_process->v = -computation.GetQVal(idx_in_computation); node_to_process->d = computation.GetDVal(idx_in_computation); node_to_process->m = computation.GetMVal(idx_in_computation); - // ...and secondly, the policy data. - // Calculate maximum first. - float max_p = -std::numeric_limits::infinity(); - // Intermediate array to store values when processing policy. - // There are never more than 256 valid legal moves in any legal position. - std::array intermediate; - int counter = 0; - for (auto& edge : node->Edges()) { - float p = computation.GetPVal( - idx_in_computation, - edge.GetMove().as_nn_index(node_to_process->probability_transform)); - intermediate[counter++] = p; - max_p = std::max(max_p, p); - } - float total = 0.0; - for (int i = 0; i < counter; i++) { - // Perform softmax and take into account policy softmax temperature T. - // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T). 
- float p = - FastExp((intermediate[i] - max_p) / params_.GetPolicySoftmaxTemp()); - intermediate[i] = p; - total += p; - } - counter = 0; - // Normalize P values to add up to 1.0. - const float scale = total > 0.0f ? 1.0f / total : 1.0f; + // ...and secondly, the policy data. The cache returns compressed values after + // softmax. + int idx = 0; for (auto& edge : node->Edges()) { - edge.edge()->SetP(intermediate[counter++] * scale); + edge.edge()->SetPCompressed(computation.GetPVal(idx_in_computation, idx++)); } // Add Dirichlet noise if enabled and at root. if (params_.GetNoiseEpsilon() && node == search_->root_node_) { diff --git a/src/mcts/search.h b/src/mcts/search.h index 3c95058a7b..bae877a740 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -353,20 +353,7 @@ class SearchWorker { float GetMVal(int) const { return lock->m; } - float GetPVal(int, int move_id) const { - const auto& moves = lock->p; - - int total_count = 0; - while (total_count < moves.size()) { - // Optimization: usually moves are stored in the same order as queried. - const auto& move = moves[last_idx++]; - if (last_idx == moves.size()) last_idx = 0; - if (move.first == move_id) return move.second; - ++total_count; - } - assert(false); // Move not found. - return 0; - } + uint16_t GetPVal(int, int move_ct) const { return lock->p[move_ct]; } private: NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit, diff --git a/src/mcts/stoppers/stoppers.cc b/src/mcts/stoppers/stoppers.cc index 8d6f7ae426..42260a6e15 100644 --- a/src/mcts/stoppers/stoppers.cc +++ b/src/mcts/stoppers/stoppers.cc @@ -96,8 +96,7 @@ const size_t kAvgNodeSize = sizeof(Node) + MemoryWatchingStopper::kAvgMovesPerPosition * sizeof(Edge); const size_t kAvgCacheItemSize = NNCache::GetItemStructSize() + sizeof(CachedNNRequest) + - sizeof(CachedNNRequest::IdxAndProb) * - MemoryWatchingStopper::kAvgMovesPerPosition; + sizeof(CachedNNRequest::p) * MemoryWatchingStopper::kAvgMovesPerPosition; } // namespace MemoryWatchingStopper::MemoryWatchingStopper(int cache_size, int ram_limit_mb, diff --git a/src/neural/cache.cc b/src/neural/cache.cc index d729a562f0..56b6515d55 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -25,9 +25,14 @@ Program grant you additional permission to convey the resulting work. */ #include "neural/cache.h" + +#include #include #include +#include "utils/fastmath.h" +#include "utils/pfloat16.h" + namespace lczero { CachingComputation::CachingComputation( std::unique_ptr parent, NNCache* cache) @@ -77,22 +82,44 @@ void CachingComputation::PopLastInputHit() { batch_.pop_back(); } -void CachingComputation::ComputeBlocking() { +void CachingComputation::ComputeBlocking(float softmax_temp) { if (parent_->GetBatchSize() == 0) return; parent_->ComputeBlocking(); // Fill cache with data from NN. - for (const auto& item : batch_) { + for (auto& item : batch_) { if (item.idx_in_parent == -1) continue; auto req = std::make_unique(item.probabilities_to_cache.size()); req->q = parent_->GetQVal(item.idx_in_parent); req->d = parent_->GetDVal(item.idx_in_parent); req->m = parent_->GetMVal(item.idx_in_parent); - int idx = 0; + + // Calculate maximum first. + float max_p = -std::numeric_limits::infinity(); + // Intermediate array to store values when processing policy. + // There are never more than 256 valid legal moves in any legal position. 
+ std::array intermediate; + int counter = 0; for (auto x : item.probabilities_to_cache) { - req->p[idx++] = - std::make_pair(x, parent_->GetPVal(item.idx_in_parent, x)); + float p = parent_->GetPVal(item.idx_in_parent, x); + intermediate[counter++] = p; + max_p = std::max(max_p, p); + } + float total = 0.0; + for (int i = 0; i < counter; i++) { + // Perform softmax and take into account policy softmax temperature T. + // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T). + float p = FastExp((intermediate[i] - max_p) / softmax_temp); + intermediate[i] = p; + total += p; + } + // Normalize P values to add up to 1.0. + const float scale = total > 0.0f ? 1.0f / total : 1.0f; + for (size_t ct = 0; ct < item.probabilities_to_cache.size(); ct++) { + uint16_t p = FloatToPfloat16(intermediate[ct] * scale); + req->p[ct] = p; + item.probabilities_to_cache[ct] = p; } cache_->Insert(item.hash, std::move(req)); } @@ -116,22 +143,12 @@ float CachingComputation::GetMVal(int sample) const { return item.lock->m; } -float CachingComputation::GetPVal(int sample, int move_id) const { +uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; - if (item.idx_in_parent >= 0) - return parent_->GetPVal(item.idx_in_parent, move_id); - const auto& moves = item.lock->p; - - int total_count = 0; - while (total_count < moves.size()) { - // Optimization: usually moves are stored in the same order as queried. - const auto& move = moves[item.last_idx++]; - if (item.last_idx == moves.size()) item.last_idx = 0; - if (move.first == move_id) return move.second; - ++total_count; + if (item.idx_in_parent >= 0) { + return item.probabilities_to_cache[move_ct]; } - assert(false); // Move not found. - return 0; + return item.lock->p[move_ct]; } } // namespace lczero diff --git a/src/neural/cache.h b/src/neural/cache.h index 207e0fe6e4..fbe6deedd7 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -34,12 +34,11 @@ namespace lczero { struct CachedNNRequest { CachedNNRequest(size_t size) : p(size) {} - typedef std::pair IdxAndProb; float q; float d; float m; - // TODO(mooskagh) Don't really need index if using perfect hash. - SmallArray p; + // Store p only for valid moves. + SmallArray p; }; typedef HashKeyedCache NNCache; @@ -73,15 +72,15 @@ class CachingComputation { // from parent's batch. void PopLastInputHit(); // Do the computation. - void ComputeBlocking(); + void ComputeBlocking(float softmax_temp); // Returns Q value of @sample. float GetQVal(int sample) const; // Returns probability of draw if NN has WDL value head. float GetDVal(int sample) const; // Returns estimated remaining moves. float GetMVal(int sample) const; - // Returns P value @move_id of @sample. - float GetPVal(int sample, int move_id) const; + // Returns compressed P value @move_id of @sample. + uint16_t GetPVal(int sample, int move_ct) const; // Pops last input from the computation. Only allowed for inputs which were // cached. void PopCacheHit(); @@ -94,8 +93,8 @@ class CachingComputation { uint64_t hash; NNCacheLock lock; int idx_in_parent = -1; + // Initially the move indices, after computation the policy values. 
std::vector probabilities_to_cache; - mutable int last_idx = 0; }; std::unique_ptr parent_; diff --git a/src/selfplay/game.cc b/src/selfplay/game.cc index fad43bbba1..0d5b677705 100644 --- a/src/selfplay/game.cc +++ b/src/selfplay/game.cc @@ -268,7 +268,8 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training, search_->GetCachedNNEval(tree_[idx]->GetCurrentHead()); training_data_.Add(tree_[idx]->GetCurrentHead(), tree_[idx]->GetPositionHistory(), best_eval, - played_eval, best_is_proof, best_move, move, nneval); + played_eval, best_is_proof, best_move, move, nneval, + search_->GetParams().GetPolicySoftmaxTemp()); } // Must reset the search before mutating the tree. search_.reset(); diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc index 9dcb0e17b7..7a2f264662 100644 --- a/src/trainingdata/trainingdata.cc +++ b/src/trainingdata/trainingdata.cc @@ -114,7 +114,8 @@ void V6TrainingDataArray::Write(TrainingDataWriter* writer, GameResult result, void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, Eval best_eval, Eval played_eval, bool best_is_proven, Move best_move, - Move played_move, const NNCacheLock& nneval) { + Move played_move, const NNCacheLock& nneval, + float softmax_temp) { V6TrainingData result; const auto& position = history.Last(); @@ -146,24 +147,20 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, // Set moves probabilities according to their relative amount of visits. // Compute Kullback-Leibler divergence in nats (between policy and visits). float kld_sum = 0; - float max_p = -std::numeric_limits::infinity(); std::vector intermediate; if (nneval) { - int last_idx = 0; + // The cache stores policies in GenerateLegalMoves() order. + auto legal_moves = history.Last().GetBoard().GenerateLegalMoves(); for (const auto& child : node->Edges()) { - auto nn_idx = child.edge()->GetMove().as_nn_index(transform); + auto move = child.edge()->GetMove(); float p = 0; - for (int i = 0; i < nneval->p.size(); i++) { - // Optimization: usually moves are stored in the same order as queried. - const auto& move = nneval->p[last_idx++]; - if (last_idx == nneval->p.size()) last_idx = 0; - if (move.first == nn_idx) { - p = move.second; + for (size_t i = 0; i < legal_moves.size(); i++) { + if (move == legal_moves[i]) { + p = Pfloat16ToFloat(nneval->p[i]); break; } } intermediate.emplace_back(p); - max_p = std::max(max_p, p); } } float total = 0.0; @@ -172,7 +169,8 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, auto nn_idx = child.edge()->GetMove().as_nn_index(transform); float fracv = total_n > 0 ? child.GetN() / static_cast(total_n) : 1; if (nneval) { - float P = std::exp(*it - max_p); + // Undo any softmax temperature in the cached data. + float P = std::pow(*it, softmax_temp); if (fracv > 0) { kld_sum += fracv * std::log(fracv / P); } diff --git a/src/trainingdata/trainingdata.h b/src/trainingdata/trainingdata.h index 6fc3b3b8a5..601b8a80d9 100644 --- a/src/trainingdata/trainingdata.h +++ b/src/trainingdata/trainingdata.h @@ -28,6 +28,7 @@ #pragma once #include "mcts/node.h" +#include "neural/cache.h" #include "trainingdata/writer.h" namespace lczero { @@ -98,7 +99,7 @@ class V6TrainingDataArray { // Add a chunk. 
void Add(const Node* node, const PositionHistory& history, Eval best_eval, Eval played_eval, bool best_is_proven, Move best_move, - Move played_move, const NNCacheLock& nneval); + Move played_move, const NNCacheLock& nneval, float softmax_temp); // Writes training data to a file. void Write(TrainingDataWriter* writer, GameResult result, diff --git a/src/utils/pfloat16.h b/src/utils/pfloat16.h new file mode 100644 index 0000000000..0554377eca --- /dev/null +++ b/src/utils/pfloat16.h @@ -0,0 +1,79 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2021 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ + +#pragma once + +#include +#include + +namespace lczero { +// Compressed 16-bit floating point format for probability values. +// Optimised for representing numbers in the [0,1] range. +// +// Source values are 32-bit floats: +// * bit 31 is sign (zero means positive) +// * bit 30 is sign of exponent (zero means nonpositive) +// * bits 29..23 are value bits of exponent +// * bits 22..0 are significand bits (plus a "virtual" always-on bit: s ∈ [1,2)) +// The number is then sign * 2^exponent * significand, usually. +// See https://www.h-schmidt.net/FloatConverter/IEEE754.html for details. +// +// In compressed 16-bit value we store bits 27..12: +// * bit 31 is always off as values are always >= 0 +// * bit 30 is always off as values are always < 2 +// * bits 29..28 are only off for values < 4.6566e-10, assume they are always on +// * bits 11..0 are for higher precision, they are dropped leaving only 11 bits +// of precision +// +// Out of 65556 possible values, 2047 are outside of [0,1] interval (they are in +// interval (1,2)). + +// When converting to compressed format, bit 11 is added to in order to make it +// a rounding rather than truncation. +// If the two assumed-on exponent bits (3<<28) are in fact off, the input is +// rounded up to the smallest value with them on. We accomplish this by +// subtracting the two bits from the input and checking for a negative result +// (the subtraction works despite crossing from exponent to significand). This +// is combined with the round-to-nearest addition (1<<11) into one op. +static inline uint16_t FloatToPfloat16(const float &p) { + assert(0.0f <= p && p <= 1.0f); + constexpr int32_t roundings = (1 << 11) - (3 << 28); + int32_t tmp; + std::memcpy(&tmp, &p, sizeof(float)); + tmp += roundings; + return (tmp < 0) ? 
0 : static_cast<uint16_t>(tmp >> 12);
+}
+
+static inline float Pfloat16ToFloat(const uint16_t &p) {
+ // Reshift into place and set the assumed-set exponent bits.
+ uint32_t tmp = (static_cast<uint32_t>(p) << 12) | (3 << 28);
+ float ret;
+ std::memcpy(&ret, &tmp, sizeof(uint32_t));
+ return ret;
+}
+
+} // namespace lczero

From b3ba1596d1c1f62c8824095a8588f10bf0ec5eb8 Mon Sep 17 00:00:00 2001
From: borg323
Date: Mon, 30 May 2022 13:29:37 +0300
Subject: [PATCH 02/12] guard against hash collision

---
 src/neural/cache.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/neural/cache.cc b/src/neural/cache.cc
index 56b6515d55..d3751a27ba 100644
--- a/src/neural/cache.cc
+++ b/src/neural/cache.cc
@@ -146,8 +146,14 @@ float CachingComputation::GetMVal(int sample) const {
 uint16_t CachingComputation::GetPVal(int sample, int move_ct) const {
 auto& item = batch_[sample];
 if (item.idx_in_parent >= 0) {
+ if (move_ct > static_cast<int>(item.probabilities_to_cache.size())) {
+ return 0;  // Hash collision.
+ }
 return item.probabilities_to_cache[move_ct];
 }
+ if (move_ct > item.lock->p.size()) {
+ return 0;  // Hash collision.
+ }
 return item.lock->p[move_ct];
 }

From 7f96e91780601ed614caea3928215d8ed70d41d5 Mon Sep 17 00:00:00 2001
From: borg323
Date: Sat, 1 Jun 2024 00:06:19 +0300
Subject: [PATCH 03/12] move NN encoding to the cache

---
 src/mcts/search.cc  | 44 ++++++++++++--------------------------------
 src/mcts/search.h   |  5 ++---
 src/neural/cache.cc | 26 ++++++++++++++++++++------
 src/neural/cache.h  | 13 ++++++++-----
 4 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/src/mcts/search.cc b/src/mcts/search.cc
index fc37144bed..490e624ddc 100644
--- a/src/mcts/search.cc
+++ b/src/mcts/search.cc
@@ -1258,8 +1258,9 @@ void SearchWorker::ExecuteOneIteration() {
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::InitializeIteration(
 std::unique_ptr<NetworkComputation> computation) {
- computation_ = std::make_unique<CachingComputation>(std::move(computation),
- search_->cache_);
+ computation_ = std::make_unique<CachingComputation>(
+ std::move(computation), search_->network_->GetCapabilities().input_format,
+ params_.GetHistoryFill(), search_->cache_);
 computation_->Reserve(target_minibatch_size_);
 minibatch_.clear();
 minibatch_.reserve(2 * target_minibatch_size_);
@@ -1424,9 +1425,8 @@ void SearchWorker::GatherMinibatch() {
 computation_->AddInputByHash(minibatch_[i].hash,
 std::move(minibatch_[i].lock));
 } else {
- computation_->AddInput(minibatch_[i].hash,
- std::move(minibatch_[i].input_planes),
- std::move(minibatch_[i].probabilities_to_cache));
+ computation_->AddInput(minibatch_[i].hash, minibatch_[i].history,
+ std::move(minibatch_[i].moves));
 }
 }
@@ -1478,21 +1478,12 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx,
 picked_node.lock = NNCacheLock(search_->cache_, hash);
 picked_node.is_cache_hit = picked_node.lock;
 if (!picked_node.is_cache_hit) {
- int transform;
- picked_node.input_planes = EncodePositionForNN(
- search_->network_->GetCapabilities().input_format, history, 8,
- params_.GetHistoryFill(), &transform);
- picked_node.probability_transform = transform;
-
- std::vector<uint16_t>& moves = picked_node.probabilities_to_cache;
 // Legal moves are known, use them.
- moves.reserve(node->GetNumEdges()); + picked_node.moves.reserve(node->GetNumEdges()); for (const auto& edge : node->Edges()) { - moves.emplace_back(edge.GetMove().as_nn_index(transform)); + picked_node.moves.emplace_back(edge.GetMove()); } - } else { - picked_node.probability_transform = TransformForPosition( - search_->network_->GetCapabilities().input_format, history); + picked_node.history = history; } } } @@ -2041,31 +2032,20 @@ bool SearchWorker::AddNodeToComputation(Node* node) { if (search_->cache_->ContainsKey(hash)) { return true; } - int transform; - auto planes = - EncodePositionForNN(search_->network_->GetCapabilities().input_format, - history_, 8, params_.GetHistoryFill(), &transform); - - std::vector moves; + MoveList moves; if (node && node->HasChildren()) { // Legal moves are known, use them. moves.reserve(node->GetNumEdges()); for (const auto& edge : node->Edges()) { - moves.emplace_back(edge.GetMove().as_nn_index(transform)); + moves.emplace_back(edge.GetMove()); } } else { // Cache legal moves. - const auto& legal_moves = - history_.Last().GetBoard().GenerateLegalMoves(); - moves.reserve(legal_moves.size()); - for (auto iter = legal_moves.begin(), end = legal_moves.end(); - iter != end; ++iter) { - moves.emplace_back(iter->as_nn_index(transform)); - } + moves = history_.Last().GetBoard().GenerateLegalMoves(); } - computation_->AddInput(hash, std::move(planes), std::move(moves)); + computation_->AddInput(hash, history_, std::move(moves)); return false; } diff --git a/src/mcts/search.h b/src/mcts/search.h index da631cd306..5ffca3a6b0 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -331,7 +331,6 @@ class SearchWorker { bool nn_queried = false; bool is_cache_hit = false; bool is_collision = false; - int probability_transform = 0; // Details only populated in the multigather path. @@ -341,8 +340,8 @@ class SearchWorker { // Details that are filled in as we go. 
uint64_t hash; NNCacheLock lock; - std::vector probabilities_to_cache; - InputPlanes input_planes; + MoveList moves; + PositionHistory history; mutable int last_idx = 0; bool ooo_completed = false; diff --git a/src/neural/cache.cc b/src/neural/cache.cc index d3751a27ba..6b72bd34cf 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -30,13 +30,19 @@ #include #include +#include "neural/encoder.h" #include "utils/fastmath.h" #include "utils/pfloat16.h" namespace lczero { CachingComputation::CachingComputation( - std::unique_ptr parent, NNCache* cache) - : parent_(std::move(parent)), cache_(cache) {} + std::unique_ptr parent, + pblczero::NetworkFormat::InputFormat input_format, + lczero::FillEmptyHistory history_fill, NNCache* cache) + : parent_(std::move(parent)), + input_format_(input_format), + history_fill_(history_fill), + cache_(cache) {} int CachingComputation::GetCacheMisses() const { return parent_->GetBatchSize(); @@ -65,14 +71,22 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } -void CachingComputation::AddInput( - uint64_t hash, InputPlanes&& input, - std::vector&& probabilities_to_cache) { +void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, + MoveList&& moves) { if (AddInputByHash(hash)) return; + + int transform; + auto input = + EncodePositionForNN(input_format_, history, 8, history_fill_, &transform); + std::vector moves_as_nn_index; + moves_as_nn_index.reserve(moves.size()); + for (auto iter = moves.begin(), end = moves.end(); iter != end; ++iter) { + moves_as_nn_index.emplace_back(iter->as_nn_index(transform)); + } batch_.emplace_back(); batch_.back().hash = hash; batch_.back().idx_in_parent = parent_->GetBatchSize(); - batch_.back().probabilities_to_cache = probabilities_to_cache; + batch_.back().probabilities_to_cache = std::move(moves_as_nn_index); parent_->AddInput(std::move(input)); } diff --git a/src/neural/cache.h b/src/neural/cache.h index fbe6deedd7..f1d4a050b1 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -26,6 +26,7 @@ */ #pragma once +#include "mcts/node.h" #include "neural/network.h" #include "utils/cache.h" #include "utils/smallarray.h" @@ -50,7 +51,8 @@ typedef HashKeyedCacheLock NNCacheLock; class CachingComputation { public: CachingComputation(std::unique_ptr parent, - NNCache* cache); + pblczero::NetworkFormat::InputFormat input_format, + lczero::FillEmptyHistory history_fill, NNCache* cache); // How many inputs are not found in cache and will be forwarded to a wrapped // computation. @@ -63,11 +65,10 @@ class CachingComputation { // Adds input by hash with existing lock. Assumes the given lock holds a real // reference. void AddInputByHash(uint64_t hash, NNCacheLock&& lock); - // Adds a sample to the batch. + // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. - // @probabilities_to_cache is which indices of policy head to store. - void AddInput(uint64_t hash, InputPlanes&& input, - std::vector&& probabilities_to_cache); + void AddInput(uint64_t hash, const PositionHistory& history, + MoveList&& moves); // Undos last AddInput. If it was a cache miss, the it's actually not removed // from parent's batch. 
void PopLastInputHit(); @@ -98,6 +99,8 @@ class CachingComputation { }; std::unique_ptr parent_; + pblczero::NetworkFormat::InputFormat input_format_; + lczero::FillEmptyHistory history_fill_; NNCache* cache_; std::vector batch_; }; From cd7782b7ea62f7dcc89b2ea496d4fadbf746dac4 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 1 Jun 2024 00:20:29 +0300 Subject: [PATCH 04/12] move softmax temp to CachingComputation constructor --- src/mcts/search.cc | 7 +++---- src/neural/cache.cc | 7 ++++--- src/neural/cache.h | 6 ++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 490e624ddc..fdc408d78b 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1260,7 +1260,8 @@ void SearchWorker::InitializeIteration( std::unique_ptr computation) { computation_ = std::make_unique( std::move(computation), search_->network_->GetCapabilities().input_format, - params_.GetHistoryFill(), search_->cache_); + params_.GetHistoryFill(), params_.GetPolicySoftmaxTemp(), + search_->cache_); computation_->Reserve(target_minibatch_size_); minibatch_.clear(); minibatch_.reserve(2 * target_minibatch_size_); @@ -2169,9 +2170,7 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) { // 4. Run NN computation. // ~~~~~~~~~~~~~~~~~~~~~~ -void SearchWorker::RunNNComputation() { - computation_->ComputeBlocking(params_.GetPolicySoftmaxTemp()); -} +void SearchWorker::RunNNComputation() { computation_->ComputeBlocking(); } // 5. Retrieve NN computations (and terminal values) into nodes. // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 6b72bd34cf..1e0d155725 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -38,10 +38,11 @@ namespace lczero { CachingComputation::CachingComputation( std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, - lczero::FillEmptyHistory history_fill, NNCache* cache) + lczero::FillEmptyHistory history_fill, float softmax_temp, NNCache* cache) : parent_(std::move(parent)), input_format_(input_format), history_fill_(history_fill), + softmax_temp_(softmax_temp), cache_(cache) {} int CachingComputation::GetCacheMisses() const { @@ -96,7 +97,7 @@ void CachingComputation::PopLastInputHit() { batch_.pop_back(); } -void CachingComputation::ComputeBlocking(float softmax_temp) { +void CachingComputation::ComputeBlocking() { if (parent_->GetBatchSize() == 0) return; parent_->ComputeBlocking(); @@ -124,7 +125,7 @@ void CachingComputation::ComputeBlocking(float softmax_temp) { for (int i = 0; i < counter; i++) { // Perform softmax and take into account policy softmax temperature T. // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T). - float p = FastExp((intermediate[i] - max_p) / softmax_temp); + float p = FastExp((intermediate[i] - max_p) / softmax_temp_); intermediate[i] = p; total += p; } diff --git a/src/neural/cache.h b/src/neural/cache.h index f1d4a050b1..45904c337f 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -52,7 +52,8 @@ class CachingComputation { public: CachingComputation(std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, - lczero::FillEmptyHistory history_fill, NNCache* cache); + lczero::FillEmptyHistory history_fill, float softmax_temp, + NNCache* cache); // How many inputs are not found in cache and will be forwarded to a wrapped // computation. @@ -73,7 +74,7 @@ class CachingComputation { // from parent's batch. 
void PopLastInputHit(); // Do the computation. - void ComputeBlocking(float softmax_temp); + void ComputeBlocking(); // Returns Q value of @sample. float GetQVal(int sample) const; // Returns probability of draw if NN has WDL value head. @@ -101,6 +102,7 @@ class CachingComputation { std::unique_ptr parent_; pblczero::NetworkFormat::InputFormat input_format_; lczero::FillEmptyHistory history_fill_; + float softmax_temp_; NNCache* cache_; std::vector batch_; }; From 185ce8a1c18ec10900b9b813a6fe9f9fa0d0ef50 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 9 Jun 2024 00:05:47 +0300 Subject: [PATCH 05/12] do move generation before ExtendNode --- src/mcts/search.cc | 41 ++++++++++++++++++----------------------- src/mcts/search.h | 4 ++-- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index fdc408d78b..4b11a28b8c 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1471,7 +1471,15 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, // of the game), it means that we already visited this node before. if (picked_node.IsExtendable()) { // Node was never visited, extend it. - ExtendNode(node, picked_node.depth, picked_node.moves_to_visit, &history); + // Initialize position sequence with pre-move position. + history.Trim(search_->played_history_.GetLength()); + for (size_t i = 0; i < picked_node.moves_to_visit.size(); i++) { + history.Append(picked_node.moves_to_visit[i]); + } + + picked_node.moves = history.Last().GetBoard().GenerateLegalMoves(); + + ExtendNode(node, picked_node.depth, history, picked_node.moves); if (!node->IsTerminal()) { picked_node.nn_queried = true; const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); @@ -1479,11 +1487,6 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.lock = NNCacheLock(search_->cache_, hash); picked_node.is_cache_hit = picked_node.lock; if (!picked_node.is_cache_hit) { - // Legal moves are known, use them. - picked_node.moves.reserve(node->GetNumEdges()); - for (const auto& edge : node->Edges()) { - picked_node.moves.emplace_back(edge.GetMove()); - } picked_node.history = history; } } @@ -1931,19 +1934,11 @@ void SearchWorker::PickNodesToExtendTask( } void SearchWorker::ExtendNode(Node* node, int depth, - const std::vector& moves_to_node, - PositionHistory* history) { - // Initialize position sequence with pre-move position. - history->Trim(search_->played_history_.GetLength()); - for (size_t i = 0; i < moves_to_node.size(); i++) { - history->Append(moves_to_node[i]); - } - + const PositionHistory& history, + const MoveList& legal_moves) { // We don't need the mutex because other threads will see that N=0 and // N-in-flight=1 and will not touch this node. - const auto& board = history->Last().GetBoard(); - auto legal_moves = board.GenerateLegalMoves(); - + const auto& board = history.Last().GetBoard(); // Check whether it's a draw/lose by position. Importantly, we must check // these before doing the by-rule checks below. if (legal_moves.empty()) { @@ -1964,12 +1959,12 @@ void SearchWorker::ExtendNode(Node* node, int depth, return; } - if (history->Last().GetRule50Ply() >= 100) { + if (history.Last().GetRule50Ply() >= 100) { node->MakeTerminal(GameResult::DRAW); return; } - const auto repetitions = history->Last().GetRepetitions(); + const auto repetitions = history.Last().GetRepetitions(); // Mark two-fold repetitions as draws according to settings. // Depth starts with 1 at root, so number of plies in PV is depth - 1. 
if (repetitions >= 2) { @@ -1977,8 +1972,8 @@ void SearchWorker::ExtendNode(Node* node, int depth, return; } else if (repetitions == 1 && depth - 1 >= 4 && params_.GetTwoFoldDraws() && - depth - 1 >= history->Last().GetPliesSincePrevRepetition()) { - const auto cycle_length = history->Last().GetPliesSincePrevRepetition(); + depth - 1 >= history.Last().GetPliesSincePrevRepetition()) { + const auto cycle_length = history.Last().GetPliesSincePrevRepetition(); // use plies since first repetition as moves left; exact if forced draw. node->MakeTerminal(GameResult::DRAW, (float)cycle_length, Node::Terminal::TwoFold); @@ -1988,12 +1983,12 @@ void SearchWorker::ExtendNode(Node* node, int depth, // Neither by-position or by-rule termination, but maybe it's a TB position. if (search_->syzygy_tb_ && !search_->root_is_in_dtz_ && board.castlings().no_legal_castle() && - history->Last().GetRule50Ply() == 0 && + history.Last().GetRule50Ply() == 0 && (board.ours() | board.theirs()).count() <= search_->syzygy_tb_->max_cardinality()) { ProbeState state; const WDLScore wdl = - search_->syzygy_tb_->probe_wdl(history->Last(), &state); + search_->syzygy_tb_->probe_wdl(history.Last(), &state); // Only fail state means the WDL is wrong, probe_wdl may produce correct // result with a stat other than OK. if (state != FAIL) { diff --git a/src/mcts/search.h b/src/mcts/search.h index 5ffca3a6b0..5067db5b33 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -441,8 +441,8 @@ class SearchWorker { void EnsureNodeTwoFoldCorrectForDepth(Node* node, int depth); void ProcessPickedTask(int batch_start, int batch_end, TaskWorkspace* workspace); - void ExtendNode(Node* node, int depth, const std::vector& moves_to_add, - PositionHistory* history); + void ExtendNode(Node* node, int depth, const PositionHistory& history, + const MoveList& legal_moves); template void FetchSingleNodeResult(NodeToProcess* node_to_process, const Computation& computation, From 72a77c5fbe1001fa159fcb3eac397e4b64a52155 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 8 Jun 2024 23:00:39 +0300 Subject: [PATCH 06/12] cleaner cache interface --- src/mcts/search.cc | 22 +++++++--------------- src/mcts/search.h | 10 +++++----- src/neural/cache.cc | 18 ++++++++++++++++-- src/neural/cache.h | 21 ++++++++++++--------- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 4b11a28b8c..86b7e6b70c 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1421,14 +1421,8 @@ void SearchWorker::GatherMinibatch() { // There are no OOO though. // Also terminals when OOO is disabled. if (!minibatch_[i].nn_queried) continue; - if (minibatch_[i].is_cache_hit) { - // Since minibatch_[i] holds cache lock, this is guaranteed to succeed. - computation_->AddInputByHash(minibatch_[i].hash, - std::move(minibatch_[i].lock)); - } else { - computation_->AddInput(minibatch_[i].hash, minibatch_[i].history, - std::move(minibatch_[i].moves)); - } + computation_->AddInput(minibatch_[i].hash, minibatch_[i].history, + minibatch_[i].moves); } // Check for stop at the end so we have at least one node. 
@@ -1484,11 +1478,9 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.nn_queried = true; const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); picked_node.hash = hash; - picked_node.lock = NNCacheLock(search_->cache_, hash); - picked_node.is_cache_hit = picked_node.lock; - if (!picked_node.is_cache_hit) { - picked_node.history = history; - } + picked_node.is_cache_hit = + computation_->CacheLookup(hash, &picked_node.entry); + picked_node.history = history; } } if (params_.GetOutOfOrderEval() && picked_node.CanEvalOutOfOrder()) { @@ -2025,7 +2017,7 @@ void SearchWorker::ExtendNode(Node* node, int depth, // Returns whether node was already in cache. bool SearchWorker::AddNodeToComputation(Node* node) { const auto hash = history_.HashLast(params_.GetCacheHistoryLength() + 1); - if (search_->cache_->ContainsKey(hash)) { + if (computation_->CacheLookup(hash)) { return true; } MoveList moves; @@ -2041,7 +2033,7 @@ bool SearchWorker::AddNodeToComputation(Node* node) { moves = history_.Last().GetBoard().GenerateLegalMoves(); } - computation_->AddInput(hash, history_, std::move(moves)); + computation_->AddInput(hash, history_, moves); return false; } diff --git a/src/mcts/search.h b/src/mcts/search.h index 5067db5b33..8f99fa9b56 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -339,7 +339,7 @@ class SearchWorker { // Details that are filled in as we go. uint64_t hash; - NNCacheLock lock; + CachedNNRequest entry; MoveList moves; PositionHistory history; mutable int last_idx = 0; @@ -360,13 +360,13 @@ class SearchWorker { // Methods to allow NodeToProcess to conform as a 'Computation'. Only safe // to call if is_cache_hit is true in the multigather path. - float GetQVal(int) const { return lock->q; } + float GetQVal(int) const { return entry.q; } - float GetDVal(int) const { return lock->d; } + float GetDVal(int) const { return entry.d; } - float GetMVal(int) const { return lock->m; } + float GetMVal(int) const { return entry.m; } - uint16_t GetPVal(int, int move_ct) const { return lock->p[move_ct]; } + uint16_t GetPVal(int, int move_ct) const { return entry.p[move_ct]; } private: NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit, diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 1e0d155725..40c4fbb636 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -72,8 +72,22 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } +bool CachingComputation::CacheLookup(uint64_t hash, CachedNNRequest* entry) { + NNCacheLock lock = NNCacheLock(cache_, hash); + if (!lock) return false; + if (entry != nullptr) { + entry->q = lock->q; + entry->d = lock->d; + entry->m = lock->m; + entry->p.clear(); + entry->p.resize(lock->p.size()); + for (size_t i = 0; i < lock->p.size(); i++) entry->p[i] = lock->p[i]; + } + return true; +} + void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, - MoveList&& moves) { + const MoveList& moves) { if (AddInputByHash(hash)) return; int transform; @@ -166,7 +180,7 @@ uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { } return item.probabilities_to_cache[move_ct]; } - if (move_ct > item.lock->p.size()) { + if (static_cast(move_ct) > item.lock->p.size()) { return 0; // Hash collision. 
} return item.lock->p[move_ct]; diff --git a/src/neural/cache.h b/src/neural/cache.h index 45904c337f..8e3e916cc7 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -34,12 +34,12 @@ namespace lczero { struct CachedNNRequest { - CachedNNRequest(size_t size) : p(size) {} + CachedNNRequest(size_t size = 0) : p(size) {} float q; float d; float m; // Store p only for valid moves. - SmallArray p; + std::vector p; }; typedef HashKeyedCache NNCache; @@ -60,16 +60,12 @@ class CachingComputation { int GetCacheMisses() const; // Total number of times AddInput/AddInputByHash were (successfully) called. int GetBatchSize() const; - // Adds input by hash only. If that hash is not in cache, returns false - // and does nothing. Otherwise adds. - bool AddInputByHash(uint64_t hash); - // Adds input by hash with existing lock. Assumes the given lock holds a real - // reference. - void AddInputByHash(uint64_t hash, NNCacheLock&& lock); + // Check if entry is in the cache. + bool CacheLookup(uint64_t hash, CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. void AddInput(uint64_t hash, const PositionHistory& history, - MoveList&& moves); + const MoveList& moves); // Undos last AddInput. If it was a cache miss, the it's actually not removed // from parent's batch. void PopLastInputHit(); @@ -91,6 +87,13 @@ class CachingComputation { void Reserve(int batch_size) { batch_.reserve(batch_size); } private: + // Adds input by hash only. If that hash is not in cache, returns false + // and does nothing. Otherwise adds. + bool AddInputByHash(uint64_t hash); + // Adds input by hash with existing lock. Assumes the given lock holds a real + // reference. + void AddInputByHash(uint64_t hash, NNCacheLock&& lock); + struct WorkItem { uint64_t hash; NNCacheLock lock; From 928660d79afe438e72ad15ac68d11a596fc4d8ea Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 8 Jun 2024 23:36:49 +0300 Subject: [PATCH 07/12] really guard against hash collisions --- src/mcts/search.cc | 4 ++-- src/neural/cache.cc | 32 ++++++++++---------------------- src/neural/cache.h | 10 ++-------- 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 86b7e6b70c..d14c938ca8 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1478,8 +1478,8 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.nn_queried = true; const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); picked_node.hash = hash; - picked_node.is_cache_hit = - computation_->CacheLookup(hash, &picked_node.entry); + picked_node.is_cache_hit = computation_->CacheLookup( + hash, picked_node.moves, &picked_node.entry); picked_node.history = history; } } diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 40c4fbb636..3601c04e8e 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -51,20 +51,6 @@ int CachingComputation::GetCacheMisses() const { int CachingComputation::GetBatchSize() const { return batch_.size(); } -bool CachingComputation::AddInputByHash(uint64_t hash) { - NNCacheLock lock(cache_, hash); - if (!lock) return false; - AddInputByHash(hash, std::move(lock)); - return true; -} - -void CachingComputation::AddInputByHash(uint64_t hash, NNCacheLock&& lock) { - assert(lock); - batch_.emplace_back(); - batch_.back().lock = std::move(lock); - batch_.back().hash = hash; -} - void CachingComputation::PopCacheHit() { assert(!batch_.empty()); 
assert(batch_.back().lock); @@ -72,10 +58,12 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } -bool CachingComputation::CacheLookup(uint64_t hash, CachedNNRequest* entry) { +bool CachingComputation::CacheLookup(uint64_t hash, const MoveList& moves, + CachedNNRequest* entry) { NNCacheLock lock = NNCacheLock(cache_, hash); if (!lock) return false; if (entry != nullptr) { + if (moves.size() != lock->p.size()) return false; entry->q = lock->q; entry->d = lock->d; entry->m = lock->m; @@ -88,7 +76,13 @@ bool CachingComputation::CacheLookup(uint64_t hash, CachedNNRequest* entry) { void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, const MoveList& moves) { - if (AddInputByHash(hash)) return; + NNCacheLock lock(cache_, hash); + if (lock && moves.size() == lock->p.size()) { + batch_.emplace_back(); + batch_.back().lock = std::move(lock); + batch_.back().hash = hash; + return; + } int transform; auto input = @@ -175,14 +169,8 @@ float CachingComputation::GetMVal(int sample) const { uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; if (item.idx_in_parent >= 0) { - if (move_ct > static_cast(item.probabilities_to_cache.size())) { - return 0; // Hash collision. - } return item.probabilities_to_cache[move_ct]; } - if (static_cast(move_ct) > item.lock->p.size()) { - return 0; // Hash collision. - } return item.lock->p[move_ct]; } diff --git a/src/neural/cache.h b/src/neural/cache.h index 8e3e916cc7..dee32bc82c 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -61,7 +61,8 @@ class CachingComputation { // Total number of times AddInput/AddInputByHash were (successfully) called. int GetBatchSize() const; // Check if entry is in the cache. - bool CacheLookup(uint64_t hash, CachedNNRequest* entry = nullptr); + bool CacheLookup(uint64_t hash, const MoveList& moves = {}, + CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. void AddInput(uint64_t hash, const PositionHistory& history, @@ -87,13 +88,6 @@ class CachingComputation { void Reserve(int batch_size) { batch_.reserve(batch_size); } private: - // Adds input by hash only. If that hash is not in cache, returns false - // and does nothing. Otherwise adds. - bool AddInputByHash(uint64_t hash); - // Adds input by hash with existing lock. Assumes the given lock holds a real - // reference. 
- void AddInputByHash(uint64_t hash, NNCacheLock&& lock); - struct WorkItem { uint64_t hash; NNCacheLock lock; From 8369d5265140f919097de6eb5cc2be2e89c99d54 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 9 Jun 2024 01:37:28 +0300 Subject: [PATCH 08/12] remove hash from cache interface --- src/mcts/search.cc | 14 +++++--------- src/mcts/search.h | 1 - src/neural/cache.cc | 13 +++++++++---- src/neural/cache.h | 8 ++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index d14c938ca8..0260944a24 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1261,7 +1261,7 @@ void SearchWorker::InitializeIteration( computation_ = std::make_unique( std::move(computation), search_->network_->GetCapabilities().input_format, params_.GetHistoryFill(), params_.GetPolicySoftmaxTemp(), - search_->cache_); + params_.GetCacheHistoryLength() + 1, search_->cache_); computation_->Reserve(target_minibatch_size_); minibatch_.clear(); minibatch_.reserve(2 * target_minibatch_size_); @@ -1421,8 +1421,7 @@ void SearchWorker::GatherMinibatch() { // There are no OOO though. // Also terminals when OOO is disabled. if (!minibatch_[i].nn_queried) continue; - computation_->AddInput(minibatch_[i].hash, minibatch_[i].history, - minibatch_[i].moves); + computation_->AddInput(minibatch_[i].history, minibatch_[i].moves); } // Check for stop at the end so we have at least one node. @@ -1476,10 +1475,8 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, ExtendNode(node, picked_node.depth, history, picked_node.moves); if (!node->IsTerminal()) { picked_node.nn_queried = true; - const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); - picked_node.hash = hash; picked_node.is_cache_hit = computation_->CacheLookup( - hash, picked_node.moves, &picked_node.entry); + history, picked_node.moves, &picked_node.entry); picked_node.history = history; } } @@ -2016,8 +2013,7 @@ void SearchWorker::ExtendNode(Node* node, int depth, // Returns whether node was already in cache. bool SearchWorker::AddNodeToComputation(Node* node) { - const auto hash = history_.HashLast(params_.GetCacheHistoryLength() + 1); - if (computation_->CacheLookup(hash)) { + if (computation_->CacheLookup(history_)) { return true; } MoveList moves; @@ -2033,7 +2029,7 @@ bool SearchWorker::AddNodeToComputation(Node* node) { moves = history_.Last().GetBoard().GenerateLegalMoves(); } - computation_->AddInput(hash, history_, moves); + computation_->AddInput(history_, moves); return false; } diff --git a/src/mcts/search.h b/src/mcts/search.h index 8f99fa9b56..85074c9d19 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -338,7 +338,6 @@ class SearchWorker { std::vector moves_to_visit; // Details that are filled in as we go. 
- uint64_t hash; CachedNNRequest entry; MoveList moves; PositionHistory history; diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 3601c04e8e..fd661e8fef 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -38,11 +38,13 @@ namespace lczero { CachingComputation::CachingComputation( std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, - lczero::FillEmptyHistory history_fill, float softmax_temp, NNCache* cache) + lczero::FillEmptyHistory history_fill, float softmax_temp, + int history_length, NNCache* cache) : parent_(std::move(parent)), input_format_(input_format), history_fill_(history_fill), softmax_temp_(softmax_temp), + history_length_(history_length), cache_(cache) {} int CachingComputation::GetCacheMisses() const { @@ -58,9 +60,11 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } -bool CachingComputation::CacheLookup(uint64_t hash, const MoveList& moves, +bool CachingComputation::CacheLookup(const PositionHistory& history, + const MoveList& moves, CachedNNRequest* entry) { - NNCacheLock lock = NNCacheLock(cache_, hash); + const auto hash = history.HashLast(history_length_); + NNCacheLock lock(cache_, hash); if (!lock) return false; if (entry != nullptr) { if (moves.size() != lock->p.size()) return false; @@ -74,8 +78,9 @@ bool CachingComputation::CacheLookup(uint64_t hash, const MoveList& moves, return true; } -void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, +void CachingComputation::AddInput(const PositionHistory& history, const MoveList& moves) { + const auto hash = history.HashLast(history_length_); NNCacheLock lock(cache_, hash); if (lock && moves.size() == lock->p.size()) { batch_.emplace_back(); diff --git a/src/neural/cache.h b/src/neural/cache.h index dee32bc82c..2d25a936a5 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -53,7 +53,7 @@ class CachingComputation { CachingComputation(std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, lczero::FillEmptyHistory history_fill, float softmax_temp, - NNCache* cache); + int history_length, NNCache* cache); // How many inputs are not found in cache and will be forwarded to a wrapped // computation. @@ -61,12 +61,11 @@ class CachingComputation { // Total number of times AddInput/AddInputByHash were (successfully) called. int GetBatchSize() const; // Check if entry is in the cache. - bool CacheLookup(uint64_t hash, const MoveList& moves = {}, + bool CacheLookup(const PositionHistory& history, const MoveList& moves = {}, CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. - void AddInput(uint64_t hash, const PositionHistory& history, - const MoveList& moves); + void AddInput(const PositionHistory& history, const MoveList& moves); // Undos last AddInput. If it was a cache miss, the it's actually not removed // from parent's batch. 
void PopLastInputHit(); @@ -100,6 +99,7 @@ class CachingComputation { pblczero::NetworkFormat::InputFormat input_format_; lczero::FillEmptyHistory history_fill_; float softmax_temp_; + int history_length_; NNCache* cache_; std::vector batch_; }; From c2449bf61dcdb97bcaf15ccd10efa08a78cf93e1 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 9 Jun 2024 15:09:33 +0300 Subject: [PATCH 09/12] do not add cache hits to batch --- src/mcts/search.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 0260944a24..6fdab6eabf 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1420,7 +1420,7 @@ void SearchWorker::GatherMinibatch() { // If there was no OOO, there can stil be collisions. // There are no OOO though. // Also terminals when OOO is disabled. - if (!minibatch_[i].nn_queried) continue; + if (!minibatch_[i].nn_queried || minibatch_[i].is_cache_hit) continue; computation_->AddInput(minibatch_[i].history, minibatch_[i].moves); } @@ -1466,6 +1466,8 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, // Node was never visited, extend it. // Initialize position sequence with pre-move position. history.Trim(search_->played_history_.GetLength()); + history.Reserve(search_->played_history_.GetLength() + + picked_node.moves_to_visit.size()); for (size_t i = 0; i < picked_node.moves_to_visit.size(); i++) { history.Append(picked_node.moves_to_visit[i]); } @@ -1477,7 +1479,7 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.nn_queried = true; picked_node.is_cache_hit = computation_->CacheLookup( history, picked_node.moves, &picked_node.entry); - picked_node.history = history; + if (!picked_node.is_cache_hit) picked_node.history = history; } } if (params_.GetOutOfOrderEval() && picked_node.CanEvalOutOfOrder()) { @@ -2161,6 +2163,10 @@ void SearchWorker::FetchMinibatchResults() { // Populate NN/cached results, or terminal results, into nodes. int idx_in_computation = 0; for (auto& node_to_process : minibatch_) { + if (node_to_process.is_cache_hit) { + FetchSingleNodeResult(&node_to_process, node_to_process, 0); + continue; + } FetchSingleNodeResult(&node_to_process, *computation_, idx_in_computation); if (node_to_process.nn_queried) ++idx_in_computation; } From 2880ba141d0a96f6477ae5cf5ac398c914aa0ce4 Mon Sep 17 00:00:00 2001 From: borg323 Date: Thu, 31 Oct 2024 17:17:36 +0200 Subject: [PATCH 10/12] make use of pfloat16 as a class --- src/mcts/node.h | 8 +++---- src/mcts/search.cc | 2 +- src/mcts/search.h | 3 ++- src/neural/cache.cc | 17 ++++++++------ src/neural/cache.h | 5 +++-- src/trainingdata/trainingdata.cc | 2 +- src/utils/pfloat16.h | 38 +++++++++++++++++++------------- 7 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/mcts/node.h b/src/mcts/node.h index 60461e69a5..4cb31f9097 100644 --- a/src/mcts/node.h +++ b/src/mcts/node.h @@ -92,9 +92,9 @@ class Edge { // Returns or sets value of Move policy prior returned from the neural net // (but can be changed by adding Dirichlet noise). Must be in [0,1]. - float GetP() const { return Pfloat16ToFloat(p_); } - void SetP(float val) { p_ = FloatToPfloat16(val); } - void SetPCompressed(uint16_t p) { p_ = p; } + float GetP() const { return p_; } + void SetP(float val) { p_ = val; } + void SetP(pfloat16 p) { p_ = p; } // Debug information about the edge. 
std::string DebugString() const; @@ -107,7 +107,7 @@ class Edge { // Probability that this move will be made, from the policy head of the neural // network; compressed to a 16 bit format (5 bits exp, 11 bits significand). - uint16_t p_ = 0; + pfloat16 p_; friend class Node; }; diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 5fb8e876cf..da9e39b58a 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -2210,7 +2210,7 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process, // softmax. int idx = 0; for (auto& edge : node->Edges()) { - edge.edge()->SetPCompressed(computation.GetPVal(idx_in_computation, idx++)); + edge.edge()->SetP(computation.GetPVal(idx_in_computation, idx++)); } // Add Dirichlet noise if enabled and at root. if (params_.GetNoiseEpsilon() && node == search_->root_node_) { diff --git a/src/mcts/search.h b/src/mcts/search.h index 85074c9d19..98ec6fe5eb 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -44,6 +44,7 @@ #include "syzygy/syzygy.h" #include "utils/logging.h" #include "utils/mutex.h" +#include "utils/pfloat16.h" namespace lczero { @@ -365,7 +366,7 @@ class SearchWorker { float GetMVal(int) const { return entry.m; } - uint16_t GetPVal(int, int move_ct) const { return entry.p[move_ct]; } + pfloat16 GetPVal(int, int move_ct) const { return entry.p[move_ct]; } private: NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit, diff --git a/src/neural/cache.cc b/src/neural/cache.cc index fd661e8fef..2194a45ffa 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -114,6 +114,10 @@ void CachingComputation::ComputeBlocking() { if (parent_->GetBatchSize() == 0) return; parent_->ComputeBlocking(); + // Intermediate array to store values when processing policy. + // There are never more than 256 valid legal moves in any legal position. + std::array intermediate; + // Fill cache with data from NN. for (auto& item : batch_) { if (item.idx_in_parent == -1) continue; @@ -125,9 +129,6 @@ void CachingComputation::ComputeBlocking() { // Calculate maximum first. float max_p = -std::numeric_limits::infinity(); - // Intermediate array to store values when processing policy. - // There are never more than 256 valid legal moves in any legal position. - std::array intermediate; int counter = 0; for (auto x : item.probabilities_to_cache) { float p = parent_->GetPVal(item.idx_in_parent, x); @@ -145,9 +146,9 @@ void CachingComputation::ComputeBlocking() { // Normalize P values to add up to 1.0. const float scale = total > 0.0f ? 
1.0f / total : 1.0f; for (size_t ct = 0; ct < item.probabilities_to_cache.size(); ct++) { - uint16_t p = FloatToPfloat16(intermediate[ct] * scale); + pfloat16 p = intermediate[ct] * scale; req->p[ct] = p; - item.probabilities_to_cache[ct] = p; + std::memcpy(&item.probabilities_to_cache[ct], &p, sizeof(pfloat16)); } cache_->Insert(item.hash, std::move(req)); } @@ -171,10 +172,12 @@ float CachingComputation::GetMVal(int sample) const { return item.lock->m; } -uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { +pfloat16 CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; if (item.idx_in_parent >= 0) { - return item.probabilities_to_cache[move_ct]; + pfloat16 r; + std::memcpy(&r, &item.probabilities_to_cache[move_ct], sizeof(pfloat16)); + return r; } return item.lock->p[move_ct]; } diff --git a/src/neural/cache.h b/src/neural/cache.h index 2d25a936a5..2ab93e4e3b 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -29,6 +29,7 @@ #include "mcts/node.h" #include "neural/network.h" #include "utils/cache.h" +#include "utils/pfloat16.h" #include "utils/smallarray.h" namespace lczero { @@ -39,7 +40,7 @@ struct CachedNNRequest { float d; float m; // Store p only for valid moves. - std::vector<uint16_t> p; + std::vector<pfloat16> p; }; typedef HashKeyedCache<CachedNNRequest> NNCache; @@ -78,7 +79,7 @@ class CachingComputation { // Returns estimated remaining moves. float GetMVal(int sample) const; // Returns compressed P value @move_id of @sample. - uint16_t GetPVal(int sample, int move_ct) const; + pfloat16 GetPVal(int sample, int move_ct) const; // Pops last input from the computation. Only allowed for inputs which were // cached. void PopCacheHit(); diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc index 0597f8bc18..a474ff0205 100644 --- a/src/trainingdata/trainingdata.cc +++ b/src/trainingdata/trainingdata.cc @@ -156,7 +156,7 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, float p = 0; for (size_t i = 0; i < legal_moves.size(); i++) { if (move == legal_moves[i]) { - p = Pfloat16ToFloat(nneval->p[i]); + p = nneval->p[i]; break; } } diff --git a/src/utils/pfloat16.h b/src/utils/pfloat16.h index 0554377eca..e35c652d79 100644 --- a/src/utils/pfloat16.h +++ b/src/utils/pfloat16.h @@ -59,21 +59,29 @@ namespace lczero { // subtracting the two bits from the input and checking for a negative result // (the subtraction works despite crossing from exponent to significand). This // is combined with the round-to-nearest addition (1<<11) into one op. -static inline uint16_t FloatToPfloat16(const float &p) { - assert(0.0f <= p && p <= 1.0f); - constexpr int32_t roundings = (1 << 11) - (3 << 28); - int32_t tmp; - std::memcpy(&tmp, &p, sizeof(float)); - tmp += roundings; - return (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12); -} -static inline float Pfloat16ToFloat(const uint16_t &p) { - // Reshift into place and set the assumed-set exponent bits. - uint32_t tmp = (static_cast<uint32_t>(p) << 12) | (3 << 28); - float ret; - std::memcpy(&ret, &tmp, sizeof(uint32_t)); - return ret; -} +class pfloat16 { + public: + pfloat16() { value = 0; } + pfloat16(const float &p) { + assert(0.0f <= p && p <= 1.0f); + constexpr int32_t roundings = (1 << 11) - (3 << 28); + int32_t tmp; + std::memcpy(&tmp, &p, sizeof(float)); + tmp += roundings; + value = (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12); + } + + operator float() const { + // Reshift into place and set the assumed-set exponent bits.
+ uint32_t tmp = (static_cast<uint32_t>(value) << 12) | (3 << 28); + float ret; + std::memcpy(&ret, &tmp, sizeof(uint32_t)); + return ret; + } + + private: + uint16_t value = 0; +}; } // namespace lczero From efcc88f79fa0797a81f39d4d4fec239cbbc0f2fb Mon Sep 17 00:00:00 2001 From: borg323 Date: Mon, 16 Dec 2024 15:09:55 +0200 Subject: [PATCH 11/12] warning fix --- src/neural/cache.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 2194a45ffa..a89fb9b2bf 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -176,7 +176,8 @@ pfloat16 CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; if (item.idx_in_parent >= 0) { pfloat16 r; - std::memcpy(&r, &item.probabilities_to_cache[move_ct], sizeof(pfloat16)); + std::memcpy(&r, (pfloat16*)&item.probabilities_to_cache[move_ct], + sizeof(pfloat16)); return r; } return item.lock->p[move_ct]; From ecb7ad88deafbabb2e0c6eb7bcd1966c1956672a Mon Sep 17 00:00:00 2001 From: borg323 Date: Mon, 16 Dec 2024 18:25:26 +0200 Subject: [PATCH 12/12] fixes --- src/neural/cache.h | 3 +-- src/utils/pfloat16.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/neural/cache.h b/src/neural/cache.h index 2ab93e4e3b..72565af12f 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -59,13 +59,12 @@ class CachingComputation { // How many inputs are not found in cache and will be forwarded to a wrapped // computation. int GetCacheMisses() const; - // Total number of times AddInput/AddInputByHash were (successfully) called. + // Total number of times AddInput was (successfully) called. int GetBatchSize() const; // Check if entry is in the cache. bool CacheLookup(const PositionHistory& history, const MoveList& moves = {}, CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. - // @hash is a hash to store/lookup it in the cache. void AddInput(const PositionHistory& history, const MoveList& moves); // Undoes last AddInput. If it was a cache miss, then it's actually not removed // from parent's batch. diff --git a/src/utils/pfloat16.h b/src/utils/pfloat16.h index e35c652d79..bdbe7c8fd6 100644 --- a/src/utils/pfloat16.h +++ b/src/utils/pfloat16.h @@ -73,6 +73,8 @@ class pfloat16 { value = (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12); } + pfloat16(const pfloat16 &) = default; + operator float() const { // Reshift into place and set the assumed-set exponent bits. uint32_t tmp = (static_cast<uint32_t>(value) << 12) | (3 << 28);
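
The round-trip behaviour of the 16-bit policy compression that the pfloat16 class wraps can be checked in isolation. The sketch below is illustration only and is not part of the patch series: it mirrors the same encode/decode arithmetic under the local names Encode and Decode (chosen here for clarity, they do not exist in lc0) and prints the error introduced by dropping the low 12 bits of the float representation.

// Standalone illustration of the pfloat16 round trip (not lc0 code).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Compress a float in [0,1] to 16 bits, mirroring the pfloat16(float) constructor.
static uint16_t Encode(float p) {
  assert(0.0f <= p && p <= 1.0f);
  constexpr int32_t kRoundings = (1 << 11) - (3 << 28);
  int32_t tmp;
  std::memcpy(&tmp, &p, sizeof(float));
  tmp += kRoundings;
  return (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12);
}

// Expand the 16 bits back to a float, mirroring pfloat16::operator float().
static float Decode(uint16_t v) {
  uint32_t tmp = (static_cast<uint32_t>(v) << 12) | (3 << 28);
  float ret;
  std::memcpy(&ret, &tmp, sizeof(uint32_t));
  return ret;
}

int main() {
  const float samples[] = {0.0f, 1e-6f, 0.015625f, 1.0f / 3.0f, 0.5f, 1.0f};
  for (float p : samples) {
    const float q = Decode(Encode(p));
    // Prints the original value, the round-trip value, and the absolute error.
    std::printf("p=%.8f  round-trip=%.8f  abs.err=%.3e\n", p, q, q - p);
  }
  return 0;
}

For inputs that are not flushed to the smallest representable code, the relative round-trip error stays within about one part in four thousand (2^-12), i.e. roughly three significant decimal digits, which is the precision the 11 retained significand bits are meant to provide.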