From 33cf47fc5ce4acc7030afdb32245c6f9d96fd2ba Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 23 Oct 2021 00:15:02 +0300 Subject: [PATCH 01/12] reduce cache size by storing compressed policy --- src/mcts/node.cc | 47 ------------------- src/mcts/node.h | 7 +-- src/mcts/search.cc | 45 +++++------------- src/mcts/search.h | 15 +----- src/mcts/stoppers/stoppers.cc | 3 +- src/neural/cache.cc | 55 ++++++++++++++-------- src/neural/cache.h | 13 +++--- src/selfplay/game.cc | 3 +- src/trainingdata/trainingdata.cc | 22 ++++----- src/trainingdata/trainingdata.h | 3 +- src/utils/pfloat16.h | 79 ++++++++++++++++++++++++++++++++ 11 files changed, 153 insertions(+), 139 deletions(-) create mode 100644 src/utils/pfloat16.h diff --git a/src/mcts/node.cc b/src/mcts/node.cc index 9b71cff091..60235caf99 100644 --- a/src/mcts/node.cc +++ b/src/mcts/node.cc @@ -131,53 +131,6 @@ Move Edge::GetMove(bool as_opponent) const { return m; } -// Policy priors (P) are stored in a compressed 16-bit format. -// -// Source values are 32-bit floats: -// * bit 31 is sign (zero means positive) -// * bit 30 is sign of exponent (zero means nonpositive) -// * bits 29..23 are value bits of exponent -// * bits 22..0 are significand bits (plus a "virtual" always-on bit: s ∈ [1,2)) -// The number is then sign * 2^exponent * significand, usually. -// See https://www.h-schmidt.net/FloatConverter/IEEE754.html for details. -// -// In compressed 16-bit value we store bits 27..12: -// * bit 31 is always off as values are always >= 0 -// * bit 30 is always off as values are always < 2 -// * bits 29..28 are only off for values < 4.6566e-10, assume they are always on -// * bits 11..0 are for higher precision, they are dropped leaving only 11 bits -// of precision -// -// When converting to compressed format, bit 11 is added to in order to make it -// a rounding rather than truncation. -// -// Out of 65556 possible values, 2047 are outside of [0,1] interval (they are in -// interval (1,2)). This is fine because the values in [0,1] are skewed towards -// 0, which is also exactly how the components of policy tend to behave (since -// they add up to 1). - -// If the two assumed-on exponent bits (3<<28) are in fact off, the input is -// rounded up to the smallest value with them on. We accomplish this by -// subtracting the two bits from the input and checking for a negative result -// (the subtraction works despite crossing from exponent to significand). This -// is combined with the round-to-nearest addition (1<<11) into one op. -void Edge::SetP(float p) { - assert(0.0f <= p && p <= 1.0f); - constexpr int32_t roundings = (1 << 11) - (3 << 28); - int32_t tmp; - std::memcpy(&tmp, &p, sizeof(float)); - tmp += roundings; - p_ = (tmp < 0) ? 0 : static_cast(tmp >> 12); -} - -float Edge::GetP() const { - // Reshift into place and set the assumed-set exponent bits. 
- uint32_t tmp = (static_cast(p_) << 12) | (3 << 28); - float ret; - std::memcpy(&ret, &tmp, sizeof(uint32_t)); - return ret; -} - std::string Edge::DebugString() const { std::ostringstream oss; oss << "Move: " << move_.as_string() << " p_: " << p_ << " GetP: " << GetP(); diff --git a/src/mcts/node.h b/src/mcts/node.h index 8fffea6a9a..c7964fb55a 100644 --- a/src/mcts/node.h +++ b/src/mcts/node.h @@ -36,10 +36,10 @@ #include "chess/board.h" #include "chess/callbacks.h" #include "chess/position.h" -#include "neural/cache.h" #include "neural/encoder.h" #include "proto/net.pb.h" #include "utils/mutex.h" +#include "utils/pfloat16.h" namespace lczero { @@ -92,8 +92,9 @@ class Edge { // Returns or sets value of Move policy prior returned from the neural net // (but can be changed by adding Dirichlet noise). Must be in [0,1]. - float GetP() const; - void SetP(float val); + float GetP() const { return Pfloat16ToFloat(p_); } + void SetP(float val) { p_ = FloatToPfloat16(val); } + void SetPCompressed(uint16_t p) { p_ = p; } // Debug information about the edge. std::string DebugString() const; diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 1936096185..0beb71e5f3 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1963,11 +1963,11 @@ bool SearchWorker::AddNodeToComputation(Node* node, bool add_if_cached, moves.emplace_back(edge.GetMove().as_nn_index(transform)); } } else { - // Cache pseudolegal moves. A bit of a waste, but faster. - const auto& pseudolegal_moves = - history_.Last().GetBoard().GeneratePseudolegalMoves(); - moves.reserve(pseudolegal_moves.size()); - for (auto iter = pseudolegal_moves.begin(), end = pseudolegal_moves.end(); + // Cache legal moves. + const auto& legal_moves = + history_.Last().GetBoard().GenerateLegalMoves(); + moves.reserve(legal_moves.size()); + for (auto iter = legal_moves.begin(), end = legal_moves.end(); iter != end; ++iter) { moves.emplace_back(iter->as_nn_index(transform)); } @@ -2098,7 +2098,9 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) { // 4. Run NN computation. // ~~~~~~~~~~~~~~~~~~~~~~ -void SearchWorker::RunNNComputation() { computation_->ComputeBlocking(); } +void SearchWorker::RunNNComputation() { + computation_->ComputeBlocking(params_.GetPolicySoftmaxTemp()); +} // 5. Retrieve NN computations (and terminal values) into nodes. // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2130,34 +2132,11 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process, node_to_process->v = -computation.GetQVal(idx_in_computation); node_to_process->d = computation.GetDVal(idx_in_computation); node_to_process->m = computation.GetMVal(idx_in_computation); - // ...and secondly, the policy data. - // Calculate maximum first. - float max_p = -std::numeric_limits::infinity(); - // Intermediate array to store values when processing policy. - // There are never more than 256 valid legal moves in any legal position. - std::array intermediate; - int counter = 0; - for (auto& edge : node->Edges()) { - float p = computation.GetPVal( - idx_in_computation, - edge.GetMove().as_nn_index(node_to_process->probability_transform)); - intermediate[counter++] = p; - max_p = std::max(max_p, p); - } - float total = 0.0; - for (int i = 0; i < counter; i++) { - // Perform softmax and take into account policy softmax temperature T. - // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T). 
- float p = - FastExp((intermediate[i] - max_p) / params_.GetPolicySoftmaxTemp()); - intermediate[i] = p; - total += p; - } - counter = 0; - // Normalize P values to add up to 1.0. - const float scale = total > 0.0f ? 1.0f / total : 1.0f; + // ...and secondly, the policy data. The cache returns compressed values after + // softmax. + int idx = 0; for (auto& edge : node->Edges()) { - edge.edge()->SetP(intermediate[counter++] * scale); + edge.edge()->SetPCompressed(computation.GetPVal(idx_in_computation, idx++)); } // Add Dirichlet noise if enabled and at root. if (params_.GetNoiseEpsilon() && node == search_->root_node_) { diff --git a/src/mcts/search.h b/src/mcts/search.h index 3c95058a7b..bae877a740 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -353,20 +353,7 @@ class SearchWorker { float GetMVal(int) const { return lock->m; } - float GetPVal(int, int move_id) const { - const auto& moves = lock->p; - - int total_count = 0; - while (total_count < moves.size()) { - // Optimization: usually moves are stored in the same order as queried. - const auto& move = moves[last_idx++]; - if (last_idx == moves.size()) last_idx = 0; - if (move.first == move_id) return move.second; - ++total_count; - } - assert(false); // Move not found. - return 0; - } + uint16_t GetPVal(int, int move_ct) const { return lock->p[move_ct]; } private: NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit, diff --git a/src/mcts/stoppers/stoppers.cc b/src/mcts/stoppers/stoppers.cc index 8d6f7ae426..42260a6e15 100644 --- a/src/mcts/stoppers/stoppers.cc +++ b/src/mcts/stoppers/stoppers.cc @@ -96,8 +96,7 @@ const size_t kAvgNodeSize = sizeof(Node) + MemoryWatchingStopper::kAvgMovesPerPosition * sizeof(Edge); const size_t kAvgCacheItemSize = NNCache::GetItemStructSize() + sizeof(CachedNNRequest) + - sizeof(CachedNNRequest::IdxAndProb) * - MemoryWatchingStopper::kAvgMovesPerPosition; + sizeof(CachedNNRequest::p) * MemoryWatchingStopper::kAvgMovesPerPosition; } // namespace MemoryWatchingStopper::MemoryWatchingStopper(int cache_size, int ram_limit_mb, diff --git a/src/neural/cache.cc b/src/neural/cache.cc index d729a562f0..56b6515d55 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -25,9 +25,14 @@ Program grant you additional permission to convey the resulting work. */ #include "neural/cache.h" + +#include #include #include +#include "utils/fastmath.h" +#include "utils/pfloat16.h" + namespace lczero { CachingComputation::CachingComputation( std::unique_ptr parent, NNCache* cache) @@ -77,22 +82,44 @@ void CachingComputation::PopLastInputHit() { batch_.pop_back(); } -void CachingComputation::ComputeBlocking() { +void CachingComputation::ComputeBlocking(float softmax_temp) { if (parent_->GetBatchSize() == 0) return; parent_->ComputeBlocking(); // Fill cache with data from NN. - for (const auto& item : batch_) { + for (auto& item : batch_) { if (item.idx_in_parent == -1) continue; auto req = std::make_unique(item.probabilities_to_cache.size()); req->q = parent_->GetQVal(item.idx_in_parent); req->d = parent_->GetDVal(item.idx_in_parent); req->m = parent_->GetMVal(item.idx_in_parent); - int idx = 0; + + // Calculate maximum first. + float max_p = -std::numeric_limits::infinity(); + // Intermediate array to store values when processing policy. + // There are never more than 256 valid legal moves in any legal position. 
+ std::array intermediate; + int counter = 0; for (auto x : item.probabilities_to_cache) { - req->p[idx++] = - std::make_pair(x, parent_->GetPVal(item.idx_in_parent, x)); + float p = parent_->GetPVal(item.idx_in_parent, x); + intermediate[counter++] = p; + max_p = std::max(max_p, p); + } + float total = 0.0; + for (int i = 0; i < counter; i++) { + // Perform softmax and take into account policy softmax temperature T. + // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T). + float p = FastExp((intermediate[i] - max_p) / softmax_temp); + intermediate[i] = p; + total += p; + } + // Normalize P values to add up to 1.0. + const float scale = total > 0.0f ? 1.0f / total : 1.0f; + for (size_t ct = 0; ct < item.probabilities_to_cache.size(); ct++) { + uint16_t p = FloatToPfloat16(intermediate[ct] * scale); + req->p[ct] = p; + item.probabilities_to_cache[ct] = p; } cache_->Insert(item.hash, std::move(req)); } @@ -116,22 +143,12 @@ float CachingComputation::GetMVal(int sample) const { return item.lock->m; } -float CachingComputation::GetPVal(int sample, int move_id) const { +uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; - if (item.idx_in_parent >= 0) - return parent_->GetPVal(item.idx_in_parent, move_id); - const auto& moves = item.lock->p; - - int total_count = 0; - while (total_count < moves.size()) { - // Optimization: usually moves are stored in the same order as queried. - const auto& move = moves[item.last_idx++]; - if (item.last_idx == moves.size()) item.last_idx = 0; - if (move.first == move_id) return move.second; - ++total_count; + if (item.idx_in_parent >= 0) { + return item.probabilities_to_cache[move_ct]; } - assert(false); // Move not found. - return 0; + return item.lock->p[move_ct]; } } // namespace lczero diff --git a/src/neural/cache.h b/src/neural/cache.h index 207e0fe6e4..fbe6deedd7 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -34,12 +34,11 @@ namespace lczero { struct CachedNNRequest { CachedNNRequest(size_t size) : p(size) {} - typedef std::pair IdxAndProb; float q; float d; float m; - // TODO(mooskagh) Don't really need index if using perfect hash. - SmallArray p; + // Store p only for valid moves. + SmallArray p; }; typedef HashKeyedCache NNCache; @@ -73,15 +72,15 @@ class CachingComputation { // from parent's batch. void PopLastInputHit(); // Do the computation. - void ComputeBlocking(); + void ComputeBlocking(float softmax_temp); // Returns Q value of @sample. float GetQVal(int sample) const; // Returns probability of draw if NN has WDL value head. float GetDVal(int sample) const; // Returns estimated remaining moves. float GetMVal(int sample) const; - // Returns P value @move_id of @sample. - float GetPVal(int sample, int move_id) const; + // Returns compressed P value @move_id of @sample. + uint16_t GetPVal(int sample, int move_ct) const; // Pops last input from the computation. Only allowed for inputs which were // cached. void PopCacheHit(); @@ -94,8 +93,8 @@ class CachingComputation { uint64_t hash; NNCacheLock lock; int idx_in_parent = -1; + // Initially the move indices, after computation the policy values. 
std::vector probabilities_to_cache; - mutable int last_idx = 0; }; std::unique_ptr parent_; diff --git a/src/selfplay/game.cc b/src/selfplay/game.cc index fad43bbba1..0d5b677705 100644 --- a/src/selfplay/game.cc +++ b/src/selfplay/game.cc @@ -268,7 +268,8 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training, search_->GetCachedNNEval(tree_[idx]->GetCurrentHead()); training_data_.Add(tree_[idx]->GetCurrentHead(), tree_[idx]->GetPositionHistory(), best_eval, - played_eval, best_is_proof, best_move, move, nneval); + played_eval, best_is_proof, best_move, move, nneval, + search_->GetParams().GetPolicySoftmaxTemp()); } // Must reset the search before mutating the tree. search_.reset(); diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc index 9dcb0e17b7..7a2f264662 100644 --- a/src/trainingdata/trainingdata.cc +++ b/src/trainingdata/trainingdata.cc @@ -114,7 +114,8 @@ void V6TrainingDataArray::Write(TrainingDataWriter* writer, GameResult result, void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, Eval best_eval, Eval played_eval, bool best_is_proven, Move best_move, - Move played_move, const NNCacheLock& nneval) { + Move played_move, const NNCacheLock& nneval, + float softmax_temp) { V6TrainingData result; const auto& position = history.Last(); @@ -146,24 +147,20 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, // Set moves probabilities according to their relative amount of visits. // Compute Kullback-Leibler divergence in nats (between policy and visits). float kld_sum = 0; - float max_p = -std::numeric_limits::infinity(); std::vector intermediate; if (nneval) { - int last_idx = 0; + // The cache stores policies in GenerateLegalMoves() order. + auto legal_moves = history.Last().GetBoard().GenerateLegalMoves(); for (const auto& child : node->Edges()) { - auto nn_idx = child.edge()->GetMove().as_nn_index(transform); + auto move = child.edge()->GetMove(); float p = 0; - for (int i = 0; i < nneval->p.size(); i++) { - // Optimization: usually moves are stored in the same order as queried. - const auto& move = nneval->p[last_idx++]; - if (last_idx == nneval->p.size()) last_idx = 0; - if (move.first == nn_idx) { - p = move.second; + for (size_t i = 0; i < legal_moves.size(); i++) { + if (move == legal_moves[i]) { + p = Pfloat16ToFloat(nneval->p[i]); break; } } intermediate.emplace_back(p); - max_p = std::max(max_p, p); } } float total = 0.0; @@ -172,7 +169,8 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, auto nn_idx = child.edge()->GetMove().as_nn_index(transform); float fracv = total_n > 0 ? child.GetN() / static_cast(total_n) : 1; if (nneval) { - float P = std::exp(*it - max_p); + // Undo any softmax temperature in the cached data. + float P = std::pow(*it, softmax_temp); if (fracv > 0) { kld_sum += fracv * std::log(fracv / P); } diff --git a/src/trainingdata/trainingdata.h b/src/trainingdata/trainingdata.h index 6fc3b3b8a5..601b8a80d9 100644 --- a/src/trainingdata/trainingdata.h +++ b/src/trainingdata/trainingdata.h @@ -28,6 +28,7 @@ #pragma once #include "mcts/node.h" +#include "neural/cache.h" #include "trainingdata/writer.h" namespace lczero { @@ -98,7 +99,7 @@ class V6TrainingDataArray { // Add a chunk. 
void Add(const Node* node, const PositionHistory& history, Eval best_eval, Eval played_eval, bool best_is_proven, Move best_move, - Move played_move, const NNCacheLock& nneval); + Move played_move, const NNCacheLock& nneval, float softmax_temp); // Writes training data to a file. void Write(TrainingDataWriter* writer, GameResult result, diff --git a/src/utils/pfloat16.h b/src/utils/pfloat16.h new file mode 100644 index 0000000000..0554377eca --- /dev/null +++ b/src/utils/pfloat16.h @@ -0,0 +1,79 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2021 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ + +#pragma once + +#include +#include + +namespace lczero { +// Compressed 16-bit floating point format for probability values. +// Optimised for representing numbers in the [0,1] range. +// +// Source values are 32-bit floats: +// * bit 31 is sign (zero means positive) +// * bit 30 is sign of exponent (zero means nonpositive) +// * bits 29..23 are value bits of exponent +// * bits 22..0 are significand bits (plus a "virtual" always-on bit: s ∈ [1,2)) +// The number is then sign * 2^exponent * significand, usually. +// See https://www.h-schmidt.net/FloatConverter/IEEE754.html for details. +// +// In compressed 16-bit value we store bits 27..12: +// * bit 31 is always off as values are always >= 0 +// * bit 30 is always off as values are always < 2 +// * bits 29..28 are only off for values < 4.6566e-10, assume they are always on +// * bits 11..0 are for higher precision, they are dropped leaving only 11 bits +// of precision +// +// Out of 65556 possible values, 2047 are outside of [0,1] interval (they are in +// interval (1,2)). + +// When converting to compressed format, bit 11 is added to in order to make it +// a rounding rather than truncation. +// If the two assumed-on exponent bits (3<<28) are in fact off, the input is +// rounded up to the smallest value with them on. We accomplish this by +// subtracting the two bits from the input and checking for a negative result +// (the subtraction works despite crossing from exponent to significand). This +// is combined with the round-to-nearest addition (1<<11) into one op. +static inline uint16_t FloatToPfloat16(const float &p) { + assert(0.0f <= p && p <= 1.0f); + constexpr int32_t roundings = (1 << 11) - (3 << 28); + int32_t tmp; + std::memcpy(&tmp, &p, sizeof(float)); + tmp += roundings; + return (tmp < 0) ? 
0 : static_cast<uint16_t>(tmp >> 12);
+}
+
+static inline float Pfloat16ToFloat(const uint16_t &p) {
+ // Reshift into place and set the assumed-set exponent bits.
+ uint32_t tmp = (static_cast<uint32_t>(p) << 12) | (3 << 28);
+ float ret;
+ std::memcpy(&ret, &tmp, sizeof(uint32_t));
+ return ret;
+}
+
+} // namespace lczero

From b3ba1596d1c1f62c8824095a8588f10bf0ec5eb8 Mon Sep 17 00:00:00 2001
From: borg323
Date: Mon, 30 May 2022 13:29:37 +0300
Subject: [PATCH 02/12] guard against hash collision

---
 src/neural/cache.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/neural/cache.cc b/src/neural/cache.cc
index 56b6515d55..d3751a27ba 100644
--- a/src/neural/cache.cc
+++ b/src/neural/cache.cc
@@ -146,8 +146,14 @@ float CachingComputation::GetMVal(int sample) const {
 uint16_t CachingComputation::GetPVal(int sample, int move_ct) const {
 auto& item = batch_[sample];
 if (item.idx_in_parent >= 0) {
+ if (move_ct > static_cast<int>(item.probabilities_to_cache.size())) {
+ return 0;  // Hash collision.
+ }
 return item.probabilities_to_cache[move_ct];
 }
+ if (move_ct > item.lock->p.size()) {
+ return 0;  // Hash collision.
+ }
 return item.lock->p[move_ct];
 }

From 7f96e91780601ed614caea3928215d8ed70d41d5 Mon Sep 17 00:00:00 2001
From: borg323
Date: Sat, 1 Jun 2024 00:06:19 +0300
Subject: [PATCH 03/12] move NN encoding to the cache

---
 src/mcts/search.cc  | 44 ++++++++++++--------------------------------
 src/mcts/search.h   |  5 ++---
 src/neural/cache.cc | 26 ++++++++++++++++++++------
 src/neural/cache.h  | 13 ++++++++-----
 4 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/src/mcts/search.cc b/src/mcts/search.cc
index fc37144bed..490e624ddc 100644
--- a/src/mcts/search.cc
+++ b/src/mcts/search.cc
@@ -1258,8 +1258,9 @@ void SearchWorker::ExecuteOneIteration() {
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::InitializeIteration(
 std::unique_ptr<NetworkComputation> computation) {
- computation_ = std::make_unique<CachingComputation>(std::move(computation),
- search_->cache_);
+ computation_ = std::make_unique<CachingComputation>(
+ std::move(computation), search_->network_->GetCapabilities().input_format,
+ params_.GetHistoryFill(), search_->cache_);
 computation_->Reserve(target_minibatch_size_);
 minibatch_.clear();
 minibatch_.reserve(2 * target_minibatch_size_);
@@ -1424,9 +1425,8 @@ void SearchWorker::GatherMinibatch() {
 computation_->AddInputByHash(minibatch_[i].hash,
 std::move(minibatch_[i].lock));
 } else {
- computation_->AddInput(minibatch_[i].hash,
- std::move(minibatch_[i].input_planes),
- std::move(minibatch_[i].probabilities_to_cache));
+ computation_->AddInput(minibatch_[i].hash, minibatch_[i].history,
+ std::move(minibatch_[i].moves));
 }
 }
@@ -1478,21 +1478,12 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx,
 picked_node.lock = NNCacheLock(search_->cache_, hash);
 picked_node.is_cache_hit = picked_node.lock;
 if (!picked_node.is_cache_hit) {
- int transform;
- picked_node.input_planes = EncodePositionForNN(
- search_->network_->GetCapabilities().input_format, history, 8,
- params_.GetHistoryFill(), &transform);
- picked_node.probability_transform = transform;
-
- std::vector<uint16_t>& moves = picked_node.probabilities_to_cache;
 // Legal moves are known, use them.
- moves.reserve(node->GetNumEdges()); + picked_node.moves.reserve(node->GetNumEdges()); for (const auto& edge : node->Edges()) { - moves.emplace_back(edge.GetMove().as_nn_index(transform)); + picked_node.moves.emplace_back(edge.GetMove()); } - } else { - picked_node.probability_transform = TransformForPosition( - search_->network_->GetCapabilities().input_format, history); + picked_node.history = history; } } } @@ -2041,31 +2032,20 @@ bool SearchWorker::AddNodeToComputation(Node* node) { if (search_->cache_->ContainsKey(hash)) { return true; } - int transform; - auto planes = - EncodePositionForNN(search_->network_->GetCapabilities().input_format, - history_, 8, params_.GetHistoryFill(), &transform); - - std::vector moves; + MoveList moves; if (node && node->HasChildren()) { // Legal moves are known, use them. moves.reserve(node->GetNumEdges()); for (const auto& edge : node->Edges()) { - moves.emplace_back(edge.GetMove().as_nn_index(transform)); + moves.emplace_back(edge.GetMove()); } } else { // Cache legal moves. - const auto& legal_moves = - history_.Last().GetBoard().GenerateLegalMoves(); - moves.reserve(legal_moves.size()); - for (auto iter = legal_moves.begin(), end = legal_moves.end(); - iter != end; ++iter) { - moves.emplace_back(iter->as_nn_index(transform)); - } + moves = history_.Last().GetBoard().GenerateLegalMoves(); } - computation_->AddInput(hash, std::move(planes), std::move(moves)); + computation_->AddInput(hash, history_, std::move(moves)); return false; } diff --git a/src/mcts/search.h b/src/mcts/search.h index da631cd306..5ffca3a6b0 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -331,7 +331,6 @@ class SearchWorker { bool nn_queried = false; bool is_cache_hit = false; bool is_collision = false; - int probability_transform = 0; // Details only populated in the multigather path. @@ -341,8 +340,8 @@ class SearchWorker { // Details that are filled in as we go. 
uint64_t hash; NNCacheLock lock; - std::vector probabilities_to_cache; - InputPlanes input_planes; + MoveList moves; + PositionHistory history; mutable int last_idx = 0; bool ooo_completed = false; diff --git a/src/neural/cache.cc b/src/neural/cache.cc index d3751a27ba..6b72bd34cf 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -30,13 +30,19 @@ #include #include +#include "neural/encoder.h" #include "utils/fastmath.h" #include "utils/pfloat16.h" namespace lczero { CachingComputation::CachingComputation( - std::unique_ptr parent, NNCache* cache) - : parent_(std::move(parent)), cache_(cache) {} + std::unique_ptr parent, + pblczero::NetworkFormat::InputFormat input_format, + lczero::FillEmptyHistory history_fill, NNCache* cache) + : parent_(std::move(parent)), + input_format_(input_format), + history_fill_(history_fill), + cache_(cache) {} int CachingComputation::GetCacheMisses() const { return parent_->GetBatchSize(); @@ -65,14 +71,22 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } -void CachingComputation::AddInput( - uint64_t hash, InputPlanes&& input, - std::vector&& probabilities_to_cache) { +void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, + MoveList&& moves) { if (AddInputByHash(hash)) return; + + int transform; + auto input = + EncodePositionForNN(input_format_, history, 8, history_fill_, &transform); + std::vector moves_as_nn_index; + moves_as_nn_index.reserve(moves.size()); + for (auto iter = moves.begin(), end = moves.end(); iter != end; ++iter) { + moves_as_nn_index.emplace_back(iter->as_nn_index(transform)); + } batch_.emplace_back(); batch_.back().hash = hash; batch_.back().idx_in_parent = parent_->GetBatchSize(); - batch_.back().probabilities_to_cache = probabilities_to_cache; + batch_.back().probabilities_to_cache = std::move(moves_as_nn_index); parent_->AddInput(std::move(input)); } diff --git a/src/neural/cache.h b/src/neural/cache.h index fbe6deedd7..f1d4a050b1 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -26,6 +26,7 @@ */ #pragma once +#include "mcts/node.h" #include "neural/network.h" #include "utils/cache.h" #include "utils/smallarray.h" @@ -50,7 +51,8 @@ typedef HashKeyedCacheLock NNCacheLock; class CachingComputation { public: CachingComputation(std::unique_ptr parent, - NNCache* cache); + pblczero::NetworkFormat::InputFormat input_format, + lczero::FillEmptyHistory history_fill, NNCache* cache); // How many inputs are not found in cache and will be forwarded to a wrapped // computation. @@ -63,11 +65,10 @@ class CachingComputation { // Adds input by hash with existing lock. Assumes the given lock holds a real // reference. void AddInputByHash(uint64_t hash, NNCacheLock&& lock); - // Adds a sample to the batch. + // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. - // @probabilities_to_cache is which indices of policy head to store. - void AddInput(uint64_t hash, InputPlanes&& input, - std::vector&& probabilities_to_cache); + void AddInput(uint64_t hash, const PositionHistory& history, + MoveList&& moves); // Undos last AddInput. If it was a cache miss, the it's actually not removed // from parent's batch. 
void PopLastInputHit(); @@ -98,6 +99,8 @@ class CachingComputation { }; std::unique_ptr parent_; + pblczero::NetworkFormat::InputFormat input_format_; + lczero::FillEmptyHistory history_fill_; NNCache* cache_; std::vector batch_; }; From cd7782b7ea62f7dcc89b2ea496d4fadbf746dac4 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 1 Jun 2024 00:20:29 +0300 Subject: [PATCH 04/12] move softmax temp to CachingComputation constructor --- src/mcts/search.cc | 7 +++---- src/neural/cache.cc | 7 ++++--- src/neural/cache.h | 6 ++++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 490e624ddc..fdc408d78b 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1260,7 +1260,8 @@ void SearchWorker::InitializeIteration( std::unique_ptr computation) { computation_ = std::make_unique( std::move(computation), search_->network_->GetCapabilities().input_format, - params_.GetHistoryFill(), search_->cache_); + params_.GetHistoryFill(), params_.GetPolicySoftmaxTemp(), + search_->cache_); computation_->Reserve(target_minibatch_size_); minibatch_.clear(); minibatch_.reserve(2 * target_minibatch_size_); @@ -2169,9 +2170,7 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) { // 4. Run NN computation. // ~~~~~~~~~~~~~~~~~~~~~~ -void SearchWorker::RunNNComputation() { - computation_->ComputeBlocking(params_.GetPolicySoftmaxTemp()); -} +void SearchWorker::RunNNComputation() { computation_->ComputeBlocking(); } // 5. Retrieve NN computations (and terminal values) into nodes. // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 6b72bd34cf..1e0d155725 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -38,10 +38,11 @@ namespace lczero { CachingComputation::CachingComputation( std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, - lczero::FillEmptyHistory history_fill, NNCache* cache) + lczero::FillEmptyHistory history_fill, float softmax_temp, NNCache* cache) : parent_(std::move(parent)), input_format_(input_format), history_fill_(history_fill), + softmax_temp_(softmax_temp), cache_(cache) {} int CachingComputation::GetCacheMisses() const { @@ -96,7 +97,7 @@ void CachingComputation::PopLastInputHit() { batch_.pop_back(); } -void CachingComputation::ComputeBlocking(float softmax_temp) { +void CachingComputation::ComputeBlocking() { if (parent_->GetBatchSize() == 0) return; parent_->ComputeBlocking(); @@ -124,7 +125,7 @@ void CachingComputation::ComputeBlocking(float softmax_temp) { for (int i = 0; i < counter; i++) { // Perform softmax and take into account policy softmax temperature T. // Note that we want to calculate (exp(p-max_p))^(1/T) = exp((p-max_p)/T). - float p = FastExp((intermediate[i] - max_p) / softmax_temp); + float p = FastExp((intermediate[i] - max_p) / softmax_temp_); intermediate[i] = p; total += p; } diff --git a/src/neural/cache.h b/src/neural/cache.h index f1d4a050b1..45904c337f 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -52,7 +52,8 @@ class CachingComputation { public: CachingComputation(std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, - lczero::FillEmptyHistory history_fill, NNCache* cache); + lczero::FillEmptyHistory history_fill, float softmax_temp, + NNCache* cache); // How many inputs are not found in cache and will be forwarded to a wrapped // computation. @@ -73,7 +74,7 @@ class CachingComputation { // from parent's batch. 
void PopLastInputHit(); // Do the computation. - void ComputeBlocking(float softmax_temp); + void ComputeBlocking(); // Returns Q value of @sample. float GetQVal(int sample) const; // Returns probability of draw if NN has WDL value head. @@ -101,6 +102,7 @@ class CachingComputation { std::unique_ptr parent_; pblczero::NetworkFormat::InputFormat input_format_; lczero::FillEmptyHistory history_fill_; + float softmax_temp_; NNCache* cache_; std::vector batch_; }; From 185ce8a1c18ec10900b9b813a6fe9f9fa0d0ef50 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 9 Jun 2024 00:05:47 +0300 Subject: [PATCH 05/12] do move generation before ExtendNode --- src/mcts/search.cc | 41 ++++++++++++++++++----------------------- src/mcts/search.h | 4 ++-- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index fdc408d78b..4b11a28b8c 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1471,7 +1471,15 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, // of the game), it means that we already visited this node before. if (picked_node.IsExtendable()) { // Node was never visited, extend it. - ExtendNode(node, picked_node.depth, picked_node.moves_to_visit, &history); + // Initialize position sequence with pre-move position. + history.Trim(search_->played_history_.GetLength()); + for (size_t i = 0; i < picked_node.moves_to_visit.size(); i++) { + history.Append(picked_node.moves_to_visit[i]); + } + + picked_node.moves = history.Last().GetBoard().GenerateLegalMoves(); + + ExtendNode(node, picked_node.depth, history, picked_node.moves); if (!node->IsTerminal()) { picked_node.nn_queried = true; const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); @@ -1479,11 +1487,6 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.lock = NNCacheLock(search_->cache_, hash); picked_node.is_cache_hit = picked_node.lock; if (!picked_node.is_cache_hit) { - // Legal moves are known, use them. - picked_node.moves.reserve(node->GetNumEdges()); - for (const auto& edge : node->Edges()) { - picked_node.moves.emplace_back(edge.GetMove()); - } picked_node.history = history; } } @@ -1931,19 +1934,11 @@ void SearchWorker::PickNodesToExtendTask( } void SearchWorker::ExtendNode(Node* node, int depth, - const std::vector& moves_to_node, - PositionHistory* history) { - // Initialize position sequence with pre-move position. - history->Trim(search_->played_history_.GetLength()); - for (size_t i = 0; i < moves_to_node.size(); i++) { - history->Append(moves_to_node[i]); - } - + const PositionHistory& history, + const MoveList& legal_moves) { // We don't need the mutex because other threads will see that N=0 and // N-in-flight=1 and will not touch this node. - const auto& board = history->Last().GetBoard(); - auto legal_moves = board.GenerateLegalMoves(); - + const auto& board = history.Last().GetBoard(); // Check whether it's a draw/lose by position. Importantly, we must check // these before doing the by-rule checks below. if (legal_moves.empty()) { @@ -1964,12 +1959,12 @@ void SearchWorker::ExtendNode(Node* node, int depth, return; } - if (history->Last().GetRule50Ply() >= 100) { + if (history.Last().GetRule50Ply() >= 100) { node->MakeTerminal(GameResult::DRAW); return; } - const auto repetitions = history->Last().GetRepetitions(); + const auto repetitions = history.Last().GetRepetitions(); // Mark two-fold repetitions as draws according to settings. // Depth starts with 1 at root, so number of plies in PV is depth - 1. 
if (repetitions >= 2) { @@ -1977,8 +1972,8 @@ void SearchWorker::ExtendNode(Node* node, int depth, return; } else if (repetitions == 1 && depth - 1 >= 4 && params_.GetTwoFoldDraws() && - depth - 1 >= history->Last().GetPliesSincePrevRepetition()) { - const auto cycle_length = history->Last().GetPliesSincePrevRepetition(); + depth - 1 >= history.Last().GetPliesSincePrevRepetition()) { + const auto cycle_length = history.Last().GetPliesSincePrevRepetition(); // use plies since first repetition as moves left; exact if forced draw. node->MakeTerminal(GameResult::DRAW, (float)cycle_length, Node::Terminal::TwoFold); @@ -1988,12 +1983,12 @@ void SearchWorker::ExtendNode(Node* node, int depth, // Neither by-position or by-rule termination, but maybe it's a TB position. if (search_->syzygy_tb_ && !search_->root_is_in_dtz_ && board.castlings().no_legal_castle() && - history->Last().GetRule50Ply() == 0 && + history.Last().GetRule50Ply() == 0 && (board.ours() | board.theirs()).count() <= search_->syzygy_tb_->max_cardinality()) { ProbeState state; const WDLScore wdl = - search_->syzygy_tb_->probe_wdl(history->Last(), &state); + search_->syzygy_tb_->probe_wdl(history.Last(), &state); // Only fail state means the WDL is wrong, probe_wdl may produce correct // result with a stat other than OK. if (state != FAIL) { diff --git a/src/mcts/search.h b/src/mcts/search.h index 5ffca3a6b0..5067db5b33 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -441,8 +441,8 @@ class SearchWorker { void EnsureNodeTwoFoldCorrectForDepth(Node* node, int depth); void ProcessPickedTask(int batch_start, int batch_end, TaskWorkspace* workspace); - void ExtendNode(Node* node, int depth, const std::vector& moves_to_add, - PositionHistory* history); + void ExtendNode(Node* node, int depth, const PositionHistory& history, + const MoveList& legal_moves); template void FetchSingleNodeResult(NodeToProcess* node_to_process, const Computation& computation, From 72a77c5fbe1001fa159fcb3eac397e4b64a52155 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 8 Jun 2024 23:00:39 +0300 Subject: [PATCH 06/12] cleaner cache interface --- src/mcts/search.cc | 22 +++++++--------------- src/mcts/search.h | 10 +++++----- src/neural/cache.cc | 18 ++++++++++++++++-- src/neural/cache.h | 21 ++++++++++++--------- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 4b11a28b8c..86b7e6b70c 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1421,14 +1421,8 @@ void SearchWorker::GatherMinibatch() { // There are no OOO though. // Also terminals when OOO is disabled. if (!minibatch_[i].nn_queried) continue; - if (minibatch_[i].is_cache_hit) { - // Since minibatch_[i] holds cache lock, this is guaranteed to succeed. - computation_->AddInputByHash(minibatch_[i].hash, - std::move(minibatch_[i].lock)); - } else { - computation_->AddInput(minibatch_[i].hash, minibatch_[i].history, - std::move(minibatch_[i].moves)); - } + computation_->AddInput(minibatch_[i].hash, minibatch_[i].history, + minibatch_[i].moves); } // Check for stop at the end so we have at least one node. 
@@ -1484,11 +1478,9 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.nn_queried = true; const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); picked_node.hash = hash; - picked_node.lock = NNCacheLock(search_->cache_, hash); - picked_node.is_cache_hit = picked_node.lock; - if (!picked_node.is_cache_hit) { - picked_node.history = history; - } + picked_node.is_cache_hit = + computation_->CacheLookup(hash, &picked_node.entry); + picked_node.history = history; } } if (params_.GetOutOfOrderEval() && picked_node.CanEvalOutOfOrder()) { @@ -2025,7 +2017,7 @@ void SearchWorker::ExtendNode(Node* node, int depth, // Returns whether node was already in cache. bool SearchWorker::AddNodeToComputation(Node* node) { const auto hash = history_.HashLast(params_.GetCacheHistoryLength() + 1); - if (search_->cache_->ContainsKey(hash)) { + if (computation_->CacheLookup(hash)) { return true; } MoveList moves; @@ -2041,7 +2033,7 @@ bool SearchWorker::AddNodeToComputation(Node* node) { moves = history_.Last().GetBoard().GenerateLegalMoves(); } - computation_->AddInput(hash, history_, std::move(moves)); + computation_->AddInput(hash, history_, moves); return false; } diff --git a/src/mcts/search.h b/src/mcts/search.h index 5067db5b33..8f99fa9b56 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -339,7 +339,7 @@ class SearchWorker { // Details that are filled in as we go. uint64_t hash; - NNCacheLock lock; + CachedNNRequest entry; MoveList moves; PositionHistory history; mutable int last_idx = 0; @@ -360,13 +360,13 @@ class SearchWorker { // Methods to allow NodeToProcess to conform as a 'Computation'. Only safe // to call if is_cache_hit is true in the multigather path. - float GetQVal(int) const { return lock->q; } + float GetQVal(int) const { return entry.q; } - float GetDVal(int) const { return lock->d; } + float GetDVal(int) const { return entry.d; } - float GetMVal(int) const { return lock->m; } + float GetMVal(int) const { return entry.m; } - uint16_t GetPVal(int, int move_ct) const { return lock->p[move_ct]; } + uint16_t GetPVal(int, int move_ct) const { return entry.p[move_ct]; } private: NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit, diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 1e0d155725..40c4fbb636 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -72,8 +72,22 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } +bool CachingComputation::CacheLookup(uint64_t hash, CachedNNRequest* entry) { + NNCacheLock lock = NNCacheLock(cache_, hash); + if (!lock) return false; + if (entry != nullptr) { + entry->q = lock->q; + entry->d = lock->d; + entry->m = lock->m; + entry->p.clear(); + entry->p.resize(lock->p.size()); + for (size_t i = 0; i < lock->p.size(); i++) entry->p[i] = lock->p[i]; + } + return true; +} + void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, - MoveList&& moves) { + const MoveList& moves) { if (AddInputByHash(hash)) return; int transform; @@ -166,7 +180,7 @@ uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { } return item.probabilities_to_cache[move_ct]; } - if (move_ct > item.lock->p.size()) { + if (static_cast(move_ct) > item.lock->p.size()) { return 0; // Hash collision. 
} return item.lock->p[move_ct]; diff --git a/src/neural/cache.h b/src/neural/cache.h index 45904c337f..8e3e916cc7 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -34,12 +34,12 @@ namespace lczero { struct CachedNNRequest { - CachedNNRequest(size_t size) : p(size) {} + CachedNNRequest(size_t size = 0) : p(size) {} float q; float d; float m; // Store p only for valid moves. - SmallArray p; + std::vector p; }; typedef HashKeyedCache NNCache; @@ -60,16 +60,12 @@ class CachingComputation { int GetCacheMisses() const; // Total number of times AddInput/AddInputByHash were (successfully) called. int GetBatchSize() const; - // Adds input by hash only. If that hash is not in cache, returns false - // and does nothing. Otherwise adds. - bool AddInputByHash(uint64_t hash); - // Adds input by hash with existing lock. Assumes the given lock holds a real - // reference. - void AddInputByHash(uint64_t hash, NNCacheLock&& lock); + // Check if entry is in the cache. + bool CacheLookup(uint64_t hash, CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. void AddInput(uint64_t hash, const PositionHistory& history, - MoveList&& moves); + const MoveList& moves); // Undos last AddInput. If it was a cache miss, the it's actually not removed // from parent's batch. void PopLastInputHit(); @@ -91,6 +87,13 @@ class CachingComputation { void Reserve(int batch_size) { batch_.reserve(batch_size); } private: + // Adds input by hash only. If that hash is not in cache, returns false + // and does nothing. Otherwise adds. + bool AddInputByHash(uint64_t hash); + // Adds input by hash with existing lock. Assumes the given lock holds a real + // reference. + void AddInputByHash(uint64_t hash, NNCacheLock&& lock); + struct WorkItem { uint64_t hash; NNCacheLock lock; From 928660d79afe438e72ad15ac68d11a596fc4d8ea Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 8 Jun 2024 23:36:49 +0300 Subject: [PATCH 07/12] really guard against hash collisions --- src/mcts/search.cc | 4 ++-- src/neural/cache.cc | 32 ++++++++++---------------------- src/neural/cache.h | 10 ++-------- 3 files changed, 14 insertions(+), 32 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 86b7e6b70c..d14c938ca8 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1478,8 +1478,8 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.nn_queried = true; const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); picked_node.hash = hash; - picked_node.is_cache_hit = - computation_->CacheLookup(hash, &picked_node.entry); + picked_node.is_cache_hit = computation_->CacheLookup( + hash, picked_node.moves, &picked_node.entry); picked_node.history = history; } } diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 40c4fbb636..3601c04e8e 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -51,20 +51,6 @@ int CachingComputation::GetCacheMisses() const { int CachingComputation::GetBatchSize() const { return batch_.size(); } -bool CachingComputation::AddInputByHash(uint64_t hash) { - NNCacheLock lock(cache_, hash); - if (!lock) return false; - AddInputByHash(hash, std::move(lock)); - return true; -} - -void CachingComputation::AddInputByHash(uint64_t hash, NNCacheLock&& lock) { - assert(lock); - batch_.emplace_back(); - batch_.back().lock = std::move(lock); - batch_.back().hash = hash; -} - void CachingComputation::PopCacheHit() { assert(!batch_.empty()); 
assert(batch_.back().lock); @@ -72,10 +58,12 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } -bool CachingComputation::CacheLookup(uint64_t hash, CachedNNRequest* entry) { +bool CachingComputation::CacheLookup(uint64_t hash, const MoveList& moves, + CachedNNRequest* entry) { NNCacheLock lock = NNCacheLock(cache_, hash); if (!lock) return false; if (entry != nullptr) { + if (moves.size() != lock->p.size()) return false; entry->q = lock->q; entry->d = lock->d; entry->m = lock->m; @@ -88,7 +76,13 @@ bool CachingComputation::CacheLookup(uint64_t hash, CachedNNRequest* entry) { void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, const MoveList& moves) { - if (AddInputByHash(hash)) return; + NNCacheLock lock(cache_, hash); + if (lock && moves.size() == lock->p.size()) { + batch_.emplace_back(); + batch_.back().lock = std::move(lock); + batch_.back().hash = hash; + return; + } int transform; auto input = @@ -175,14 +169,8 @@ float CachingComputation::GetMVal(int sample) const { uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; if (item.idx_in_parent >= 0) { - if (move_ct > static_cast(item.probabilities_to_cache.size())) { - return 0; // Hash collision. - } return item.probabilities_to_cache[move_ct]; } - if (static_cast(move_ct) > item.lock->p.size()) { - return 0; // Hash collision. - } return item.lock->p[move_ct]; } diff --git a/src/neural/cache.h b/src/neural/cache.h index 8e3e916cc7..dee32bc82c 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -61,7 +61,8 @@ class CachingComputation { // Total number of times AddInput/AddInputByHash were (successfully) called. int GetBatchSize() const; // Check if entry is in the cache. - bool CacheLookup(uint64_t hash, CachedNNRequest* entry = nullptr); + bool CacheLookup(uint64_t hash, const MoveList& moves = {}, + CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. void AddInput(uint64_t hash, const PositionHistory& history, @@ -87,13 +88,6 @@ class CachingComputation { void Reserve(int batch_size) { batch_.reserve(batch_size); } private: - // Adds input by hash only. If that hash is not in cache, returns false - // and does nothing. Otherwise adds. - bool AddInputByHash(uint64_t hash); - // Adds input by hash with existing lock. Assumes the given lock holds a real - // reference. 
- void AddInputByHash(uint64_t hash, NNCacheLock&& lock); - struct WorkItem { uint64_t hash; NNCacheLock lock; From 8369d5265140f919097de6eb5cc2be2e89c99d54 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 9 Jun 2024 01:37:28 +0300 Subject: [PATCH 08/12] remove hash from cache interface --- src/mcts/search.cc | 14 +++++--------- src/mcts/search.h | 1 - src/neural/cache.cc | 13 +++++++++---- src/neural/cache.h | 8 ++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index d14c938ca8..0260944a24 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1261,7 +1261,7 @@ void SearchWorker::InitializeIteration( computation_ = std::make_unique( std::move(computation), search_->network_->GetCapabilities().input_format, params_.GetHistoryFill(), params_.GetPolicySoftmaxTemp(), - search_->cache_); + params_.GetCacheHistoryLength() + 1, search_->cache_); computation_->Reserve(target_minibatch_size_); minibatch_.clear(); minibatch_.reserve(2 * target_minibatch_size_); @@ -1421,8 +1421,7 @@ void SearchWorker::GatherMinibatch() { // There are no OOO though. // Also terminals when OOO is disabled. if (!minibatch_[i].nn_queried) continue; - computation_->AddInput(minibatch_[i].hash, minibatch_[i].history, - minibatch_[i].moves); + computation_->AddInput(minibatch_[i].history, minibatch_[i].moves); } // Check for stop at the end so we have at least one node. @@ -1476,10 +1475,8 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, ExtendNode(node, picked_node.depth, history, picked_node.moves); if (!node->IsTerminal()) { picked_node.nn_queried = true; - const auto hash = history.HashLast(params_.GetCacheHistoryLength() + 1); - picked_node.hash = hash; picked_node.is_cache_hit = computation_->CacheLookup( - hash, picked_node.moves, &picked_node.entry); + history, picked_node.moves, &picked_node.entry); picked_node.history = history; } } @@ -2016,8 +2013,7 @@ void SearchWorker::ExtendNode(Node* node, int depth, // Returns whether node was already in cache. bool SearchWorker::AddNodeToComputation(Node* node) { - const auto hash = history_.HashLast(params_.GetCacheHistoryLength() + 1); - if (computation_->CacheLookup(hash)) { + if (computation_->CacheLookup(history_)) { return true; } MoveList moves; @@ -2033,7 +2029,7 @@ bool SearchWorker::AddNodeToComputation(Node* node) { moves = history_.Last().GetBoard().GenerateLegalMoves(); } - computation_->AddInput(hash, history_, moves); + computation_->AddInput(history_, moves); return false; } diff --git a/src/mcts/search.h b/src/mcts/search.h index 8f99fa9b56..85074c9d19 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -338,7 +338,6 @@ class SearchWorker { std::vector moves_to_visit; // Details that are filled in as we go. 
- uint64_t hash; CachedNNRequest entry; MoveList moves; PositionHistory history; diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 3601c04e8e..fd661e8fef 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -38,11 +38,13 @@ namespace lczero { CachingComputation::CachingComputation( std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, - lczero::FillEmptyHistory history_fill, float softmax_temp, NNCache* cache) + lczero::FillEmptyHistory history_fill, float softmax_temp, + int history_length, NNCache* cache) : parent_(std::move(parent)), input_format_(input_format), history_fill_(history_fill), softmax_temp_(softmax_temp), + history_length_(history_length), cache_(cache) {} int CachingComputation::GetCacheMisses() const { @@ -58,9 +60,11 @@ void CachingComputation::PopCacheHit() { batch_.pop_back(); } -bool CachingComputation::CacheLookup(uint64_t hash, const MoveList& moves, +bool CachingComputation::CacheLookup(const PositionHistory& history, + const MoveList& moves, CachedNNRequest* entry) { - NNCacheLock lock = NNCacheLock(cache_, hash); + const auto hash = history.HashLast(history_length_); + NNCacheLock lock(cache_, hash); if (!lock) return false; if (entry != nullptr) { if (moves.size() != lock->p.size()) return false; @@ -74,8 +78,9 @@ bool CachingComputation::CacheLookup(uint64_t hash, const MoveList& moves, return true; } -void CachingComputation::AddInput(uint64_t hash, const PositionHistory& history, +void CachingComputation::AddInput(const PositionHistory& history, const MoveList& moves) { + const auto hash = history.HashLast(history_length_); NNCacheLock lock(cache_, hash); if (lock && moves.size() == lock->p.size()) { batch_.emplace_back(); diff --git a/src/neural/cache.h b/src/neural/cache.h index dee32bc82c..2d25a936a5 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -53,7 +53,7 @@ class CachingComputation { CachingComputation(std::unique_ptr parent, pblczero::NetworkFormat::InputFormat input_format, lczero::FillEmptyHistory history_fill, float softmax_temp, - NNCache* cache); + int history_length, NNCache* cache); // How many inputs are not found in cache and will be forwarded to a wrapped // computation. @@ -61,12 +61,11 @@ class CachingComputation { // Total number of times AddInput/AddInputByHash were (successfully) called. int GetBatchSize() const; // Check if entry is in the cache. - bool CacheLookup(uint64_t hash, const MoveList& moves = {}, + bool CacheLookup(const PositionHistory& history, const MoveList& moves = {}, CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. // @hash is a hash to store/lookup it in the cache. - void AddInput(uint64_t hash, const PositionHistory& history, - const MoveList& moves); + void AddInput(const PositionHistory& history, const MoveList& moves); // Undos last AddInput. If it was a cache miss, the it's actually not removed // from parent's batch. 
void PopLastInputHit(); @@ -100,6 +99,7 @@ class CachingComputation { pblczero::NetworkFormat::InputFormat input_format_; lczero::FillEmptyHistory history_fill_; float softmax_temp_; + int history_length_; NNCache* cache_; std::vector batch_; }; From c2449bf61dcdb97bcaf15ccd10efa08a78cf93e1 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 9 Jun 2024 15:09:33 +0300 Subject: [PATCH 09/12] do not add cache hits to batch --- src/mcts/search.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 0260944a24..6fdab6eabf 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -1420,7 +1420,7 @@ void SearchWorker::GatherMinibatch() { // If there was no OOO, there can stil be collisions. // There are no OOO though. // Also terminals when OOO is disabled. - if (!minibatch_[i].nn_queried) continue; + if (!minibatch_[i].nn_queried || minibatch_[i].is_cache_hit) continue; computation_->AddInput(minibatch_[i].history, minibatch_[i].moves); } @@ -1466,6 +1466,8 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, // Node was never visited, extend it. // Initialize position sequence with pre-move position. history.Trim(search_->played_history_.GetLength()); + history.Reserve(search_->played_history_.GetLength() + + picked_node.moves_to_visit.size()); for (size_t i = 0; i < picked_node.moves_to_visit.size(); i++) { history.Append(picked_node.moves_to_visit[i]); } @@ -1477,7 +1479,7 @@ void SearchWorker::ProcessPickedTask(int start_idx, int end_idx, picked_node.nn_queried = true; picked_node.is_cache_hit = computation_->CacheLookup( history, picked_node.moves, &picked_node.entry); - picked_node.history = history; + if (!picked_node.is_cache_hit) picked_node.history = history; } } if (params_.GetOutOfOrderEval() && picked_node.CanEvalOutOfOrder()) { @@ -2161,6 +2163,10 @@ void SearchWorker::FetchMinibatchResults() { // Populate NN/cached results, or terminal results, into nodes. int idx_in_computation = 0; for (auto& node_to_process : minibatch_) { + if (node_to_process.is_cache_hit) { + FetchSingleNodeResult(&node_to_process, node_to_process, 0); + continue; + } FetchSingleNodeResult(&node_to_process, *computation_, idx_in_computation); if (node_to_process.nn_queried) ++idx_in_computation; } From 2880ba141d0a96f6477ae5cf5ac398c914aa0ce4 Mon Sep 17 00:00:00 2001 From: borg323 Date: Thu, 31 Oct 2024 17:17:36 +0200 Subject: [PATCH 10/12] make use of pfloat16 as a class --- src/mcts/node.h | 8 +++---- src/mcts/search.cc | 2 +- src/mcts/search.h | 3 ++- src/neural/cache.cc | 17 ++++++++------ src/neural/cache.h | 5 +++-- src/trainingdata/trainingdata.cc | 2 +- src/utils/pfloat16.h | 38 +++++++++++++++++++------------- 7 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/mcts/node.h b/src/mcts/node.h index 60461e69a5..4cb31f9097 100644 --- a/src/mcts/node.h +++ b/src/mcts/node.h @@ -92,9 +92,9 @@ class Edge { // Returns or sets value of Move policy prior returned from the neural net // (but can be changed by adding Dirichlet noise). Must be in [0,1]. - float GetP() const { return Pfloat16ToFloat(p_); } - void SetP(float val) { p_ = FloatToPfloat16(val); } - void SetPCompressed(uint16_t p) { p_ = p; } + float GetP() const { return p_; } + void SetP(float val) { p_ = val; } + void SetP(pfloat16 p) { p_ = p; } // Debug information about the edge. 
std::string DebugString() const; @@ -107,7 +107,7 @@ class Edge { // Probability that this move will be made, from the policy head of the neural // network; compressed to a 16 bit format (5 bits exp, 11 bits significand). - uint16_t p_ = 0; + pfloat16 p_; friend class Node; }; diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 5fb8e876cf..da9e39b58a 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -2210,7 +2210,7 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process, // softmax. int idx = 0; for (auto& edge : node->Edges()) { - edge.edge()->SetPCompressed(computation.GetPVal(idx_in_computation, idx++)); + edge.edge()->SetP(computation.GetPVal(idx_in_computation, idx++)); } // Add Dirichlet noise if enabled and at root. if (params_.GetNoiseEpsilon() && node == search_->root_node_) { diff --git a/src/mcts/search.h b/src/mcts/search.h index 85074c9d19..98ec6fe5eb 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -44,6 +44,7 @@ #include "syzygy/syzygy.h" #include "utils/logging.h" #include "utils/mutex.h" +#include "utils/pfloat16.h" namespace lczero { @@ -365,7 +366,7 @@ class SearchWorker { float GetMVal(int) const { return entry.m; } - uint16_t GetPVal(int, int move_ct) const { return entry.p[move_ct]; } + pfloat16 GetPVal(int, int move_ct) const { return entry.p[move_ct]; } private: NodeToProcess(Node* node, uint16_t depth, bool is_collision, int multivisit, diff --git a/src/neural/cache.cc b/src/neural/cache.cc index fd661e8fef..2194a45ffa 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -114,6 +114,10 @@ void CachingComputation::ComputeBlocking() { if (parent_->GetBatchSize() == 0) return; parent_->ComputeBlocking(); + // Intermediate array to store values when processing policy. + // There are never more than 256 valid legal moves in any legal position. + std::array intermediate; + // Fill cache with data from NN. for (auto& item : batch_) { if (item.idx_in_parent == -1) continue; @@ -125,9 +129,6 @@ void CachingComputation::ComputeBlocking() { // Calculate maximum first. float max_p = -std::numeric_limits::infinity(); - // Intermediate array to store values when processing policy. - // There are never more than 256 valid legal moves in any legal position. - std::array intermediate; int counter = 0; for (auto x : item.probabilities_to_cache) { float p = parent_->GetPVal(item.idx_in_parent, x); @@ -145,9 +146,9 @@ void CachingComputation::ComputeBlocking() { // Normalize P values to add up to 1.0. const float scale = total > 0.0f ? 
1.0f / total : 1.0f; for (size_t ct = 0; ct < item.probabilities_to_cache.size(); ct++) { - uint16_t p = FloatToPfloat16(intermediate[ct] * scale); + pfloat16 p = intermediate[ct] * scale; req->p[ct] = p; - item.probabilities_to_cache[ct] = p; + std::memcpy(&item.probabilities_to_cache[ct], &p, sizeof(pfloat16)); } cache_->Insert(item.hash, std::move(req)); } @@ -171,10 +172,12 @@ float CachingComputation::GetMVal(int sample) const { return item.lock->m; } -uint16_t CachingComputation::GetPVal(int sample, int move_ct) const { +pfloat16 CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; if (item.idx_in_parent >= 0) { - return item.probabilities_to_cache[move_ct]; + pfloat16 r; + std::memcpy(&r, &item.probabilities_to_cache[move_ct], sizeof(pfloat16)); + return r; } return item.lock->p[move_ct]; } diff --git a/src/neural/cache.h b/src/neural/cache.h index 2d25a936a5..2ab93e4e3b 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -29,6 +29,7 @@ #include "mcts/node.h" #include "neural/network.h" #include "utils/cache.h" +#include "utils/pfloat16.h" #include "utils/smallarray.h" namespace lczero { @@ -39,7 +40,7 @@ struct CachedNNRequest { float d; float m; // Store p only for valid moves. - std::vector<uint16_t> p; + std::vector<pfloat16> p; }; typedef HashKeyedCache<CachedNNRequest> NNCache; @@ -78,7 +79,7 @@ class CachingComputation { // Returns estimated remaining moves. float GetMVal(int sample) const; // Returns compressed P value @move_id of @sample. - uint16_t GetPVal(int sample, int move_ct) const; + pfloat16 GetPVal(int sample, int move_ct) const; // Pops last input from the computation. Only allowed for inputs which were // cached. void PopCacheHit(); diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc index 0597f8bc18..a474ff0205 100644 --- a/src/trainingdata/trainingdata.cc +++ b/src/trainingdata/trainingdata.cc @@ -156,7 +156,7 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, float p = 0; for (size_t i = 0; i < legal_moves.size(); i++) { if (move == legal_moves[i]) { - p = Pfloat16ToFloat(nneval->p[i]); + p = nneval->p[i]; break; } } diff --git a/src/utils/pfloat16.h b/src/utils/pfloat16.h index 0554377eca..e35c652d79 100644 --- a/src/utils/pfloat16.h +++ b/src/utils/pfloat16.h @@ -59,21 +59,29 @@ namespace lczero { // subtracting the two bits from the input and checking for a negative result // (the subtraction works despite crossing from exponent to significand). This // is combined with the round-to-nearest addition (1<<11) into one op. -static inline uint16_t FloatToPfloat16(const float &p) { - assert(0.0f <= p && p <= 1.0f); - constexpr int32_t roundings = (1 << 11) - (3 << 28); - int32_t tmp; - std::memcpy(&tmp, &p, sizeof(float)); - tmp += roundings; - return (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12); -} -static inline float Pfloat16ToFloat(const uint16_t &p) { - // Reshift into place and set the assumed-set exponent bits. - uint32_t tmp = (static_cast<uint32_t>(p) << 12) | (3 << 28); - float ret; - std::memcpy(&ret, &tmp, sizeof(uint32_t)); - return ret; -} +class pfloat16 { + public: + pfloat16() { value = 0; } + pfloat16(const float &p) { + assert(0.0f <= p && p <= 1.0f); + constexpr int32_t roundings = (1 << 11) - (3 << 28); + int32_t tmp; + std::memcpy(&tmp, &p, sizeof(float)); + tmp += roundings; + value = (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12); + } + + operator float() const { + // Reshift into place and set the assumed-set exponent bits.
+ uint32_t tmp = (static_cast<uint32_t>(value) << 12) | (3 << 28); + float ret; + std::memcpy(&ret, &tmp, sizeof(uint32_t)); + return ret; + } + + private: + uint16_t value = 0; +}; } // namespace lczero From efcc88f79fa0797a81f39d4d4fec239cbbc0f2fb Mon Sep 17 00:00:00 2001 From: borg323 Date: Mon, 16 Dec 2024 15:09:55 +0200 Subject: [PATCH 11/12] warning fix --- src/neural/cache.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neural/cache.cc b/src/neural/cache.cc index 2194a45ffa..a89fb9b2bf 100644 --- a/src/neural/cache.cc +++ b/src/neural/cache.cc @@ -176,7 +176,8 @@ pfloat16 CachingComputation::GetPVal(int sample, int move_ct) const { auto& item = batch_[sample]; if (item.idx_in_parent >= 0) { pfloat16 r; - std::memcpy(&r, &item.probabilities_to_cache[move_ct], sizeof(pfloat16)); + std::memcpy(&r, (pfloat16*)&item.probabilities_to_cache[move_ct], + sizeof(pfloat16)); return r; } return item.lock->p[move_ct]; From ecb7ad88deafbabb2e0c6eb7bcd1966c1956672a Mon Sep 17 00:00:00 2001 From: borg323 Date: Mon, 16 Dec 2024 18:25:26 +0200 Subject: [PATCH 12/12] fixes --- src/neural/cache.h | 3 +-- src/utils/pfloat16.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/neural/cache.h b/src/neural/cache.h index 2ab93e4e3b..72565af12f 100644 --- a/src/neural/cache.h +++ b/src/neural/cache.h @@ -59,13 +59,12 @@ class CachingComputation { // How many inputs are not found in cache and will be forwarded to a wrapped // computation. int GetCacheMisses() const; - // Total number of times AddInput/AddInputByHash were (successfully) called. + // Total number of times AddInput was (successfully) called. int GetBatchSize() const; // Check if entry is in the cache. bool CacheLookup(const PositionHistory& history, const MoveList& moves = {}, CachedNNRequest* entry = nullptr); // Adds a sample to the batch. Also calls EncodePositionForNN() if needed. - // @hash is a hash to store/lookup it in the cache. void AddInput(const PositionHistory& history, const MoveList& moves); // Undoes last AddInput. If it was a cache miss, then it's actually not removed // from parent's batch. diff --git a/src/utils/pfloat16.h b/src/utils/pfloat16.h index e35c652d79..bdbe7c8fd6 100644 --- a/src/utils/pfloat16.h +++ b/src/utils/pfloat16.h @@ -73,6 +73,8 @@ class pfloat16 { value = (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12); } + pfloat16(const pfloat16 &) = default; + operator float() const { // Reshift into place and set the assumed-set exponent bits. uint32_t tmp = (static_cast<uint32_t>(value) << 12) | (3 << 28);
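
The round-trip behaviour of the 16-bit policy compression that the pfloat16 class wraps can be checked in isolation. The sketch below is illustration only and is not part of the patch series: it mirrors the same encode/decode arithmetic under the local names Encode and Decode (chosen here for clarity, they do not exist in lc0) and prints the error introduced by dropping the low 12 bits of the float representation.

// Standalone illustration of the pfloat16 round trip (not lc0 code).
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Compress a float in [0,1] to 16 bits, mirroring the pfloat16(float) constructor.
static uint16_t Encode(float p) {
  assert(0.0f <= p && p <= 1.0f);
  constexpr int32_t kRoundings = (1 << 11) - (3 << 28);
  int32_t tmp;
  std::memcpy(&tmp, &p, sizeof(float));
  tmp += kRoundings;
  return (tmp < 0) ? 0 : static_cast<uint16_t>(tmp >> 12);
}

// Expand the 16 bits back to a float, mirroring pfloat16::operator float().
static float Decode(uint16_t v) {
  uint32_t tmp = (static_cast<uint32_t>(v) << 12) | (3 << 28);
  float ret;
  std::memcpy(&ret, &tmp, sizeof(uint32_t));
  return ret;
}

int main() {
  const float samples[] = {0.0f, 1e-6f, 0.015625f, 1.0f / 3.0f, 0.5f, 1.0f};
  for (float p : samples) {
    const float q = Decode(Encode(p));
    // Prints the original value, the round-trip value, and the absolute error.
    std::printf("p=%.8f  round-trip=%.8f  abs.err=%.3e\n", p, q, q - p);
  }
  return 0;
}

For inputs that are not flushed to the smallest representable code, the relative round-trip error stays within about one part in four thousand (2^-12), i.e. roughly three significant decimal digits, which is the precision the 11 retained significand bits are meant to provide.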