Fix bug in HyperClockCache ApplyToEntries; cleanup (facebook#10768)
Summary:
We have seen some rare crash test failures in HyperClockCache, and the cause could well be a bug fixed in this change, in ClockHandleTable::ConstApplyToEntriesRange: it was not accounting for the fact that incrementing the acquire counter can be ineffective due to parallel updates. (When incrementing the acquire counter is ineffective, it is incorrect to then decrement it.)
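
For illustration, here is a minimal sketch of the corrected probe pattern (not the RocksDB implementation; the struct, constant values, and helper name below are hypothetical stand-ins for the ClockHandle fields used in the diff). The acquire increment is only undone when the pre-increment state carried the Shareable bit, i.e. when the increment actually took a reference:

```cpp
#include <atomic>
#include <cstdint>

// Hypothetical stand-in for ClockHandle: a single 64-bit meta word with an
// acquire counter in the low bits and the state field near the top. The
// shift and bit values here are assumptions, not the real constants.
struct HandleSketch {
  static constexpr int kStateShift = 60;
  static constexpr uint64_t kStateShareableBit = 0x2;
  static constexpr uint64_t kAcquireIncrement = 1;
  std::atomic<uint64_t> meta{0};
};

// Optimistically probe an entry without a lock. The essence of the fix:
// only undo the acquire increment when the old state had the Shareable bit,
// i.e. when the increment really took a reference. In other states a
// parallel writer owns the slot and may rewrite meta, so a blind fetch_sub
// could corrupt the counters of a brand-new entry.
template <typename Func>
void SafeProbe(HandleSketch& h, uint64_t check_state_mask, Func&& func) {
  uint64_t old_meta = h.meta.fetch_add(HandleSketch::kAcquireIncrement,
                                       std::memory_order_acquire);
  if ((old_meta >> HandleSketch::kStateShift) &
      HandleSketch::kStateShareableBit) {
    if ((old_meta >> HandleSketch::kStateShift) & check_state_mask) {
      func(h);  // entry is in a state the caller asked to visit
    }
    // We took a real reference; release it.
    h.meta.fetch_sub(HandleSketch::kAcquireIncrement,
                     std::memory_order_release);
  }
  // else: the increment was ineffective, so we must not decrement.
}
```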

This change includes some other minor clean-up in HyperClockCache, and adds a stats_dump_period_sec option, with a much lower period, to the crash test. The periodic stats dump should be the primary caller of ApplyToEntries, via cache entry stats collection.
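
As a rough sketch of how that option exercises this code path outside the stress tool (not part of this commit; the HyperClockCache setup, capacity and charge values, and the DB path are assumptions chosen for illustration), a low stats_dump_period_sec makes the periodic stats dump, and the cache entry stats collection behind it, run frequently:

```cpp
#include <rocksdb/cache.h>
#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

int main() {
  // Experimental HyperClockCache as the block cache. The capacity and
  // estimated entry charge are arbitrary example values.
  rocksdb::HyperClockCacheOptions cache_opts(
      /*capacity=*/1 << 30, /*estimated_entry_charge=*/8 << 10);
  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.block_cache = cache_opts.MakeSharedCache();

  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));
  // Dump stats every 10 seconds instead of the default 600, so that the
  // cache entry stats collection (which calls ApplyToEntries) runs often.
  options.stats_dump_period_sec = 10;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/hyper_clock_stats_example", &db);
  delete db;  // no-op if Open failed and db stayed null
  return s.ok() ? 0 : 1;
}
```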

Pull Request resolved: facebook#10768

Test Plan: haven't been able to reproduce the failure, but should be in a better state (bug fix and improved crash test)

Reviewed By: anand1976

Differential Revision: D40034747

Pulled By: anand1976

fbshipit-source-id: a06fcefe146e17ee35001984445cedcf3b63eb68
pdillinger authored and facebook-github-bot committed Oct 6, 2022
1 parent f461e06 commit b205c6d
Showing 6 changed files with 49 additions and 28 deletions.
1 change: 1 addition & 0 deletions HISTORY.md
@@ -11,6 +11,7 @@
* Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
* Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
* Fixed a memory safety bug in experimental HyperClockCache (#10768)

### Performance Improvements
* Try to align the compaction output file boundaries to the next level ones, which can reduce compaction load by more than 10% for the default level compaction. The feature is enabled by default; to disable it, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.
67 changes: 39 additions & 28 deletions cache/clock_cache.cc
@@ -23,6 +23,12 @@ namespace ROCKSDB_NAMESPACE {

namespace hyper_clock_cache {

inline uint64_t GetRefcount(uint64_t meta) {
return ((meta >> ClockHandle::kAcquireCounterShift) -
(meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
}

static_assert(sizeof(ClockHandle) == 64U,
"Expecting size / alignment with common cache line size");

Expand All @@ -49,6 +55,7 @@ ClockHandleTable::~ClockHandleTable() {
break;
case ClockHandle::kStateInvisible: // rare but possible
case ClockHandle::kStateVisible:
assert(GetRefcount(h.meta) == 0);
h.FreeData();
#ifndef NDEBUG
Rollback(h.hash, &h);
@@ -562,10 +569,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful,
}
// Take ownership if no refs
do {
uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
if (refcount != 0) {
if (GetRefcount(old_meta) != 0) {
// Not last ref at some point in time during this Release call
// Correct for possible (but rare) overflow
CorrectNearOverflow(old_meta, h->meta);
@@ -622,6 +626,8 @@ void ClockHandleTable::Ref(ClockHandle& h) {

assert((old_meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit);
// Must have already had a reference
assert(GetRefcount(old_meta) > 0);
(void)old_meta;
}

@@ -671,10 +677,7 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
<< ClockHandle::kStateShift);
for (;;) {
uint64_t refcount =
((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
uint64_t refcount = GetRefcount(old_meta);
assert(refcount > 0);
if (refcount > 1) {
// Not last ref at some point in time during this Erase call
@@ -683,8 +686,10 @@ void ClockHandleTable::Erase(const CacheKeyBytes& key, uint32_t hash) {
std::memory_order_release);
break;
} else if (h->meta.compare_exchange_weak(
old_meta, uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift)) {
old_meta,
uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift,
std::memory_order_acq_rel)) {
// Took ownership
assert(hash == h->hash);
// TODO? Delay freeing?
@@ -740,20 +745,32 @@ void ClockHandleTable::ConstApplyToEntriesRange(
for (uint32_t i = index_begin; i < index_end; i++) {
ClockHandle& h = array_[i];

// Note: to avoid using compare_exchange, we have to be extra careful.
uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
// Check if it's an entry visible to lookups
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
// Increment acquire counter
// Increment acquire counter. Note: it's possible that the entry has
// completely changed since we loaded old_meta, but incrementing acquire
// count is always safe. (Similar to optimistic Lookup here.)
old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement,
std::memory_order_acquire);
// Double-check
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
func(h);
// Check whether we actually acquired a reference.
if ((old_meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit) {
// Apply func if appropriate
if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
func(h);
}
// Pretend we never took the reference
h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
std::memory_order_release);
// No net change, so don't need to check for overflow
} else {
// For other states, incrementing the acquire counter has no effect
// so we don't need to undo it. Furthermore, we cannot safely undo
// it because we did not acquire a read reference to lock the
// entry in a Shareable state.
}
// Pretend we never took the reference
h.meta.fetch_sub(ClockHandle::kAcquireIncrement,
std::memory_order_release);
// No net change, so don't need to check for overflow
}
}
}
@@ -763,12 +780,9 @@ void ClockHandleTable::EraseUnRefEntries() {
ClockHandle& h = array_[i];

uint64_t old_meta = h.meta.load(std::memory_order_relaxed);
uint64_t refcount = ((old_meta >> ClockHandle::kAcquireCounterShift) -
(old_meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
<< ClockHandle::kStateShift) &&
refcount == 0 &&
GetRefcount(old_meta) == 0 &&
h.meta.compare_exchange_strong(old_meta,
uint64_t{ClockHandle::kStateConstruction}
<< ClockHandle::kStateShift,
@@ -877,13 +891,12 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
// Only clock update entries with no outstanding refs
continue;
}
if (!(meta >> ClockHandle::kStateShift &
if (!((meta >> ClockHandle::kStateShift) &
ClockHandle::kStateShareableBit)) {
// Only clock update Shareable entries
continue;
}
// ModTableSize(old_clock_pointer + i));
if (meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible &&
if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
acquire_count > 0) {
// Decrement clock
uint64_t new_count = std::min(acquire_count - 1,
@@ -1101,9 +1114,7 @@ size_t ClockCacheShard::GetPinnedUsage() const {
table_.ConstApplyToEntriesRange(
[&table_pinned_usage, charge_metadata](const ClockHandle& h) {
uint64_t meta = h.meta.load(std::memory_order_relaxed);
uint64_t refcount = ((meta >> ClockHandle::kAcquireCounterShift) -
(meta >> ClockHandle::kReleaseCounterShift)) &
ClockHandle::kCounterMask;
uint64_t refcount = GetRefcount(meta);
// Holding one ref for ConstApplyToEntriesRange
assert(refcount > 0);
if (refcount > 1) {
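
An aside on the GetRefcount helper factored out above: the acquire and release counters are free-running fields packed into meta, and their masked difference recovers the number of outstanding references even after a counter wraps around its field. A standalone sketch, using assumed shift and mask values rather than the real ClockHandle constants:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Assumed layout (the real constants live in ClockHandle and may differ):
// low 30 bits = acquire counter, next 30 bits = release counter, top = state.
constexpr int kAcquireCounterShift = 0;
constexpr int kReleaseCounterShift = 30;
constexpr int kStateShift = 60;
constexpr uint64_t kCounterMask = (uint64_t{1} << 30) - 1;

// Same arithmetic as the GetRefcount helper in the diff above.
uint64_t GetRefcount(uint64_t meta) {
  return ((meta >> kAcquireCounterShift) - (meta >> kReleaseCounterShift)) &
         kCounterMask;
}

int main() {
  // An entry with (1 << 30) + 1 total acquires and (1 << 30) - 2 total
  // releases, i.e. 3 outstanding references. The acquire counter has wrapped
  // around its 30-bit field; the release counter has not.
  uint64_t acquire_field = ((uint64_t{1} << 30) + 1) & kCounterMask;  // == 1
  uint64_t release_field = ((uint64_t{1} << 30) - 2) & kCounterMask;
  uint64_t state = 3;  // arbitrary nonzero state bits
  uint64_t meta = (state << kStateShift) |
                  (release_field << kReleaseCounterShift) | acquire_field;

  // The masked subtraction of the free-running counters still yields 3.
  assert(GetRefcount(meta) == 3);
  std::printf("refcount = %llu\n",
              static_cast<unsigned long long>(GetRefcount(meta)));
  return 0;
}
```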
1 change: 1 addition & 0 deletions db_stress_tool/db_stress_common.h
@@ -174,6 +174,7 @@ DECLARE_bool(mock_direct_io);
DECLARE_bool(statistics);
DECLARE_bool(sync);
DECLARE_bool(use_fsync);
DECLARE_uint64(stats_dump_period_sec);
DECLARE_uint64(bytes_per_sync);
DECLARE_uint64(wal_bytes_per_sync);
DECLARE_int32(kill_random_test);
4 changes: 4 additions & 0 deletions db_stress_tool/db_stress_gflags.cc
@@ -1063,4 +1063,8 @@ DEFINE_bool(
"be preserved similarly under `FLAGS_expected_values_dir/unverified` when "
"`--expected_values_dir` is nonempty.");

DEFINE_uint64(stats_dump_period_sec,
ROCKSDB_NAMESPACE::Options().stats_dump_period_sec,
"Gap between printing stats to log in seconds");

#endif // GFLAGS
2 changes: 2 additions & 0 deletions db_stress_tool/db_stress_test_base.cc
@@ -3095,6 +3095,8 @@ void InitializeOptionsFromFlags(
options.experimental_mempurge_threshold =
FLAGS_experimental_mempurge_threshold;
options.periodic_compaction_seconds = FLAGS_periodic_compaction_seconds;
options.stats_dump_period_sec =
static_cast<unsigned int>(FLAGS_stats_dump_period_sec);
options.ttl = FLAGS_compaction_ttl;
options.enable_pipelined_write = FLAGS_enable_pipelined_write;
options.enable_write_thread_adaptive_yield =
2 changes: 2 additions & 0 deletions tools/db_crashtest.py
@@ -137,6 +137,8 @@
"index_block_restart_interval": lambda: random.choice(range(1, 16)),
"use_multiget": lambda: random.randint(0, 1),
"periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
# 0 = never (used by some), 10 = often (for threading bugs), 600 = default
"stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
"compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
# Test small max_manifest_file_size in a smaller chance, as most of the
# time we want manifest history to be preserved to help debug
