From 405de0ec3dbb9007014d118ef2a973f3f4925239 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Mon, 7 Oct 2024 19:22:27 -0700 Subject: [PATCH] Raftstore v2 (#389) Signed-off-by: Spade A Signed-off-by: Yang Zhang Signed-off-by: SpadeA-Tang Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- CMakeLists.txt | 2 + Makefile | 3 + TARGETS | 7 + db/c.cc | 18 +- db/c_test.c | 8 +- db/column_family.cc | 107 ++- db/column_family.h | 8 + db/compaction/compaction_iterator.cc | 15 +- db/compaction/compaction_iterator_test.cc | 34 + db/db_filesnapshot.cc | 6 +- db/db_flush_test.cc | 32 +- db/db_impl/compacted_db_impl.h | 5 +- db/db_impl/db_impl.cc | 48 +- db/db_impl/db_impl.h | 56 +- db/db_impl/db_impl_compaction_flush.cc | 62 +- db/db_impl/db_impl_debug.cc | 14 + db/db_impl/db_impl_merge.cc | 396 +++++++++++ db/db_impl/db_impl_open.cc | 43 ++ db/db_impl/db_impl_readonly.h | 5 +- db/db_impl/db_impl_secondary.h | 5 +- db/db_impl/db_impl_write.cc | 181 ++--- db/db_merge_test.cc | 647 ++++++++++++++++++ db/db_properties_test.cc | 8 +- db/db_test.cc | 4 +- db/db_test2.cc | 157 ++++- db/db_test_util.cc | 16 + db/db_test_util.h | 5 + db/db_write_buffer_manager_test.cc | 533 ++++++++++++--- db/db_write_test.cc | 97 +++ db/flush_job.cc | 11 +- db/flush_job.h | 5 +- db/memtable.cc | 75 +- db/memtable.h | 38 + db/memtable_list.cc | 17 + db/memtable_list.h | 3 + db/write_batch.cc | 6 + db/write_thread.cc | 2 + db/write_thread.h | 11 +- encryption/encryption.cc | 13 +- include/rocksdb/c.h | 8 +- include/rocksdb/compaction_filter.h | 26 +- include/rocksdb/db.h | 72 +- include/rocksdb/encryption.h | 10 + include/rocksdb/listener.h | 2 + include/rocksdb/options.h | 36 +- include/rocksdb/slice.h | 1 + include/rocksdb/utilities/stackable_db.h | 6 +- include/rocksdb/write_buffer_manager.h | 177 +++-- memtable/alloc_tracker.cc | 9 +- memtable/write_buffer_manager.cc | 166 ++++- memtable/write_buffer_manager_test.cc | 66 +- options/options_settable_test.cc | 2 + src.mk | 2 + test_util/testutil.h | 12 + utilities/blob_db/blob_db.h | 5 +- utilities/blob_db/blob_db_impl.cc | 5 +- utilities/blob_db/blob_db_impl.h | 3 +- utilities/checkpoint/checkpoint_impl.cc | 6 +- .../optimistic_transaction_db_impl.h | 7 +- utilities/ttl/db_ttl_impl.cc | 5 +- utilities/ttl/db_ttl_impl.h | 4 +- 61 files changed, 2808 insertions(+), 525 deletions(-) create mode 100644 db/db_impl/db_impl_merge.cc create mode 100644 db/db_merge_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b913d921a0b..d17b07306bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -684,6 +684,7 @@ set(SOURCES db/db_impl/db_impl_experimental.cc db/db_impl/db_impl_readonly.cc db/db_impl/db_impl_secondary.cc + db/db_impl/db_impl_merge.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc @@ -1327,6 +1328,7 @@ if(WITH_TESTS) db/db_memtable_test.cc db/db_merge_operator_test.cc db/db_merge_operand_test.cc + db/db_merge_test.cc db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc diff --git a/Makefile b/Makefile index 90a394cb0df..8f393988548 100644 --- a/Makefile +++ b/Makefile @@ -1511,6 +1511,9 @@ db_merge_operator_test: $(OBJ_DIR)/db/db_merge_operator_test.o $(TEST_LIBRARY) $ db_merge_operand_test: $(OBJ_DIR)/db/db_merge_operand_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_merge_test: $(OBJ_DIR)/db/db_merge_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_options_test: $(OBJ_DIR)/db/db_options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 9a314821518..6ca67ffebea 100644 --- a/TARGETS 
+++ b/TARGETS @@ -58,6 +58,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/db_impl/db_impl_debug.cc", "db/db_impl/db_impl_experimental.cc", "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_merge.cc", "db/db_impl/db_impl_open.cc", "db/db_impl/db_impl_readonly.cc", "db/db_impl/db_impl_secondary.cc", @@ -4862,6 +4863,12 @@ cpp_unittest_wrapper(name="db_merge_operator_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_merge_test", + srcs=["db/db_merge_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_options_test", srcs=["db/db_options_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/c.cc b/db/c.cc index f1597fe4750..4ddb4676095 100644 --- a/db/c.cc +++ b/db/c.cc @@ -4944,11 +4944,6 @@ bool rocksdb_write_buffer_manager_enabled(rocksdb_write_buffer_manager_t* wbm) { return wbm->rep->enabled(); } -bool rocksdb_write_buffer_manager_cost_to_cache( - rocksdb_write_buffer_manager_t* wbm) { - return wbm->rep->cost_to_cache(); -} - size_t rocksdb_write_buffer_manager_memory_usage( rocksdb_write_buffer_manager_t* wbm) { return wbm->rep->memory_usage(); @@ -4963,17 +4958,10 @@ size_t rocksdb_write_buffer_manager_dummy_entries_in_cache_usage( rocksdb_write_buffer_manager_t* wbm) { return wbm->rep->dummy_entries_in_cache_usage(); } -size_t rocksdb_write_buffer_manager_buffer_size( + +size_t rocksdb_write_buffer_manager_flush_size( rocksdb_write_buffer_manager_t* wbm) { - return wbm->rep->buffer_size(); -} -void rocksdb_write_buffer_manager_set_buffer_size( - rocksdb_write_buffer_manager_t* wbm, size_t new_size) { - wbm->rep->SetBufferSize(new_size); -} -ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( - rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall) { - wbm->rep->SetAllowStall(new_allow_stall); + return wbm->rep->flush_size(); } rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, diff --git a/db/c_test.c b/db/c_test.c index 66722049692..b9bee287f0f 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -3792,14 +3792,8 @@ int main(int argc, char** argv) { CheckCondition(true == rocksdb_write_buffer_manager_enabled(write_buffer_manager)); - CheckCondition(true == rocksdb_write_buffer_manager_cost_to_cache( - write_buffer_manager)); CheckCondition( - 200 == rocksdb_write_buffer_manager_buffer_size(write_buffer_manager)); - - rocksdb_write_buffer_manager_set_buffer_size(write_buffer_manager, 300); - CheckCondition( - 300 == rocksdb_write_buffer_manager_buffer_size(write_buffer_manager)); + 200 == rocksdb_write_buffer_manager_flush_size(write_buffer_manager)); rocksdb_write_buffer_manager_destroy(write_buffer_manager); rocksdb_cache_destroy(lru); diff --git a/db/column_family.cc b/db/column_family.cc index 3ac603da71d..bb9cb87796f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -64,6 +64,9 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { for (auto& listener : cfd_->ioptions()->listeners) { listener->OnColumnFamilyHandleDeletionStarted(this); } + if (cfd_->write_buffer_mgr()) { + cfd_->write_buffer_mgr()->UnregisterColumnFamily(this); + } // Job id == 0 means that this is not our background process, but rather // user thread // Need to hold some shared pointers owned by the initial_cf_options @@ -1246,6 +1249,105 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( return status; } +Status ColumnFamilyData::GetMemtablesUserKeyRange(PinnableSlice* smallest, + PinnableSlice* largest, + bool* found) { + assert(smallest && largest && found); + Status s; + auto* ucmp = 
user_comparator(); + Arena arena; + ReadOptions read_opts; + read_opts.total_order_seek = true; + MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); + merge_iter_builder.AddIterator(mem_->NewIterator(read_opts, &arena)); + imm_.current()->AddIterators(read_opts, &merge_iter_builder, false); + ScopedArenaIterator mem_iter(merge_iter_builder.Finish()); + mem_iter->SeekToFirst(); + if (mem_iter->Valid()) { + auto ukey = mem_iter->user_key(); + if (!(*found) || ucmp->Compare(ukey, *smallest) < 0) { + smallest->PinSelf(ukey); + } + mem_iter->SeekToLast(); + assert(mem_iter->Valid()); + ukey = mem_iter->user_key(); + if (!(*found) || ucmp->Compare(*largest, ukey) < 0) { + largest->PinSelf(ukey); + } + *found = true; + } + + if (s.ok()) { + autovector memtables{mem_}; + imm_.ExportMemtables(&memtables); + for (auto* mem : memtables) { + auto* iter = + mem->NewRangeTombstoneIterator(read_opts, kMaxSequenceNumber, false); + if (iter != nullptr) { + iter->SeekToFirst(); + if (iter->Valid()) { + // It's already a user key. + auto ukey = iter->start_key(); + if (!(*found) || ucmp->Compare(ukey, *smallest) < 0) { + smallest->PinSelf(ukey); + } + iter->SeekToLast(); + assert(iter->Valid()); + // Get the end_key of all tombstones. + ukey = iter->end_key(); + if (!(*found) || ucmp->Compare(*largest, ukey) < 0) { + largest->PinSelf(ukey); + } + *found = true; + } + } + } + } + + return s; +} + +Status ColumnFamilyData::GetUserKeyRange(PinnableSlice* smallest, + PinnableSlice* largest, bool* found) { + assert(smallest && largest && found); + if (ioptions_.compaction_style != CompactionStyle::kCompactionStyleLevel) { + return Status::NotSupported("Unexpected compaction style"); + } + Status s = GetMemtablesUserKeyRange(smallest, largest, found); + if (!s.ok()) { + return s; + } + + VersionStorageInfo& vsi = *current()->storage_info(); + auto* ucmp = user_comparator(); + for (const auto& f : vsi.LevelFiles(0)) { + Slice start = f->smallest.user_key(); + Slice end = f->largest.user_key(); + if (!(*found) || ucmp->Compare(start, *smallest) < 0) { + smallest->PinSelf(start); + } + if (!(*found) || ucmp->Compare(*largest, end) < 0) { + largest->PinSelf(end); + } + *found = true; + } + for (int level = 1; level < vsi.num_levels(); ++level) { + const auto& level_files = vsi.LevelFiles(level); + if (level_files.size() > 0) { + Slice start = level_files.front()->smallest.user_key(); + Slice end = level_files.back()->largest.user_key(); + if (!(*found) || ucmp->Compare(start, *smallest) < 0) { + smallest->PinSelf(start); + } + if (!(*found) || ucmp->Compare(*largest, end) < 0) { + largest->PinSelf(end); + } + *found = true; + } + } + return s; +} + const int ColumnFamilyData::kCompactAllLevels = -1; const int ColumnFamilyData::kCompactToBaseLevel = -2; @@ -1733,8 +1835,11 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, const ColumnFamilyOptions& options) { assert(column_families_.find(name) == column_families_.end()); + auto* write_buffer_manager = options.cf_write_buffer_manager != nullptr + ? 
options.cf_write_buffer_manager.get() + : write_buffer_manager_; ColumnFamilyData* new_cfd = new ColumnFamilyData( - id, name, dummy_versions, table_cache_, write_buffer_manager_, options, + id, name, dummy_versions, table_cache_, write_buffer_manager, options, *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_, db_id_, db_session_id_); column_families_.insert({name, id}); diff --git a/db/column_family.h b/db/column_family.h index c0b85fede03..7a0a75ace91 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -406,6 +406,14 @@ class ColumnFamilyData { SuperVersion* super_version, bool allow_data_in_errors, bool* overlap); + // Get user key range of memtables. Tombstones are counted. + Status GetMemtablesUserKeyRange(PinnableSlice* smallest, + PinnableSlice* largest, bool* found); + + // Get user key range of all data. Tombstones are counted. + Status GetUserKeyRange(PinnableSlice* smallest, PinnableSlice* largest, + bool* found); + // A flag to tell a manual compaction is to compact all levels together // instead of a specific level. static const int kCompactAllLevels; diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 85d1c039bd3..81e38be352b 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -229,17 +229,20 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && - ikey_.type != kTypeWideColumnEntity) { + ikey_.type != kTypeWideColumnEntity && ikey_.type != kTypeDeletion) { return true; } CompactionFilter::Decision decision = CompactionFilter::Decision::kUndetermined; - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : ikey_.type == kTypeBlobIndex - ? CompactionFilter::ValueType::kBlobIndex - : CompactionFilter::ValueType::kWideColumnEntity; + CompactionFilter::ValueType value_type = CompactionFilter::ValueType::kValue; + if (ikey_.type == kTypeBlobIndex) { + value_type = CompactionFilter::ValueType::kBlobIndex; + } else if (ikey_.type == kTypeWideColumnEntity) { + value_type = CompactionFilter::ValueType::kWideColumnEntity; + } else if (ikey_.type == kTypeDeletion) { + value_type = CompactionFilter::ValueType::kDeletion; + } // Hack: pass internal key to BlobIndexCompactionFilter since it needs // to get sequence number. 
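
As a rough illustration of the compaction_iterator.cc change above (which now forwards kTypeDeletion entries to the compaction filter as CompactionFilter::ValueType::kDeletion): the sketch below reuses the UnsafeFilter()/Decision::kRemoveAndSkipUntil hooks exactly as exercised by the RemoveAllSingleDeletes test added further down; the GcDeletionFilter name is purely hypothetical and the class is not part of this patch.

#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Drops point-deletion tombstones and skips any older versions of the same
// user key, mirroring the test filter introduced by this patch.
class GcDeletionFilter : public rocksdb::CompactionFilter {
 public:
  Decision UnsafeFilter(int /*level*/, const rocksdb::Slice& key,
                        ValueType type,
                        const rocksdb::Slice& /*existing_value*/,
                        std::string* /*new_value*/,
                        std::string* skip_until) const override {
    if (type == ValueType::kDeletion) {
      // Skip until just past this user key so the tombstone and all older
      // entries for the same key are dropped together.
      *skip_until = key.ToString();
      skip_until->back() += 1;
      return Decision::kRemoveAndSkipUntil;
    }
    return Decision::kKeep;
  }

  const char* Name() const override { return "GcDeletionFilter"; }
};
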
diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 699e629693d..7b4e8985024 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -719,6 +719,40 @@ TEST_P(CompactionIteratorTest, SingleMergeOperand) { ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); } +TEST_P(CompactionIteratorTest, RemoveAllSingleDeletes) { + struct Filter : public CompactionFilter { + Decision UnsafeFilter(int /*level*/, const Slice& key, ValueType t, + const Slice& /*existing_value*/, + std::string* /*new_value*/, + std::string* skip_until) const override { + if (t == ValueType::kDeletion) { + *skip_until = key.ToString(); + skip_until->back() += 1; + filtered += 1; + return Decision::kRemoveAndSkipUntil; + } + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleDelete::Filter"; + } + mutable size_t filtered = 0; + }; + + Filter filter; + InitIterators( + {test::KeyStr("a", 70, kTypeDeletion), test::KeyStr("a", 50, kTypeValue), + test::KeyStr("c", 70, kTypeDeletion), + test::KeyStr("c", 50, kTypeDeletion)}, + {"", "a", "", ""}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + nullptr, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(!c_iter_->Valid()); + ASSERT_EQ(filter.filtered, 2); +} + // In bottommost level, values earlier than earliest snapshot can be output // with sequence = 0. TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 40e7ac15548..988996ba96d 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // - #include #include #include @@ -29,7 +28,10 @@ namespace ROCKSDB_NAMESPACE { Status DBImpl::FlushForGetLiveFiles() { - return DBImpl::FlushAllColumnFamilies(FlushOptions(), + FlushOptions flush_options; + flush_options.allow_write_stall = true; + flush_options.check_if_compaction_disabled = true; + return DBImpl::FlushAllColumnFamilies(flush_options, FlushReason::kGetLiveFiles); } diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index b2c9f4e67c3..3b943cfd4fa 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -740,9 +740,11 @@ class TestFlushListener : public EventListener { DBFlushTest* test_; }; +// Disabled, because of +// https://github.com/tikv/rocksdb/pull/389/commits/cc433939ed937a82d0a0ccad1280d5907b048654 TEST_F( DBFlushTest, - FixUnrecoverableWriteDuringAtomicFlushWaitUntilFlushWouldNotStallWrites) { + DISABLED_FixUnrecoverableWriteDuringAtomicFlushWaitUntilFlushWouldNotStallWrites) { Options options = CurrentOptions(); options.atomic_flush = true; @@ -2012,6 +2014,13 @@ TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { } } + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& info) override { + ASSERT_LE(info.smallest_seqno, info.largest_seqno); + if (info.largest_seqno != seq1) { + ASSERT_EQ(info.largest_seqno, seq2); + } + } + void CheckFlushResultCommitted(DB* db, SequenceNumber seq) { DBImpl* db_impl = static_cast_with_check(db); InstrumentedMutex* mutex = db_impl->mutex(); @@ -3189,6 +3198,27 @@ TEST_P(DBAtomicFlushTest, NoWaitWhenWritesStopped) { SyncPoint::GetInstance()->DisableProcessing(); } +TEST_P(DBAtomicFlushTest, DisableManualCompaction) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + 
ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + dbfull()->DisableManualCompaction(); + FlushOptions flush_opts; + flush_opts.wait = true; + flush_opts.check_if_compaction_disabled = true; + ASSERT_TRUE(dbfull()->Flush(flush_opts, handles_).IsIncomplete()); + ASSERT_OK(Put(0, "key01", "value01")); + ASSERT_OK(db_->ContinueBackgroundWork()); + dbfull()->EnableManualCompaction(); + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + Close(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index e1c605e420b..cf8702895f7 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -77,8 +77,9 @@ class CompactedDBImpl : public DBImpl { const Slice& /*key*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + using DBImpl::Write; + virtual Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/, + PostWriteCallback* /*callback*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::CompactRange; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 297c6aceb76..417304e1f17 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -395,6 +395,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { FlushOptions flush_opts; // We allow flush to stall write since we are trying to resume from error. flush_opts.allow_write_stall = true; + flush_opts.check_if_compaction_disabled = true; s = FlushAllColumnFamilies(flush_opts, context.flush_reason); } if (!s.ok()) { @@ -491,7 +492,10 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { if (!shutting_down_.load(std::memory_order_acquire) && has_unpersisted_data_.load(std::memory_order_relaxed) && !mutable_db_options_.avoid_flush_during_shutdown) { - s = DBImpl::FlushAllColumnFamilies(FlushOptions(), FlushReason::kShutDown); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + flush_opts.check_if_compaction_disabled = true; + s = DBImpl::FlushAllColumnFamilies(flush_opts, FlushReason::kShutDown); s.PermitUncheckedError(); //**TODO: What to do on error? } @@ -655,6 +659,14 @@ Status DBImpl::CloseHelper() { delete txn_entry.second; } + mutex_.Unlock(); + // We can only access cf_based_write_buffer_manager_ before versions_.reset(), + // after which all cf write buffer managers will be freed. + for (auto m : cf_based_write_buffer_manager_) { + m->UnregisterDB(this); + } + mutex_.Lock(); + // versions need to be destroyed before table_cache since it can hold // references to table_cache. 
versions_.reset(); @@ -684,7 +696,10 @@ Status DBImpl::CloseHelper() { } if (write_buffer_manager_ && wbm_stall_) { - write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + write_buffer_manager_->RemoveFromStallQueue(wbm_stall_.get()); + } + if (write_buffer_manager_) { + write_buffer_manager_->UnregisterDB(this); } IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */); @@ -3647,6 +3662,22 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, if (s.ok()) { NewThreadStatusCfInfo( static_cast_with_check(*handle)->cfd()); + if (cf_options.cf_write_buffer_manager != nullptr) { + auto* write_buffer_manager = cf_options.cf_write_buffer_manager.get(); + bool exist = false; + for (auto m : cf_based_write_buffer_manager_) { + if (m == write_buffer_manager) { + exist = true; + } + } + if (!exist) { + return Status::NotSupported( + "New cf write buffer manager is not supported after Open"); + } + write_buffer_manager->RegisterColumnFamily(this, *handle); + } else if (write_buffer_manager_ != nullptr) { + write_buffer_manager_->RegisterColumnFamily(this, *handle); + } } return s; } @@ -4635,6 +4666,18 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, ReturnAndCleanupSuperVersion(cfd, sv); } +void DBImpl::GetApproximateActiveMemTableStats( + ColumnFamilyHandle* column_family, uint64_t* const memory_bytes, + uint64_t* const oldest_key_time) { + auto* cf_impl = static_cast(column_family); + if (memory_bytes) { + *memory_bytes = cf_impl->cfd()->mem()->ApproximateMemoryUsageFast(); + } + if (oldest_key_time) { + *oldest_key_time = cf_impl->cfd()->mem()->ApproximateOldestKeyTime(); + } +} + Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes) { @@ -5844,6 +5887,7 @@ Status DBImpl::IngestExternalFiles( if (status.ok() && at_least_one_cf_need_flush) { FlushOptions flush_opts; flush_opts.allow_write_stall = true; + flush_opts.check_if_compaction_disabled = true; if (immutable_db_options_.atomic_flush) { mutex_.Unlock(); status = AtomicFlushMemTables( diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index ed771324827..c75c8c33a77 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -82,6 +82,7 @@ class WriteCallback; struct JobContext; struct ExternalSstFileInfo; struct MemTableInfo; +class WriteBlocker; // Class to maintain directories for all database paths other than main one. 
class Directories { @@ -229,12 +230,13 @@ class DBImpl : public DB { const Slice& end_key, const Slice& ts) override; using DB::Write; - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override; + virtual Status Write(const WriteOptions& options, WriteBatch* updates, + PostWriteCallback* callback) override; using DB::MultiBatchWrite; virtual Status MultiBatchWrite(const WriteOptions& options, - std::vector&& updates) override; + std::vector&& updates, + PostWriteCallback* callback) override; using DB::Get; virtual Status Get(const ReadOptions& options, @@ -393,6 +395,12 @@ class DBImpl : public DB { const Range& range, uint64_t* const count, uint64_t* const size) override; + + using DB::GetApproximateActiveMemTableStats; + virtual void GetApproximateActiveMemTableStats( + ColumnFamilyHandle* column_family, uint64_t* const memory_bytes, + uint64_t* const oldest_key_time) override; + using DB::CompactRange; virtual Status CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, @@ -493,8 +501,7 @@ class DBImpl : public DB { virtual Status GetSortedWalFiles(VectorLogPtr& files) override; virtual Status GetCurrentWalFile( std::unique_ptr* current_log_file) override; - virtual Status GetCreationTimeOfOldestFile( - uint64_t* creation_time) override; + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override; virtual Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, @@ -616,7 +623,6 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override; - // ---- End of implementations of the DB interface ---- SystemClock* GetSystemClock() const; @@ -1061,6 +1067,15 @@ class DBImpl : public DB { std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); + // Validate `rhs` can be merged into this DB with given merge options. 
+ Status ValidateForMerge(const MergeInstanceOptions& merge_options, + DBImpl* rhs); + + Status CheckInRange(const Slice* begin, const Slice* end) override; + + Status MergeDisjointInstances(const MergeInstanceOptions& merge_options, + const std::vector& instances) override; + static IOStatus CreateAndNewDirectory( FileSystem* fs, const std::string& dirname, std::unique_ptr* directory); @@ -1196,6 +1211,7 @@ class DBImpl : public DB { SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const; const autovector& TEST_GetFilesToQuarantine() const; size_t TEST_EstimateInMemoryStatsHistorySize() const; + void TEST_ClearBackgroundJobs(); uint64_t TEST_GetCurrentLogNumber() const { InstrumentedMutexLock l(mutex()); @@ -1425,7 +1441,9 @@ class DBImpl : public DB { void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, FlushReason flush_reason); + int job_id, FlushReason flush_reason, + SequenceNumber earliest_seqno, + SequenceNumber largest_seqno); void NotifyOnFlushCompleted( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, @@ -1477,26 +1495,30 @@ class DBImpl : public DB { bool disable_memtable = false, uint64_t* seq_used = nullptr, size_t batch_cnt = 0, PreReleaseCallback* pre_release_callback = nullptr, - PostMemTableCallback* post_memtable_callback = nullptr); + PostMemTableCallback* post_memtable_callback = nullptr, + PostWriteCallback* post_callback = nullptr); Status MultiBatchWriteImpl(const WriteOptions& write_options, std::vector&& my_batch, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, - uint64_t* seq_used = nullptr); + uint64_t* seq_used = nullptr, + PostWriteCallback* post_callback = nullptr); void MultiBatchWriteCommit(CommitRequest* request); Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, bool disable_memtable = false, - uint64_t* seq_used = nullptr); + uint64_t* seq_used = nullptr, + PostWriteCallback* post_callback = nullptr); // Write only to memtables without joining any write queue Status UnorderedWriteMemtable(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t log_ref, SequenceNumber seq, - const size_t sub_batch_cnt); + const size_t sub_batch_cnt, + PostWriteCallback* post_callback = nullptr); // Whether the batch requires to be assigned with an order enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder }; @@ -1612,6 +1634,7 @@ class DBImpl : public DB { friend class WriteBatchWithIndex; friend class WriteUnpreparedTxnDB; friend class WriteUnpreparedTxn; + friend class WriteBlocker; friend class ForwardIterator; friend struct SuperVersion; @@ -1797,8 +1820,8 @@ class DBImpl : public DB { const InternalKey* begin = nullptr; // nullptr means beginning of key range const InternalKey* end = nullptr; // nullptr means end of key range InternalKey* manual_end = nullptr; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress // When the user provides a canceled pointer in CompactRangeOptions, the // above varaibe is the reference of the user-provided @@ -2056,9 +2079,6 @@ class DBImpl : public DB { // REQUIRES: mutex locked and in write 
thread. Status SwitchWAL(WriteContext* write_context); - // REQUIRES: mutex locked and in write thread. - Status HandleWriteBufferManagerFlush(WriteContext* write_context); - // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, LogContext* log_context, WriteContext* write_context); @@ -2578,6 +2598,10 @@ class DBImpl : public DB { Directories directories_; WriteBufferManager* write_buffer_manager_; + // For simplicity, CF based write buffer manager does not support stall the + // write. + // Note: It's only modifed in Open, so mutex is not needed. + autovector cf_based_write_buffer_manager_; WriteThread write_thread_; WriteBatch tmp_batch_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 71c23de95a5..c2bd7af0476 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -306,8 +306,10 @@ Status DBImpl::FlushMemTableToOutputFile( job_context->job_id, s.ToString().c_str()); } + SequenceNumber earliest_seqno = 0; + SequenceNumber largest_seqno = 0; if (s.ok()) { - flush_job.PickMemTable(); + flush_job.PickMemTable(&earliest_seqno, &largest_seqno); need_cancel = true; } TEST_SYNC_POINT_CALLBACK( @@ -315,7 +317,7 @@ Status DBImpl::FlushMemTableToOutputFile( // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id, - flush_reason); + flush_reason, earliest_seqno, largest_seqno); bool switched_to_mempurge = false; // Within flush_job.Run, rocksdb may call event listener to notify @@ -538,14 +540,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); - for (int i = 0; i != num_cfs; ++i) { - const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); - // may temporarily unlock and lock the mutex. - FlushReason flush_reason = bg_flush_args[i].flush_reason_; - NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, - job_context->job_id, flush_reason); - } - if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. @@ -598,13 +592,24 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( job_context->job_id, s.ToString().c_str()); } + std::vector earliest_seqnos(num_cfs, 0); + std::vector largest_seqnos(num_cfs, 0); if (s.ok()) { for (int i = 0; i != num_cfs; ++i) { - jobs[i]->PickMemTable(); + jobs[i]->PickMemTable(&earliest_seqnos[i], &largest_seqnos[i]); pick_status[i] = true; } } + for (int i = 0; i != num_cfs; ++i) { + const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); + // may temporarily unlock and lock the mutex. 
+ FlushReason flush_reason = bg_flush_args[i].flush_reason_; + NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, + job_context->job_id, flush_reason, earliest_seqnos[i], + largest_seqnos[i]); + } + if (s.ok()) { assert(switched_to_mempurge.size() == static_cast(num_cfs)); @@ -914,7 +919,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, FlushReason flush_reason) { + int job_id, FlushReason flush_reason, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -944,8 +951,10 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->fd.smallest_seqno; - info.largest_seqno = file_meta->fd.largest_seqno; + // This sequence number is actually smaller than or equal to the sequence + // number of any key that be inserted into the flushed memtable. + info.smallest_seqno = smallest_seqno; + info.largest_seqno = largest_seqno; info.flush_reason = flush_reason; for (auto listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); @@ -1126,6 +1135,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, if (s.ok() && flush_needed) { FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; + fo.check_if_compaction_disabled = true; if (immutable_db_options_.atomic_flush) { s = AtomicFlushMemTables(fo, FlushReason::kManualCompaction); } else { @@ -1951,6 +1961,7 @@ Status DBImpl::Flush(const FlushOptions& flush_options, ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", cfh->GetName().c_str()); Status s; + TEST_SYNC_POINT_CALLBACK("DBImpl::Flush:ScheduleFlushReq", column_family); if (immutable_db_options_.atomic_flush) { s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, {cfh->cfd()}); @@ -2264,12 +2275,28 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } } - const bool needs_to_join_write_thread = !entered_write_thread; + const bool needs_to_join_write_thread = + !entered_write_thread && !flush_options._write_stopped; + autovector flush_reqs; autovector memtable_ids_to_wait; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); + // Need to check inside lock to avoid [flush()] -> [disable] -> [schedule]. + if (flush_options.check_if_compaction_disabled && + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (flush_options.expected_oldest_key_time != 0 && + cfd->mem()->ApproximateOldestKeyTime() != + flush_options.expected_oldest_key_time) { + std::ostringstream oss; + oss << "Oldest key time doesn't match. expected=" + << flush_options.expected_oldest_key_time + << ", actual=" << cfd->mem()->ApproximateOldestKeyTime(); + return Status::Incomplete(oss.str()); + } WriteThread::Writer w; WriteThread::Writer nonmem_w; @@ -2444,6 +2471,11 @@ Status DBImpl::AtomicFlushMemTables( { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); + // Need to check inside lock to avoid [flush()] -> [disable] -> [schedule]. 
+ if (flush_options.check_if_compaction_disabled && + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } WriteThread::Writer w; WriteThread::Writer nonmem_w; diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 17050e4651f..ac566829fbc 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -318,5 +318,19 @@ size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { InstrumentedMutexLock l(&const_cast(this)->stats_history_mutex_); return EstimateInMemoryStatsHistorySize(); } + +void DBImpl::TEST_ClearBackgroundJobs() { + // Matching `CloseHelper()`. + while (!flush_queue_.empty()) { + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { + iter.first->UnrefAndTryDelete(); + } + } + while (!compaction_queue_.empty()) { + auto cfd = PopFirstFromCompactionQueue(); + cfd->UnrefAndTryDelete(); + } +} } // namespace ROCKSDB_NAMESPACE #endif // NDEBUG diff --git a/db/db_impl/db_impl_merge.cc b/db/db_impl/db_impl_merge.cc new file mode 100644 index 00000000000..e6e01136c99 --- /dev/null +++ b/db/db_impl/db_impl_merge.cc @@ -0,0 +1,396 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +/// A RAII-style helper used to block DB writes. 
+class WriteBlocker { + public: + WriteBlocker(DBImpl* db) : db_(db), writer_(new WriteThread::Writer()) { + db_->mutex_.Lock(); + db_->write_thread_.EnterUnbatched(writer_.get(), &db_->mutex_); + db_->WaitForPendingWrites(); + } + + ~WriteBlocker() { + db_->write_thread_.ExitUnbatched(writer_.get()); + db_->mutex_.Unlock(); + } + + private: + DBImpl* db_; + std::unique_ptr writer_; +}; + +Status DBImpl::ValidateForMerge(const MergeInstanceOptions& mopts, + DBImpl* rhs) { + if (rhs->two_write_queues_) { + return Status::NotSupported("two_write_queues == true"); + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto rhs_cfd = + rhs->versions_->GetColumnFamilySet()->GetColumnFamily(cfd->GetName()); + if (rhs_cfd != nullptr) { + if (strcmp(cfd->ioptions()->table_factory->Name(), + rhs_cfd->ioptions()->table_factory->Name()) != 0) { + return Status::InvalidArgument( + "table_factory must be of the same type"); + } + } + } + if (mopts.merge_memtable) { + if (rhs->total_log_size_ > 0) { + return Status::InvalidArgument("DB WAL is not empty"); + } + } + if (rhs->table_cache_ == table_cache_) { + return Status::InvalidArgument("table_cache must not be shared"); + } + return Status::OK(); +} + +Status DBImpl::CheckInRange(const Slice* begin, const Slice* end) { + Status s; + if (begin == nullptr && end == nullptr) { + return s; + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + assert(cfd != nullptr); + auto* comparator = cfd->user_comparator(); + PinnableSlice smallest, largest; + bool found = false; + s = cfd->GetUserKeyRange(&smallest, &largest, &found); + if (!s.ok()) { + return s; + } + if (!found) { + continue; + } + if (begin != nullptr && comparator->Compare(smallest, *begin) < 0) { + return Status::InvalidArgument("Has data smaller than left boundary"); + } else if (end != nullptr && comparator->Compare(largest, *end) >= 0) { + return Status::InvalidArgument("Has data larger than right boundary"); + } + } + return s; +} + +Status DBImpl::MergeDisjointInstances(const MergeInstanceOptions& merge_options, + const std::vector& instances) { + Status s; + autovector this_cfds; + for (auto cfd : *versions_->GetColumnFamilySet()) { + assert(cfd != nullptr); + if (!cfd->IsDropped()) { + this_cfds.push_back(cfd); + } + } + const size_t num_cfs = this_cfds.size(); + + // # Sanity checks + // Check target instance (`this`). + if (two_write_queues_) { + return Status::NotSupported("target instance two_write_queues == true"); + } + autovector db_impls; + autovector all_db_impls{this}; + // A list of source db super versions grouped by cf. nullptr if the cf is + // missing. + autovector> cf_db_super_versions; + std::shared_ptr _defer(nullptr, [&](...) { + for (auto& db_super_versions : cf_db_super_versions) { + for (auto* super_version : db_super_versions) { + if (super_version != nullptr && super_version->Unref()) { + super_version->Cleanup(); + } + } + } + }); + // Check source instances. + for (size_t i = 0; i < instances.size(); i++) { + auto* db_impl = static_cast(instances[i]); + s = ValidateForMerge(merge_options, db_impl); + if (s.ok()) { + db_impls.push_back(db_impl); + all_db_impls.push_back(db_impl); + } else { + return s; + } + } + + // Block all writes. 
+ autovector> write_blockers; + for (auto* db : all_db_impls) { + write_blockers.emplace_back(new WriteBlocker(db)); + } + + // # Internal key range check + assert(s.ok()); + for (auto* this_cfd : this_cfds) { + auto& name = this_cfd->GetName(); + auto* comparator = this_cfd->user_comparator(); + using CfRange = std::pair; + std::vector db_ranges; + auto process_cf = [&](ColumnFamilyData* cfd) { + assert(cfd && s.ok()); + PinnableSlice smallest, largest; + bool found = false; + s = cfd->GetUserKeyRange(&smallest, &largest, &found); + if (s.ok() && found) { + db_ranges.emplace_back( + std::make_pair(std::move(smallest), std::move(largest))); + } + }; + process_cf(this_cfd); + if (!s.ok()) { + return s; + } + for (auto* db : db_impls) { + auto cfd = db->versions_->GetColumnFamilySet()->GetColumnFamily(name); + if (cfd && !cfd->IsDropped()) { + process_cf(cfd); + if (!s.ok()) { + return s; + } + } + } + std::sort(db_ranges.begin(), db_ranges.end(), + [=](const CfRange& a, const CfRange& b) { + return comparator->Compare(a.first, b.first) < 0; + }); + Slice last_largest; + for (auto& range : db_ranges) { + if (last_largest.size() == 0 || + comparator->Compare(last_largest, range.first) < 0) { + last_largest = range.second; + } else { + return Status::InvalidArgument("Source DBs have overlapping range"); + } + } + } + + // # Handle transient states + // + // - Acquire snapshots of table files (`SuperVersion`). + // + // - Do memtable merge if needed. We do this together with acquiring + // snapshot + // to avoid the case where a memtable is flushed shortly after being + // merged, and the resulting L0 data is merged again as a table file. + assert(s.ok()); + autovector to_delete; // not used. + // Key-value freshness is determined by its sequence number. To avoid + // incoming writes being shadowed by history data from other instances, we + // must increment target instance's sequence number to be larger than all + // source data. See [A]. + uint64_t max_seq_number = 0; + // RocksDB's recovery is heavily dependent on the one-on-one mapping between + // memtable and WAL (even when WAL is empty). Each memtable keeps a record + // of `next_log_number` to mark its position within a series of WALs. This + // counter must be monotonic. We work around this issue by setting the + // counters of all involved memtables to the same maximum value. See [B]. + uint64_t max_log_number = 0; + for (auto* db : all_db_impls) { + max_seq_number = std::max(max_seq_number, db->versions_->LastSequence()); + max_log_number = std::max(max_log_number, db->logfile_number_); + } + // [A] Bump sequence number. + versions_->SetLastAllocatedSequence(max_seq_number); + versions_->SetLastSequence(max_seq_number); + cf_db_super_versions.resize(num_cfs); + for (size_t cf_i = 0; cf_i < num_cfs; cf_i++) { + cf_db_super_versions[cf_i].resize(db_impls.size()); + auto* this_cfd = this_cfds[cf_i]; + auto& cf_name = this_cfd->GetName(); + autovector mems; + for (size_t db_i = 0; db_i < db_impls.size(); db_i++) { + auto& db = db_impls[db_i]; + auto cfd = db->versions_->GetColumnFamilySet()->GetColumnFamily(cf_name); + if (cfd == nullptr || cfd->IsDropped()) { + cf_db_super_versions[cf_i][db_i] = nullptr; + continue; + } + + if (merge_options.merge_memtable) { + if (!cfd->mem()->IsEmpty()) { + WriteContext write_context; + assert(log_empty_); + s = SwitchMemtable(cfd, &write_context); + if (!s.ok()) { + return s; + } + } + assert(cfd->mem()->IsEmpty()); + + // [B] Bump log number for active memtable. 
Even though it's not + // shared, it must still be larger than other shared immutable + // memtables. + cfd->mem()->SetNextLogNumber(max_log_number); + cfd->imm()->ExportMemtables(&mems); + } + + // Acquire super version. + cf_db_super_versions[cf_i][db_i] = cfd->GetSuperVersion()->Ref(); + } + for (auto mem : mems) { + assert(mem != nullptr); + mem->Ref(); + // [B] Bump log number for shared memtables. + mem->SetNextLogNumber(max_log_number); + this_cfd->imm()->Add(mem, &to_delete); + } + this_cfd->mem()->SetNextLogNumber(max_log_number); + } + for (size_t i = 0; i < all_db_impls.size(); i++) { + auto* db = all_db_impls[i]; + bool check_log_number = (i == 0 || merge_options.allow_source_write) && + merge_options.merge_memtable; + if (check_log_number && max_log_number != db->logfile_number_) { + assert(max_log_number > db->logfile_number_); + // [B] Create a new WAL so that future memtable will use the correct log + // number as well. + log::Writer* new_log = nullptr; + s = db->CreateWAL(max_log_number, 0 /*recycle_log_number*/, + 0 /*preallocate_block_size*/, &new_log); + if (!s.ok()) { + return s; + } + db->logfile_number_ = max_log_number; + assert(new_log != nullptr); + db->logs_.emplace_back(max_log_number, new_log); + auto current = db->versions_->current_next_file_number(); + if (current <= max_log_number) { + db->versions_->FetchAddFileNumber(max_log_number - current + 1); + } + } + } + + // Unblock writes. + write_blockers.clear(); + + TEST_SYNC_POINT("DBImpl::MergeDisjointInstances:AfterMergeMemtable:1"); + + // # Merge table files + assert(s.ok()); + autovector cf_edits; + cf_edits.resize(num_cfs); + for (size_t cf_i = 0; cf_i < num_cfs; cf_i++) { + auto* this_cfd = this_cfds[cf_i]; + auto& edit = cf_edits[cf_i]; + edit.SetColumnFamily(this_cfd->GetID()); + for (size_t db_i = 0; db_i < db_impls.size(); db_i++) { + auto* super_version = cf_db_super_versions[cf_i][db_i]; + if (super_version == nullptr) { + continue; + } + VersionStorageInfo& vsi = *super_version->current->storage_info(); + auto& cf_paths = super_version->cfd->ioptions()->cf_paths; + auto SourcePath = [&](size_t path_id) { + // Matching `TableFileName()`. 
+ if (path_id >= cf_paths.size()) { + assert(false); + return cf_paths.back().path; + } else { + return cf_paths[path_id].path; + } + }; + const auto& target_path = this_cfd->ioptions()->cf_paths.front().path; + const uint64_t target_path_id = 0; + for (int level = 0; level < vsi.num_levels(); ++level) { + for (const auto& f : vsi.LevelFiles(level)) { + assert(f != nullptr); + const uint64_t source_file_number = f->fd.GetNumber(); + const uint64_t target_file_number = versions_->FetchAddFileNumber(1); + std::string src = MakeTableFileName(SourcePath(f->fd.GetPathId()), + source_file_number); + std::string target = + MakeTableFileName(target_path, target_file_number); + s = GetEnv()->LinkFile(src, target); + if (!s.ok()) { + return s; + } + edit.AddFile(level, target_file_number, target_path_id, + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size, f->tail_size, + f->user_defined_timestamps_persisted); + } + } + vsi.RecoverEpochNumbers(this_cfd); + } + } + + // # Apply version edits + assert(s.ok()); + { + autovector> edit_ptrs; + autovector cf_mopts; + for (size_t i = 0; i < num_cfs; i++) { + edit_ptrs.push_back({&cf_edits[i]}); + cf_mopts.push_back(this_cfds[i]->GetLatestMutableCFOptions()); + } + + auto old_capacity = table_cache_->GetCapacity(); + if (merge_options.max_preload_files >= 0) { + // Refer to `LoadTableHandlers` for calculation details. + // This trick will be wrong if table_cache is shared. + table_cache_->SetCapacity( + (table_cache_->GetUsage() + merge_options.max_preload_files) * 4); + } + + InstrumentedMutexLock lock(&mutex_); + s = versions_->LogAndApply(this_cfds, cf_mopts, ReadOptions(), edit_ptrs, + &mutex_, directories_.GetDbDir(), false); + if (!s.ok()) { + return s; + } + for (size_t i = 0; i < num_cfs; i++) { + SuperVersionContext sv_context(/* create_superversion */ true); + InstallSuperVersionAndScheduleWork(this_cfds[i], &sv_context, + *cf_mopts[i]); + sv_context.Clean(); + } + + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(this_cfds); + } + for (auto cfd : this_cfds) { + cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req); + SchedulePendingFlush(flush_req); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(this_cfds, FlushReason::kWriteBufferFull, + &flush_req); + SchedulePendingFlush(flush_req); + } + for (auto cfd : this_cfds) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + + if (merge_options.max_preload_files >= 0) { + table_cache_->SetCapacity(old_capacity); + } + } + + assert(s.ok()); + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 086e014e581..d41cd5b3a70 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1969,6 +1969,22 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + for (auto cf : column_families) { + if (cf.options.cf_write_buffer_manager != nullptr) { + auto* write_buffer_manager = cf.options.cf_write_buffer_manager.get(); + bool already_exist 
= false; + for (auto m : impl->cf_based_write_buffer_manager_) { + if (m == write_buffer_manager) { + already_exist = true; + break; + } + } + if (!already_exist) { + impl->cf_based_write_buffer_manager_.push_back(write_buffer_manager); + } + } + } + if (!impl->immutable_db_options_.info_log) { s = impl->init_logger_creation_s_; delete impl; @@ -2265,6 +2281,33 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { s = impl->StartPeriodicTaskScheduler(); + if (impl->write_buffer_manager_) { + impl->write_buffer_manager_->UnregisterDB(impl); + } + for (auto m : impl->cf_based_write_buffer_manager_) { + m->UnregisterDB(impl); + } + + for (size_t i = 0; i < (*handles).size(); ++i) { + auto cf_opt = column_families[i].options; + + auto* cf = (*handles)[i]; + std::string cf_name = cf->GetName(); + auto* write_buffer_manager = cf_opt.cf_write_buffer_manager != nullptr + ? cf_opt.cf_write_buffer_manager.get() + : impl->write_buffer_manager_; + if (write_buffer_manager) { + if (cf->GetName() == kDefaultColumnFamilyName) { + write_buffer_manager->RegisterColumnFamily(impl, + impl->default_cf_handle_); + } else if (cf->GetName() == kPersistentStatsColumnFamilyName) { + write_buffer_manager->RegisterColumnFamily( + impl, impl->persist_stats_cf_handle_); + } else { + write_buffer_manager->RegisterColumnFamily(impl, cf); + } + } + } } if (s.ok()) { s = impl->RegisterRecordSeqnoTimeWorker(recovery_ctx.is_new_db_); diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 32bc8560706..93103d120a7 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -76,8 +76,9 @@ class DBImplReadOnly : public DBImpl { const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + using DBImpl::Write; + virtual Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/, + PostWriteCallback* /*callback*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 12a8bbdd707..00bea3a28c1 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -162,8 +162,9 @@ class DBImplSecondary : public DBImpl { return Status::NotSupported("Not supported operation in secondary mode."); } - Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + using DBImpl::Write; + Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/, + PostWriteCallback* /*callback*/) override { return Status::NotSupported("Not supported operation in secondary mode."); } diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 536c514a2ec..c74ec8dba93 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -147,7 +147,8 @@ void DBImpl::SetRecoverableStatePreReleaseCallback( recoverable_state_pre_release_callback_.reset(callback); } -Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch, + PostWriteCallback* callback) { Status s; if (write_options.protection_bytes_per_key > 0) { s = WriteBatchInternal::UpdateProtectionInfo( @@ -155,7 +156,10 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { } if (s.ok()) { s = WriteImpl(write_options, 
my_batch, /*callback=*/nullptr, - /*log_used=*/nullptr); + /*log_used=*/nullptr, /*log_ref=*/0, + /*disable_memtable=*/false, /*seq=*/nullptr, /*batch_cnt=*/0, + /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, callback); } return s; } @@ -188,9 +192,12 @@ void DBImpl::MultiBatchWriteCommit(CommitRequest* request) { } Status DBImpl::MultiBatchWrite(const WriteOptions& options, - std::vector&& updates) { + std::vector&& updates, + PostWriteCallback* callback) { if (immutable_db_options_.enable_multi_batch_write) { - return MultiBatchWriteImpl(options, std::move(updates), nullptr, nullptr); + return MultiBatchWriteImpl(options, std::move(updates), + /*callback=*/nullptr, /*log_used=*/nullptr, + /*log_ref=*/0, /*seq=*/nullptr, callback); } else { return Status::NotSupported(); } @@ -239,12 +246,15 @@ Status DBImpl::MultiBatchWrite(const WriteOptions& options, Status DBImpl::MultiBatchWriteImpl(const WriteOptions& write_options, std::vector&& my_batch, WriteCallback* callback, uint64_t* log_used, - uint64_t log_ref, uint64_t* seq_used) { + uint64_t log_ref, uint64_t* seq_used, + PostWriteCallback* post_callback) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.statistics.get(), DB_WRITE); WriteThread::Writer writer(write_options, std::move(my_batch), callback, - log_ref, false /*disable_memtable*/); + log_ref, false /*disable_memtable*/, + /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, post_callback); CommitRequest request(&writer); writer.request = &request; write_thread_.JoinBatchGroup(&writer); @@ -294,6 +304,8 @@ Status DBImpl::MultiBatchWriteImpl(const WriteOptions& write_options, next_sequence += count; total_count += count; memtable_write_cnt++; + } else if (w->post_callback) { + w->post_callback->Callback(w->sequence); } } total_byte_size = WriteBatchInternal::AppendedByteSize( @@ -406,7 +418,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, bool disable_memtable, uint64_t* seq_used, size_t batch_cnt, PreReleaseCallback* pre_release_callback, - PostMemTableCallback* post_memtable_callback) { + PostMemTableCallback* post_memtable_callback, + PostWriteCallback* post_callback) { assert(!seq_per_batch_ || batch_cnt != 0); assert(my_batch == nullptr || my_batch->Count() == 0 || write_options.protection_bytes_per_key == 0 || @@ -467,6 +480,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with concurrent prepares"); } + if (two_write_queues_ && post_callback) { + return Status::NotSupported( + "post write callback is not compatible with concurrent prepares"); + } + if (disable_memtable && post_callback) { + return Status::NotSupported( + "post write callback is not compatible with disabling memtable"); + } if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt return Status::NotSupported( @@ -529,8 +550,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (!disable_memtable) { TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"); - status = UnorderedWriteMemtable(write_options, my_batch, callback, - log_ref, seq, sub_batch_cnt); + status = + UnorderedWriteMemtable(write_options, my_batch, callback, log_ref, + seq, sub_batch_cnt, post_callback); } return status; } @@ -539,18 +561,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, std::vector 
updates(1); updates[0] = my_batch; return MultiBatchWriteImpl(write_options, std::move(updates), callback, - log_used, log_ref, seq_used); + log_used, log_ref, seq_used, post_callback); } if (immutable_db_options_.enable_pipelined_write) { return PipelinedWriteImpl(write_options, my_batch, callback, log_used, - log_ref, disable_memtable, seq_used); + log_ref, disable_memtable, seq_used, + post_callback); } PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, batch_cnt, pre_release_callback, - post_memtable_callback); + post_memtable_callback, post_callback); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread_.JoinBatchGroup(&w); @@ -908,7 +931,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, uint64_t log_ref, - bool disable_memtable, uint64_t* seq_used) { + bool disable_memtable, uint64_t* seq_used, + PostWriteCallback* post_callback) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); @@ -916,7 +940,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, /*_batch_cnt=*/0, - /*_pre_release_callback=*/nullptr); + /*_pre_release_callback=*/nullptr, + /*_post_memtable_callback=*/nullptr, post_callback); write_thread_.JoinBatchGroup(&w); TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"); if (w.state == WriteThread::STATE_GROUP_LEADER) { @@ -1086,12 +1111,15 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t log_ref, SequenceNumber seq, - const size_t sub_batch_cnt) { + const size_t sub_batch_cnt, + PostWriteCallback* post_callback) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - false /*disable_memtable*/); + false /*disable_memtable*/, 0, + /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, post_callback); if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { w.sequence = seq; @@ -1425,15 +1453,14 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, } } + // Ordering: before write delay. if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) { - // Before a new memtable is added in SwitchMemtable(), - // write_buffer_manager_->ShouldFlush() will keep returning true. If another - // thread is writing to another DB with the same write buffer, they may also - // be flushed. We may end up with flushing much more DBs than needed. It's - // suboptimal but still correct. 
- InstrumentedMutexLock l(&mutex_); - WaitForPendingWrites(); - status = HandleWriteBufferManagerFlush(write_context); + write_buffer_manager_->MaybeFlush(this); + } + for (auto write_buffer_manager : cf_based_write_buffer_manager_) { + if (UNLIKELY(status.ok() && write_buffer_manager->ShouldFlush())) { + write_buffer_manager->MaybeFlush(this); + } } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { @@ -1966,98 +1993,6 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { return status; } -Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { - mutex_.AssertHeld(); - assert(write_context != nullptr); - Status status; - - // Before a new memtable is added in SwitchMemtable(), - // write_buffer_manager_->ShouldFlush() will keep returning true. If another - // thread is writing to another DB with the same write buffer, they may also - // be flushed. We may end up with flushing much more DBs than needed. It's - // suboptimal but still correct. - // no need to refcount because drop is happening in write thread, so can't - // happen while we're in the write thread - autovector cfds; - if (immutable_db_options_.atomic_flush) { - SelectColumnFamiliesForAtomicFlush(&cfds); - } else { - ColumnFamilyData* cfd_picked = nullptr; - SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; - - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) { - // We only consider flush on CFs with bytes in the mutable memtable, - // and no immutable memtables for which flush has yet to finish. If - // we triggered flush on CFs already trying to flush, we would risk - // creating too many immutable memtables leading to write stalls. - uint64_t seq = cfd->mem()->GetCreationSeq(); - if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { - cfd_picked = cfd; - seq_num_for_cf_picked = seq; - } - } - } - if (cfd_picked != nullptr) { - cfds.push_back(cfd_picked); - } - MaybeFlushStatsCF(&cfds); - } - if (!cfds.empty()) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "Flushing triggered to alleviate write buffer memory usage. 
Write " - "buffer is using %" ROCKSDB_PRIszt - " bytes out of a total of %" ROCKSDB_PRIszt ".", - write_buffer_manager_->memory_usage(), - write_buffer_manager_->buffer_size()); - } - - WriteThread::Writer nonmem_w; - if (two_write_queues_) { - nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); - } - for (const auto cfd : cfds) { - if (cfd->mem()->IsEmpty()) { - continue; - } - cfd->Ref(); - status = SwitchMemtable(cfd, write_context); - cfd->UnrefAndTryDelete(); - if (!status.ok()) { - break; - } - } - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } - - if (status.ok()) { - if (immutable_db_options_.atomic_flush) { - AssignAtomicFlushSeq(cfds); - } - for (const auto cfd : cfds) { - cfd->imm()->FlushRequested(); - if (!immutable_db_options_.atomic_flush) { - FlushRequest flush_req; - GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager, - &flush_req); - SchedulePendingFlush(flush_req); - } - } - if (immutable_db_options_.atomic_flush) { - FlushRequest flush_req; - GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req); - SchedulePendingFlush(flush_req); - } - MaybeScheduleFlushOrCompaction(); - } - return status; -} - uint64_t DBImpl::GetMaxTotalWalSize() const { uint64_t max_total_wal_size = max_total_wal_size_.load(std::memory_order_acquire); @@ -2400,6 +2335,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { memtable_info.cf_name = cfd->GetName(); memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); + memtable_info.largest_seqno = cfd->mem()->GetLargestSequenceNumber(); memtable_info.num_entries = cfd->mem()->num_entries(); memtable_info.num_deletes = cfd->mem()->num_deletes(); // Log this later after lock release. It may be outdated, e.g., if background @@ -2583,10 +2519,15 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { if (immutable_db_options_.db_write_buffer_size > 0) { bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size); } - if (immutable_db_options_.write_buffer_manager && - immutable_db_options_.write_buffer_manager->enabled()) { - bsize = std::min( - bsize, immutable_db_options_.write_buffer_manager->buffer_size()); + if (immutable_db_options_.write_buffer_manager) { + size_t buffer_size = + immutable_db_options_.write_buffer_manager->flush_size(); + for (auto manager : cf_based_write_buffer_manager_) { + buffer_size += manager->flush_size(); + } + if (buffer_size > 0) { + bsize = std::min(bsize, buffer_size); + } } return bsize; diff --git a/db/db_merge_test.cc b/db/db_merge_test.cc new file mode 100644 index 00000000000..e55239bf139 --- /dev/null +++ b/db/db_merge_test.cc @@ -0,0 +1,647 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t default_cf = 0; +uint32_t operator"" _db(unsigned long long int i) { return uint32_t(i); } +uint32_t operator"" _cf(unsigned long long int i) { + assert(i > 0); + return uint32_t(i); +} + +class DBMergeTest : public testing::Test { + struct DBHandles { + std::string path; + DBImpl* db; + std::unordered_map cfs; + }; + + public: + DBMergeTest() { + options_.create_if_missing = true; + options_.write_buffer_manager.reset( + new WriteBufferManager(options_.db_write_buffer_size)); + // avoid stalling the tests. + options_.disable_write_stall = true; + options_.avoid_flush_during_shutdown = true; + // avoid background flush/compaction. + options_.level0_file_num_compaction_trigger = 10; + options_.level0_slowdown_writes_trigger = 10; + options_.level0_stop_writes_trigger = 10; + options_.max_write_buffer_number = 10; + } + + ~DBMergeTest() { DestroyAll(); } + + void IsOverlapError(Status s) { + ASSERT_EQ(s.ToString(), + "Invalid argument: Source DBs have overlapping range"); + } + + void IsWALNotEmpty(Status s) { + ASSERT_EQ(s.ToString(), "Invalid argument: DB WAL is not empty"); + } + + // 0 for default cf. + std::vector GenColumnFamilyDescriptors( + const std::vector& cf_ids) { + std::vector column_families; + for (auto cf_id : cf_ids) { + if (cf_id == 0) { + column_families.push_back( + ColumnFamilyDescriptor(ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, + ColumnFamilyOptions(options_))); + } else { + column_families.push_back(ColumnFamilyDescriptor( + std::to_string(cf_id), ColumnFamilyOptions(options_))); + } + } + return column_families; + } + + std::string GenDBPath(uint32_t db_id) { + return test::PerThreadDBPath(env_, std::to_string(db_id)); + } + + void AddDB(uint32_t db_id, DB* db, + std::vector cf_handles) { + assert(dbs_.count(db_id) == 0); + DBHandles db_handles; + db_handles.path = GenDBPath(db_id); + db_handles.db = static_cast(db); + for (auto* handle : cf_handles) { + uint32_t id = 0; + if (handle->GetName() != "default") { + id = uint32_t(stoul(handle->GetName())); + } + db_handles.cfs[id] = handle; + } + dbs_[db_id] = db_handles; + } + + void Open(uint32_t db_id, const std::vector& cf_ids, + bool reopen = false) { + if (dbs_.count(db_id) > 0) { + if (reopen) { + auto& db_handles = dbs_[db_id]; + auto* db = db_handles.db; + for (auto& cf : db_handles.cfs) { + ASSERT_OK(db->DestroyColumnFamilyHandle(cf.second)); + } + delete db; + dbs_.erase(db_id); + } else { + Destroy(db_id); + } + } + std::vector column_families = + GenColumnFamilyDescriptors(cf_ids); + auto path = GenDBPath(db_id); + DB* db = nullptr; + if (!reopen) { + ASSERT_OK(DB::Open(options_, path, &db)); + for (auto& cf : column_families) { + if (cf.name != "default") { + ColumnFamilyHandle* cf_handle; + ASSERT_OK(db->CreateColumnFamily(cf.options, cf.name, &cf_handle)); + ASSERT_OK(db->DestroyColumnFamilyHandle(cf_handle)); + } + } + delete db; + db = nullptr; + } + std::vector handles; + ASSERT_OK(DB::Open(options_, path, column_families, &handles, &db)); + AddDB(db_id, db, handles); + } + + void Destroy(uint32_t db_id) { + DestroyImpl(dbs_[db_id]); + dbs_.erase(db_id); + } + + void DestroyAll() { + for (auto& db_handles : dbs_) { + DestroyImpl(db_handles.second); + } + dbs_.clear(); + } + + void DestroyImpl(DBHandles& db_handles) { + auto* db = db_handles.db; + for (auto& cf : db_handles.cfs) { + ASSERT_OK(db->DestroyColumnFamilyHandle(cf.second)); + } + delete db; + 
ASSERT_OK(DestroyDB(db_handles.path, options_)); + } + + // cfs are ignored if target already exists + Status Merge(const MergeInstanceOptions& mopts, std::vector&& from, + uint32_t to, + const std::vector& cfs = std::vector()) { + std::vector source_dbs; + for (auto db_id : from) { + source_dbs.push_back(get_db(db_id)); + } + bool newly_opened = false; + if (dbs_.count(to) == 0) { + assert(cfs.size() > 0); + Open(to, cfs); + newly_opened = true; + } + auto s = get_db(to)->MergeDisjointInstances(mopts, source_dbs); + if (newly_opened && !s.ok()) { + Destroy(to); + } + return s; + } + + void VerifyKeyValue(uint32_t db_id, uint32_t cf_id, std::string key, + std::string value, + const ReadOptions& ropts = ReadOptions()) { + std::string ret; + if (value == "NotFound") { + assert(get_db(db_id) + ->Get(ropts, get_cf(db_id, cf_id), key, &ret) + .IsNotFound()); + } else { + ASSERT_OK(get_db(db_id)->Get(ropts, get_cf(db_id, cf_id), key, &ret)); + ASSERT_EQ(value, ret); + } + } + + int Property(uint32_t db_id, const std::string& name) { + std::string property; + int result; + if (get_db(db_id)->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + bool has_db(uint32_t db_id) { return dbs_.count(db_id) > 0; } + + DBImpl* get_db(uint32_t db_id) { + assert(dbs_.count(db_id) == 1); + return dbs_[db_id].db; + } + + ColumnFamilyHandle* get_cf(uint32_t db_id, uint32_t cf_id) { + assert(dbs_.count(db_id) == 1); + return dbs_[db_id].cfs[cf_id]; + } + + Env* env_ = Env::Default(); + Options options_; + std::unordered_map dbs_; +}; + +TEST_F(DBMergeTest, MultiMerge) { + FlushOptions fopts; + fopts.allow_write_stall = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + WriteOptions wopts; + wopts.disableWAL = true; + Random rnd(301); + + std::unordered_map kvs[3]; + for (uint32_t i = 0; i < 10; ++i) { + Open(i, {default_cf, 1_cf, 2_cf}); + auto* db = get_db(i); + uint32_t keys_per_file = 1 + (i - 5) * (i - 5); // scatter seqno. + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t f = 0; f < 20; ++f) { + std::string prefix = + std::to_string(cf) + std::to_string(i) + std::to_string(f); + for (uint32_t k = 0; k < keys_per_file; ++k) { + auto keystr = prefix + "-" + std::to_string(k); + ASSERT_OK(db->Put(wopts, get_cf(i, cf), keystr, keystr)); + kvs[cf][keystr] = keystr; + } + ASSERT_OK(db->Flush(fopts, get_cf(i, cf))); + if (f % 5 == 0) { + ASSERT_OK(db->CompactRange(CompactRangeOptions(), get_cf(i, cf), + nullptr, nullptr)); + } + } + } + } + + ASSERT_OK(Merge(mopts, + {0_db, 1_db, 2_db, 3_db, 4_db, 5_db, 6_db, 7_db, 8_db, 9_db}, + 10_db, {default_cf, 1_cf, 2_cf})); + ASSERT_OK(Merge(mopts, {0_db, 1_db, 2_db, 3_db, 4_db, 5_db, 6_db, 7_db, 8_db}, + 9_db)); + + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } + + // overwrite random to 9 and 10. 
+ for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t i = 0; i < 10; ++i) { + auto iter = kvs[cf].begin(); + std::advance(iter, rnd.Next() % kvs[cf].size()); + + ASSERT_OK( + get_db(9_db)->Put(wopts, get_cf(9_db, cf), iter->first, "new_v")); + ASSERT_OK( + get_db(10_db)->Put(wopts, get_cf(10_db, cf), iter->first, "new_v")); + iter->second = "new_v"; + } + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + ASSERT_OK(get_db(9_db)->Flush(fopts, get_cf(9_db, cf))); + ASSERT_OK(get_db(10_db)->Flush(fopts, get_cf(10_db, cf))); + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } + + // delete old instance. + for (auto db : {0_db, 1_db, 2_db, 3_db, 4_db, 5_db, 6_db, 7_db, 8_db}) { + Destroy(db); + } + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t i = 0; i < 10; ++i) { + auto iter = kvs[cf].begin(); + std::advance(iter, rnd.Next() % kvs[cf].size()); + + ASSERT_OK( + get_db(9_db)->Put(wopts, get_cf(9_db, cf), iter->first, "new_v2")); + ASSERT_OK( + get_db(10_db)->Put(wopts, get_cf(10_db, cf), iter->first, "new_v2")); + iter->second = "new_v2"; + } + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + ASSERT_OK(get_db(9_db)->Flush(fopts, get_cf(9_db, cf))); + ASSERT_OK(get_db(10_db)->Flush(fopts, get_cf(10_db, cf))); + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } + + Open(9_db, {default_cf, 1_cf, 2_cf}, true /*reopen*/); + Open(10_db, {default_cf, 1_cf, 2_cf}, true /*reopen*/); + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } +} + +TEST_F(DBMergeTest, BinaryMerge) { + FlushOptions fopts; + fopts.allow_write_stall = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + WriteOptions wopts; + wopts.disableWAL = true; + Random rnd(301); + + std::unordered_map kvs[3]; + std::vector dbs = {0_db, 1_db, 2_db, 3_db, 4_db, + 5_db, 6_db, 7_db, 8_db, 9_db}; + while (dbs.size() >= 2) { + for (uint32_t i = 0; i < dbs.size(); ++i) { + if (!has_db(dbs[i])) { + Open(dbs[i], {default_cf, 1_cf, 2_cf}); + } + auto* db = get_db(dbs[i]); + uint32_t keys_per_file = 1 + (i - 5) * (i - 5); // scatter seqno. + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t f = 0; f < 3; ++f) { + std::string prefix = + std::to_string(cf) + std::to_string(dbs[i]) + std::to_string(f); + for (uint32_t k = 0; k < keys_per_file; ++k) { + auto keystr = prefix + "-" + std::to_string(k); + if (rnd.Next() % 4 == 0) { + ASSERT_OK(db->SingleDelete(wopts, get_cf(dbs[i], cf), keystr)); + kvs[cf][keystr] = "NotFound"; + } else { + auto value = rnd.RandomString(16); + ASSERT_OK(db->Put(wopts, get_cf(dbs[i], cf), keystr, value)); + kvs[cf][keystr] = value; + } + } + ASSERT_OK(db->Flush(fopts, get_cf(dbs[i], cf))); + } + } + } + // merge random neighbors. 
+ uint32_t src = rnd.Next() % dbs.size(); + uint32_t dst = (src + 1) % dbs.size(); + if ((rnd.Next() % 2 == 0 && src > 0) || dst == 0) { + dst = (src - 1) % dbs.size(); + } + ASSERT_OK(Merge(mopts, {dbs[src]}, dbs[dst])); + Destroy(dbs[src]); + dbs.erase(dbs.begin() + src); + } + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(dbs[0], cf, kv.first, kv.second); + } + ASSERT_OK(get_db(dbs[0])->Flush(fopts, get_cf(dbs[0], cf))); + } + Open(dbs[0], {default_cf, 1_cf, 2_cf}, true /*reopen*/); + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(dbs[0], cf, kv.first, kv.second); + } + } +} + +TEST_F(DBMergeTest, KeyOverlappedInstance) { + FlushOptions fopts; + fopts.allow_write_stall = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = false; + WriteOptions wopts; + wopts.disableWAL = true; + CompactRangeOptions copts; + copts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {1_cf, default_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "0", "v0")); + + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + Destroy(3_db); + + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "3", "v3")); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + // Skip overlapped cf. + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf})); + Destroy(3_db); + + // Only flush one. + ASSERT_OK(get_db(2_db)->Flush(fopts, get_cf(2_db, 1_cf))); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + // Both flushed. + ASSERT_OK(get_db(1_db)->Flush(fopts, get_cf(1_db, 1_cf))); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + // Delete in memory. 
+ ASSERT_OK(get_db(1_db)->SingleDelete(wopts, get_cf(1_db, 1_cf), "1")); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + ASSERT_OK(get_db(1_db)->Flush(fopts, get_cf(1_db, 1_cf))); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + ASSERT_OK( + get_db(1_db)->CompactRange(copts, get_cf(1_db, 1_cf), nullptr, nullptr)); + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + + VerifyKeyValue(3_db, 1_cf, "0", "v0"); + VerifyKeyValue(3_db, 1_cf, "3", "v3"); + VerifyKeyValue(3_db, 1_cf, "1", "NotFound"); +} + +TEST_F(DBMergeTest, TombstoneOverlappedInstance) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = false; + CompactRangeOptions copts; + copts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {default_cf, 1_cf}); + Open(3_db, {default_cf, 1_cf}); + Open(4_db, {default_cf, 1_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "2", "v2")); + ASSERT_OK(get_db(3_db)->Put(wopts, get_cf(3_db, 1_cf), "3", "v3")); + ASSERT_OK(get_db(4_db)->Put(wopts, get_cf(4_db, 1_cf), "4", "v4")); + + ASSERT_OK(Merge(mopts, {1_db, 2_db, 3_db, 4_db}, 0_db, {default_cf, 1_cf})); + Destroy(0_db); + + // Lower bound overlap. + ASSERT_OK(get_db(2_db)->DeleteRange(wopts, get_cf(2_db, 1_cf), "0", "9")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "2", "v2")); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 0_db, {default_cf, 1_cf})); + + // Upper bound overlap. + ASSERT_OK(get_db(3_db)->DeleteRange(wopts, get_cf(3_db, 1_cf), "0", "9")); + ASSERT_OK(get_db(3_db)->Put(wopts, get_cf(3_db, 1_cf), "3", "v3")); + IsOverlapError(Merge(mopts, {3_db, 4_db}, 0_db, {default_cf, 1_cf})); + + // nullptr is an empty key. + ASSERT_OK(get_db(4_db)->SingleDelete(wopts, get_cf(4_db, 1_cf), nullptr)); + IsOverlapError(Merge(mopts, {1_db, 4_db}, 0_db, {default_cf, 1_cf})); + + Slice start = "0"; + Slice end = "2"; + ASSERT_OK( + get_db(2_db)->CompactRange(copts, get_cf(2_db, 1_cf), &start, &end)); + start = "22"; + end = "99"; + ASSERT_OK( + get_db(2_db)->CompactRange(copts, get_cf(2_db, 1_cf), &start, &end)); + + start = "0"; + end = "3"; + ASSERT_OK( + get_db(3_db)->CompactRange(copts, get_cf(3_db, 1_cf), &start, &end)); + start = "33"; + end = "99"; + ASSERT_OK( + get_db(3_db)->CompactRange(copts, get_cf(3_db, 1_cf), &start, &end)); + + end = "4"; + ASSERT_OK( + get_db(4_db)->CompactRange(copts, get_cf(4_db, 1_cf), nullptr, &end)); + + mopts.merge_memtable = true; + ASSERT_OK(Merge(mopts, {1_db, 2_db, 3_db, 4_db}, 0_db, {default_cf, 1_cf})); + + VerifyKeyValue(0_db, 1_cf, "1", "v1"); + VerifyKeyValue(0_db, 1_cf, "2", "v2"); + VerifyKeyValue(0_db, 1_cf, "3", "v3"); + VerifyKeyValue(0_db, 1_cf, "4", "v4"); +} + +TEST_F(DBMergeTest, WithWAL) { + WriteOptions wopts; + wopts.disableWAL = false; + MergeInstanceOptions mopts; + FlushOptions fopts; + fopts.allow_write_stall = true; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {default_cf, 1_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "2", "v2")); + + // Ignore WAL and memtable. 
+ mopts.merge_memtable = false; + ASSERT_OK(Merge(mopts, {1_db}, 2_db)); + VerifyKeyValue(2_db, 1_cf, "2", "v2"); + VerifyKeyValue(2_db, 1_cf, "1", "NotFound"); + + mopts.merge_memtable = true; + IsWALNotEmpty(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + + for (auto db : {1_db, 2_db}) { + ASSERT_OK(get_db(db)->Flush(fopts, get_cf(db, 1_cf))); + } + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); +} + +TEST_F(DBMergeTest, MemtableIsolation) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + Open(1_db, {default_cf}); + Open(2_db, {default_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v1")); + ASSERT_OK(Merge(mopts, {1_db}, 2_db, {default_cf})); + VerifyKeyValue(2_db, default_cf, "1", "v1"); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v2")); + // Increase the seqno of 2_db so that snapshot might include new writes. + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v")); + // Check merged DB is not affected by source DB writes. + VerifyKeyValue(2_db, default_cf, "1", "v1"); +} + +TEST_F(DBMergeTest, CacheReuse) { + BlockBasedTableOptions table_options; + // Otherwise the reader will not attempt to read cache first. + table_options.cache_index_and_filter_blocks = true; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + WriteOptions wopts; + wopts.disableWAL = true; + ReadOptions ropts; + ropts.fill_cache = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + Open(1_db, {default_cf}); + Open(2_db, {default_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v1")); + for (auto db : {1_db, 2_db}) { + ASSERT_OK(get_db(db)->Flush(FlushOptions(), get_cf(db, default_cf))); + } + VerifyKeyValue(1_db, default_cf, "1", "v1", ropts); + VerifyKeyValue(2_db, default_cf, "2", "v1", ropts); + ropts.read_tier = ReadTier::kBlockCacheTier; + VerifyKeyValue(1_db, default_cf, "1", "v1", ropts); + VerifyKeyValue(2_db, default_cf, "2", "v1", ropts); + + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf})); + + ropts.read_tier = ReadTier::kBlockCacheTier; + VerifyKeyValue(3_db, default_cf, "1", "v1", ropts); + VerifyKeyValue(3_db, default_cf, "2", "v1", ropts); +} + +TEST_F(DBMergeTest, ConcurrentFlush) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MergeDisjointInstances:AfterMergeMemtable:1", + [&](void* /*arg*/) { + for (auto db : {1_db, 2_db}) { + ASSERT_OK(get_db(db)->Flush(FlushOptions(), get_cf(db, default_cf))); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Open(1_db, {default_cf}); + Open(2_db, {default_cf}); + Open(3_db, {default_cf}); + ASSERT_OK(get_db(3_db)->PauseBackgroundWork()); + + // Put some to memtable. 
+ ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v1")); + ASSERT_EQ(Property(1_db, "rocksdb.num-files-at-level0"), 0); + ASSERT_EQ(Property(2_db, "rocksdb.num-files-at-level0"), 0); + + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf})); + ASSERT_EQ(Property(1_db, "rocksdb.num-files-at-level0"), 1); + ASSERT_EQ(Property(2_db, "rocksdb.num-files-at-level0"), 1); + ASSERT_EQ(Property(3_db, "rocksdb.num-files-at-level0"), 0); + + VerifyKeyValue(3_db, default_cf, "1", "v1"); + VerifyKeyValue(3_db, default_cf, "2", "v1"); + + ASSERT_OK(get_db(3_db)->ContinueBackgroundWork()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBMergeTest, MissingCF) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {default_cf, 2_cf}); + Open(3_db, {default_cf, 3_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "key", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 2_cf), "key", "v2")); + ASSERT_OK(get_db(3_db)->Put(wopts, get_cf(3_db, 3_cf), "key", "v3")); + + ASSERT_OK( + Merge(mopts, {1_db, 2_db, 3_db}, 4_db, {default_cf, 1_cf, 2_cf, 3_cf})); + + VerifyKeyValue(4_db, 1_cf, "key", "v1"); + VerifyKeyValue(4_db, 2_cf, "key", "v2"); + VerifyKeyValue(4_db, 3_cf, "key", "v3"); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 337eadb7328..73100d74933 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -1081,7 +1081,6 @@ TEST_F(DBPropertiesTest, EstimateCompressionRatio) { ASSERT_GT(CompressionRatioAtLevel(1), 10.0); } - class CountingUserTblPropCollector : public TablePropertiesCollector { public: const char* Name() const override { return "CountingUserTblPropCollector"; } @@ -2171,7 +2170,7 @@ TEST_F(DBPropertiesTest, GetMapPropertyWriteStallStats) { WriteStallCause::kMemtableLimit}) { if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } else if (test_cause == WriteStallCause::kMemtableLimit) { options.max_write_buffer_number = 2; options.disable_auto_compactions = true; @@ -2207,13 +2206,13 @@ TEST_F(DBPropertiesTest, GetMapPropertyWriteStallStats) { if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { ASSERT_OK(dbfull()->Put( WriteOptions(), handles_[1], Key(1), - DummyString(options.write_buffer_manager->buffer_size()))); + DummyString(options.write_buffer_manager->flush_size()))); WriteOptions wo; wo.no_slowdown = true; Status s = dbfull()->Put( wo, handles_[1], Key(2), - DummyString(options.write_buffer_manager->buffer_size())); + DummyString(options.write_buffer_manager->flush_size())); ASSERT_TRUE(s.IsIncomplete()); ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); } else if (test_cause == WriteStallCause::kMemtableLimit) { @@ -2364,7 +2363,6 @@ TEST_F(DBPropertiesTest, TableMetaIndexKeys) { } while (ChangeOptions()); } - } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_test.cc b/db/db_test.cc 
index 646e3101f50..f0bdca59528 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3219,7 +3219,9 @@ class ModelDB : public DB { delete reinterpret_cast(snapshot); } - Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override { + using DB::Write; + Status Write(const WriteOptions& /*options*/, WriteBatch* batch, + PostWriteCallback* /*callback*/) override { class Handler : public WriteBatch::Handler { public: KVMap* map_; diff --git a/db/db_test2.cc b/db/db_test2.cc index 026334509d1..4f33841c334 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -25,6 +25,7 @@ #include "rocksdb/trace_record_result.h" #include "rocksdb/utilities/replayer.h" #include "rocksdb/wal_filter.h" +#include "test_util/mock_time_env.h" #include "test_util/testutil.h" #include "util/random.h" #include "utilities/fault_injection_env.h" @@ -331,11 +332,11 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_LT(cache->GetUsage(), 256 * 1024); if (use_old_interface_) { - options.db_write_buffer_size = 120000; // this is the real limit + options.db_write_buffer_size = 100000; } else if (!cost_cache_) { - options.write_buffer_manager.reset(new WriteBufferManager(114285)); + options.write_buffer_manager.reset(new WriteBufferManager(100000)); } else { - options.write_buffer_manager.reset(new WriteBufferManager(114285, cache)); + options.write_buffer_manager.reset(new WriteBufferManager(100000, cache)); } options.write_buffer_size = 500000; // this is never hit CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); @@ -366,7 +367,6 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); - flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); @@ -512,10 +512,8 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); options.write_buffer_size = 500000; // this is never hit - // Use a write buffer total size so that the soft limit is about - // 105000. - options.write_buffer_manager.reset(new WriteBufferManager(120000)); - CreateAndReopenWithCF({"cf1", "cf2"}, options); + options.write_buffer_manager.reset(new WriteBufferManager(100000)); + CreateAndReopenWithCF({"cf1"}, options); ASSERT_OK(DestroyDB(dbname2, options)); DB* db2 = nullptr; @@ -527,7 +525,6 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { std::function wait_flush = [&]() { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); // Ensure background work is fully finished including listener callbacks // before accessing listener state. 
@@ -536,49 +533,134 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { static_cast_with_check(db2)->TEST_WaitForBackgroundWork()); }; - // Trigger a flush on cf2 - flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; - ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); - wait_flush(); + // Trigger a flush on DB1.cf1 + flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); wait_flush(); + ASSERT_OK(Put(1, Key(1), DummyString(70000), wo)); + wait_flush(); // Insert to DB2 + // [20000, 70000, 20000] ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000))); wait_flush(); - ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); wait_flush(); - ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") + - GetNumberOfSstFilesForColumnFamily(db_, "cf1") + - GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(0)); } - // Triggering to flush another CF in DB1 + // Triggering to flush DB2 by writing to DB1 + // [20000, 0, 90000] ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000))); wait_flush(); - ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); - wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); + static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + } + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); + wait_flush(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(1)); + } + + // Triggering flush in DB2 by writing to DB2 + // [20000, 0, 80000] + ASSERT_OK(db2->Put(wo, Key(3), DummyString(80000))); + ASSERT_OK(db2->Put(wo, Key(1), DummyString(10000))); + wait_flush(); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(2)); } - // Triggering flush in DB2. - ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000))); + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB_RankByAge) { + std::string dbname2 = test::PerThreadDBPath("db_shared_wb_age_db2"); + Options options = CurrentOptions(); + options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. 
+ options.avoid_flush_during_shutdown = true; + // Avoid undeterministic value by malloc_usable_size(); + // Force arena block size to 1 + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Arena::Arena:0", [&](void* arg) { + size_t* block_size = static_cast(arg); + *block_size = 1; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Arena::AllocateNewBlock:0", [&](void* arg) { + std::pair* pair = + static_cast*>(arg); + *std::get<0>(*pair) = *std::get<1>(*pair); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + options.write_buffer_size = 500000; // this is never hit + options.write_buffer_manager.reset(new WriteBufferManager( + 100000, nullptr /*cache*/, 0.0 /*stall_ratio*/, true /*flush_oldest*/)); + + auto mock_clock = std::make_shared(SystemClock::Default()); + options.env = new CompositeEnvWrapper(options.env, mock_clock); + + CreateAndReopenWithCF({"cf1"}, options); + + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + ASSERT_OK(DB::Open(options, dbname2, &db2)); + + WriteOptions wo; + wo.disableWAL = true; + + std::function wait_flush = [&]() { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + }; + + // Trigger a flush on DB1.cf2 + flush_listener->expected_flush_reason = FlushReason::kManualFlush; + mock_clock->SetCurrentTime(50); + ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); + wait_flush(); + mock_clock->SetCurrentTime(100); + ASSERT_OK(Put(1, Key(1), DummyString(70000), wo)); + wait_flush(); + mock_clock->SetCurrentTime(150); + + // Insert to DB2 + // [20000, 70000, 20000] + ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000))); wait_flush(); - ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); + + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { @@ -586,10 +668,31 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); + } + + // Triggering to flush DB1 by writing to DB2 + // [20000, 0, 90000] + ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000))); + wait_flush(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); + } + ASSERT_OK(db2->Put(wo, Key(3), DummyString(1))); + wait_flush(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); } delete db2; diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 28d67527fe9..6d46be7ea1a 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -617,6 +617,22 @@ void DBTestBase::ReopenWithColumnFamilies(const std::vector& cfs, ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } +void DBTestBase::OpenWithCFWriteBufferManager( + const std::vector& cfs, + const std::vector> wbms, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), 
kDefaultColumnFamilyName); + std::vector cf_options; + for (size_t i = 0; i < wbms.size(); ++i) { + auto o = options; + o.cf_write_buffer_manager = wbms[i]; + cf_options.push_back(o); + } + ReopenWithColumnFamilies(cfs_plus_default, cf_options); +} + void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) { time_elapse_only_sleep_on_reopen_ = true; diff --git a/db/db_test_util.h b/db/db_test_util.h index dc34352dc2e..f1298dc6bf5 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -1118,6 +1118,11 @@ class DBTestBase : public testing::Test { Status TryReopenWithColumnFamilies(const std::vector& cfs, const Options& options); + void OpenWithCFWriteBufferManager( + const std::vector& cfs, + const std::vector> wbms, + const Options& options); + void Reopen(const Options& options); void Close(); diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc index eb33ec41e12..28dc9908c6a 100644 --- a/db/db_write_buffer_manager_test.cc +++ b/db/db_write_buffer_manager_test.cc @@ -31,10 +31,10 @@ TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } WriteOptions wo; @@ -74,10 +74,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } WriteOptions wo; wo.disableWAL = true; @@ -179,6 +179,374 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +// Compared with `SharedWriteBufferAcrossCFs2` this test uses CF based write +// buffer manager CF level write buffer manager will not block write even +// exceeds the stall threshold DB level write buffer manager will block all +// write including CFs not use it. 
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs3) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + std::shared_ptr cf_write_buffer_manager; + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 1.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, cache, 1.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 1.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, nullptr, 1.0)); + } + + WriteOptions wo; + wo.disableWAL = true; + + std::vector cfs = {"cf1", "cf2", "cf3", "cf4", "cf5"}; + std::vector> wbms = { + nullptr, + nullptr, + nullptr, + nullptr, + cf_write_buffer_manager, + cf_write_buffer_manager}; + OpenWithCFWriteBufferManager(cfs, wbms, options); + auto opts = db_->GetOptions(); + + ASSERT_OK(Put(4, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(5, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(4, Key(1), DummyString(40000), wo)); + // Now, cf_write_buffer_manager reaches the stall level, but it will not block + // the write + + int num_writers_total = 6; + for (int i = 0; i < num_writers_total; i++) { + ASSERT_OK(Put(i, Key(1), DummyString(1), wo)); + } + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers1 = 4; // default, cf1-cf3 + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. 
+ if (w_set.size() == (unsigned long)num_writers1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + threads.emplace_back(writer, 1); + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers_total; i++) { + threads.emplace_back(writer, i % 6); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_writers_total); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple WriteBufferManager are independent to flush +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs4) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + std::shared_ptr cf_write_buffer_manager; + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 0.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, cache, 0.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 0.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, nullptr, 0.0)); + } + + WriteOptions wo; + wo.disableWAL = true; + + std::vector cfs = {"cf1", "cf2", "cf3", "cf4", "cf5"}; + std::vector> wbms = { + nullptr, + nullptr, + nullptr, + nullptr, + cf_write_buffer_manager, + cf_write_buffer_manager}; + OpenWithCFWriteBufferManager(cfs, wbms, options); + + ASSERT_OK(Put(4, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(4, Key(1), DummyString(40000), wo)); + + ASSERT_OK(Put(1, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(30000), wo)); + + ASSERT_OK(Put(5, Key(1), DummyString(50000), wo)); + + // The second WriteBufferManager::buffer_size_ has exceeded after the previous + // write is completed. 
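For readers outside the test harness: OpenWithCFWriteBufferManager attaches the second manager by setting the new cf_write_buffer_manager option on the options used for cf4/cf5. A condensed sketch of the same wiring against the public API, assuming cf_write_buffer_manager is a column-family-level option of type std::shared_ptr<WriteBufferManager> (as its use in db_test_util.cc suggests) and the WriteBufferManager(flush_size, cache, stall_ratio) constructor seen throughout this patch; the path and CF name are illustrative:

#include <memory>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

namespace ROCKSDB_NAMESPACE {

Status OpenWithDedicatedCfManager(const std::string& path, DB** db,
                                  std::vector<ColumnFamilyHandle*>* handles) {
  Options base;
  base.create_if_missing = true;
  base.create_missing_column_families = true;
  // DB-wide manager; stall_ratio 1.0 enables stalling, 0.0 disables it
  // (matching how the updated tests use this argument).
  base.write_buffer_manager =
      std::make_shared<WriteBufferManager>(100000, nullptr, 1.0);

  // Dedicated manager for one column family, accounted separately from the
  // DB-wide manager.
  auto cf_wbm = std::make_shared<WriteBufferManager>(100000, nullptr, 0.0);
  ColumnFamilyOptions plain_cf(base);
  ColumnFamilyOptions dedicated_cf(base);
  dedicated_cf.cf_write_buffer_manager = cf_wbm;

  std::vector<ColumnFamilyDescriptor> descriptors{
      {kDefaultColumnFamilyName, plain_cf}, {"dedicated", dedicated_cf}};
  return DB::Open(DBOptions(base), path, descriptors, handles, db);
}

}  // namespace ROCKSDB_NAMESPACE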
+ + std::unordered_set flush_cfs; + std::vector threads; + int num_writers_total = 6; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Flush:ScheduleFlushReq", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + ColumnFamilyHandle* cfd = reinterpret_cast(arg); + flush_cfs.insert(cfd->GetName()); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf, int val_size) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(val_size), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + for (int i = 0; i < num_writers_total; i++) { + threads.emplace_back(writer, i % 6, 1); + } + for (auto& t : threads) { + t.join(); + } + threads.clear(); + + ASSERT_TRUE(s); + ASSERT_EQ(flush_cfs.size(), 1); + ASSERT_NE(flush_cfs.find("cf4"), flush_cfs.end()); + flush_cfs.clear(); + + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + for (int i = 0; i < num_writers_total; i++) { + threads.emplace_back(writer, i % 6, 1); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_EQ(flush_cfs.size(), 1); + ASSERT_NE(flush_cfs.find("cf1"), flush_cfs.end()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBWriteBufferManagerTest, FreeMemoryOnDestroy) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + options.max_write_buffer_number = 5; // Avoid unexpected stalling. + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 1.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 1.0)); + } + + CreateAndReopenWithCF({"cf1", "cf2"}, options); + std::string db2_name = test::PerThreadDBPath("free_memory_on_destroy_db2"); + DB* db2 = nullptr; + ASSERT_OK(DestroyDB(db2_name, options)); + ASSERT_OK(DB::Open(options, db2_name, &db2)); + + ASSERT_OK(db_->PauseBackgroundWork()); + ASSERT_OK(db2->PauseBackgroundWork()); + + WriteOptions wo; + wo.disableWAL = true; + wo.no_slowdown = true; + + ASSERT_OK(db2->Put(wo, Key(1), DummyString(30000))); + ASSERT_OK(Put(1, Key(1), DummyString(20000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + + // Decrease flush size, at least two cfs must be freed to not stall write. + options.write_buffer_manager->SetFlushSize(50000); + ASSERT_TRUE(Put(0, Key(1), DummyString(30000), wo).IsIncomplete()); + + ASSERT_OK(db2->ContinueBackgroundWork()); // Close waits on pending jobs. + // Thanks to `UnregisterDB`, we don't have to delete it to free up space. + db2->Close(); + ASSERT_TRUE(Put(0, Key(1), DummyString(30000), wo).IsIncomplete()); + + dbfull()->TEST_ClearBackgroundJobs(); // Jobs hold ref of cfd. 
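FreeMemoryOnDestroy above and DynamicFlushSize below both tune the limit at runtime through the new setter. A minimal sketch of that knob in isolation, assuming only the SetFlushSize() and flush_size() methods used in this patch; the helper name is illustrative:

#include <cassert>
#include <memory>

#include "rocksdb/write_buffer_manager.h"

namespace ROCKSDB_NAMESPACE {

// Illustrative: resize the shared flush threshold at runtime. Per the
// surrounding tests, raising it can unblock stalled writers and lowering
// it can trigger flushes on subsequent writes.
void ResizeSharedWriteBuffer(const std::shared_ptr<WriteBufferManager>& wbm,
                             size_t new_flush_size) {
  wbm->SetFlushSize(new_flush_size);
  assert(wbm->flush_size() == new_flush_size);
}

}  // namespace ROCKSDB_NAMESPACE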
+ ASSERT_OK(db_->DropColumnFamily(handles_[1])); + ASSERT_TRUE(Put(0, Key(1), DummyString(30000), wo).IsIncomplete()); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[1])); + handles_.erase(handles_.begin() + 1); + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + delete db2; + DestroyDB(db2_name, options); + + ASSERT_OK(db_->ContinueBackgroundWork()); +} + +TEST_P(DBWriteBufferManagerTest, DynamicFlushSize) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 1.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 1.0)); + } + + CreateAndReopenWithCF({"cf1", "cf2"}, options); + std::string db2_name = test::PerThreadDBPath("dynamic_flush_db2"); + DB* db2 = nullptr; + ASSERT_OK(DestroyDB(db2_name, options)); + ASSERT_OK(DB::Open(options, db2_name, &db2)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Increase flush size can unblock writers. + { + WriteOptions wo; + wo.disableWAL = true; + ASSERT_OK(db2->Put(wo, Key(1), DummyString(60000))); + ASSERT_OK(Put(1, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + // Write to DB. + std::vector threads; + std::atomic ready{false}; + std::function write_db = [&](DB* db) { + WriteOptions wopts; + wopts.disableWAL = true; + wopts.no_slowdown = true; + ASSERT_TRUE(db->Put(wopts, Key(3), DummyString(1)).IsIncomplete()); + ready = true; + wopts.no_slowdown = false; + ASSERT_OK(db->Put(wopts, Key(3), DummyString(1))); + }; + // Triggers db2 flush, but the flush is blocked. + threads.emplace_back(write_db, db_); + while (!ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + // Increase. + options.write_buffer_manager->SetFlushSize(200000); + for (auto& t : threads) { + t.join(); + } + TEST_SYNC_POINT("DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + // Decrease flush size triggers flush. + { + WriteOptions wo; + wo.disableWAL = true; + wo.no_slowdown = true; + + ASSERT_OK(Put(0, Key(1), DummyString(60000), wo)); + // All memtables must be flushed to satisfy the new flush_size. + // Not too small because memtable has a minimum size. + options.write_buffer_manager->SetFlushSize(10240); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(db2->Put(wo, Key(1), DummyString(200000))); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + db2->Close(); + delete db2; + DestroyDB(db2_name, options); +} + // Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush // is waiting to be finished but DBs tries to write meanwhile. 
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { @@ -201,10 +569,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } CreateAndReopenWithCF({"cf1", "cf2"}, options); @@ -216,10 +584,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { wo.disableWAL = true; for (int i = 0; i < num_dbs; i++) { - ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(25000))); } // Insert to db_. - ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(25000), wo)); // WriteBufferManager Limit exceeded. std::vector threads; @@ -318,10 +686,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } CreateAndReopenWithCF({"cf1", "cf2"}, options); @@ -333,10 +701,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { wo.disableWAL = true; for (int i = 0; i < num_dbs; i++) { - ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(25000))); } // Insert to db_. - ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(25000), wo)); // WriteBufferManager::buffer_size_ has exceeded after the previous write to // dbs[0] is completed. @@ -460,10 +828,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } WriteOptions wo; wo.disableWAL = true; @@ -622,10 +990,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } CreateAndReopenWithCF({"cf1", "cf2"}, options); @@ -637,10 +1005,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { wo.disableWAL = true; for (int i = 0; i < num_dbs; i++) { - ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(25000))); } // Insert to db_. - ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(25000), wo)); // WriteBufferManager::buffer_size_ has exceeded after the previous write to // dbs[0] is completed. @@ -780,7 +1148,6 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } - // Tests a `WriteBufferManager` constructed with `allow_stall == false` does not // thrash memtable switching when full and a CF receives multiple writes. 
// Instead, we expect to switch a CF's memtable for flush only when that CF does @@ -791,7 +1158,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { // by writing to that CF's DB. // // Not supported in LITE mode due to `GetProperty()` unavailable. -TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { +TEST_P(DBWriteBufferManagerTest, DISABLED_StopSwitchingMemTablesOnceFlushing) { Options options = CurrentOptions(); options.arena_block_size = 4 << 10; // 4KB options.write_buffer_size = 1 << 20; // 1MB @@ -846,72 +1213,84 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { delete shared_wbm_db; } -TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) { - constexpr int kBigValue = 10000; +// Test write can progress even if manual compaction and background work is +// paused. +TEST_P(DBWriteBufferManagerTest, BackgroundWorkPaused) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } Options options = CurrentOptions(); - options.write_buffer_manager.reset( - new WriteBufferManager(1, nullptr /* cache */, true /* allow_stall */)); - DestroyAndReopen(options); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + options.avoid_flush_during_shutdown = true; // avoid blocking destroy forever + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); - // Pause flush thread so that - // (a) the only way to exist write stall below is to change the `allow_stall` - // (b) the write stall is "stable" without being interfered by flushes so that - // we can check it without flakiness - std::unique_ptr sleeping_task( - new test::SleepingBackgroundTask()); - env_->SetBackgroundThreads(1, Env::HIGH); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - sleeping_task.get(), Env::Priority::HIGH); - sleeping_task->WaitUntilSleeping(); - - // Test 1: test setting `allow_stall` from true to false - // - // Assert existence of a write stall - WriteOptions wo_no_slowdown; - wo_no_slowdown.no_slowdown = true; - Status s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown); - ASSERT_TRUE(s.IsIncomplete()); - ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); + // Do not enable write stall. + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 0.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 0.0)); + } + DestroyAndReopen(options); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"WBMStallInterface::BlockDB", - "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::" - "ChangeParameter"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } - // Test `SetAllowStall()` - port::Thread thread1([&] { ASSERT_OK(Put(Key(0), DummyString(kBigValue))); }); - port::Thread thread2([&] { - TEST_SYNC_POINT( - "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::" - "ChangeParameter"); - options.write_buffer_manager->SetAllowStall(false); - }); - - // Verify `allow_stall` is successfully set to false in thread2. 
- // Othwerwise, thread1's write will be stalled and this test will hang - // forever. - thread1.join(); - thread2.join(); + dbfull()->DisableManualCompaction(); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + for (int i = 0; i < num_dbs; i++) { + dbs[i]->DisableManualCompaction(); + ASSERT_OK(dbs[i]->PauseBackgroundWork()); + } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + WriteOptions wo; + wo.disableWAL = true; - // Test 2: test setting `allow_stall` from false to true - // - // Assert no write stall - ASSERT_OK(Put(Key(0), DummyString(kBigValue), wo_no_slowdown)); + // Arrange the score like this: (this)2000, (0-th)100000, (1-th)1, ... + ASSERT_OK(Put(Key(1), DummyString(2000), wo)); + for (int i = 1; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(1))); + } + // Exceed the limit. + ASSERT_OK(dbs[0]->Put(wo, Key(1), DummyString(100000))); + // Write another one to trigger the flush. + ASSERT_OK(Put(Key(3), DummyString(1), wo)); - // Test `SetAllowStall()` - options.write_buffer_manager->SetAllowStall(true); + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->ContinueBackgroundWork()); + ASSERT_OK( + static_cast_with_check(dbs[i])->TEST_WaitForFlushMemTable()); + std::string property; + EXPECT_TRUE(dbs[i]->GetProperty("rocksdb.num-files-at-level0", &property)); + int num = atoi(property.c_str()); + ASSERT_EQ(num, 0); + } + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + std::string property; + EXPECT_TRUE(dbfull()->GetProperty("rocksdb.num-files-at-level0", &property)); + int num = atoi(property.c_str()); + ASSERT_EQ(num, 1); - // Verify `allow_stall` is successfully set to true. - // Otherwise the following write will not be stalled and therefore succeed. - s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown); - ASSERT_TRUE(s.IsIncomplete()); - ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); - sleeping_task->WakeUp(); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } } INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 0c6fdf849c5..daaa7ba4067 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -859,6 +859,103 @@ TEST_P(DBWriteTest, MultiThreadWrite) { Close(); } +class SimpleCallback : public PostWriteCallback { + std::function f_; + + public: + SimpleCallback(std::function&& f) : f_(f) {} + + void Callback(SequenceNumber seq) override { f_(seq); } +}; + +TEST_P(DBWriteTest, PostWriteCallback) { + Options options = GetOptions(); + if (options.two_write_queues) { + // Not compatible. 
+ return; + } + Reopen(options); + + std::vector threads; + + port::Mutex the_first_can_exit_write_mutex; + the_first_can_exit_write_mutex.Lock(); + port::Mutex can_flush_mutex; + can_flush_mutex.Lock(); + port::Mutex the_second_can_exit_write_mutex; + the_second_can_exit_write_mutex.Lock(); + + std::atomic written(0); + std::atomic flushed(false); + + threads.push_back(port::Thread([&] { + WriteBatch batch; + WriteOptions opts; + opts.sync = false; + opts.disableWAL = true; + SimpleCallback callback([&](SequenceNumber seq) { + ASSERT_NE(seq, 0); + can_flush_mutex.Unlock(); + the_first_can_exit_write_mutex.Lock(); + the_second_can_exit_write_mutex.Unlock(); + }); + batch.Put("key", "value"); + ASSERT_OK(dbfull()->Write(opts, &batch, &callback)); + written.fetch_add(1, std::memory_order_relaxed); + })); + threads.push_back(port::Thread([&] { + WriteBatch batch; + WriteOptions opts; + opts.sync = false; + opts.disableWAL = true; + SimpleCallback callback([&](SequenceNumber seq) { + ASSERT_NE(seq, 0); + the_second_can_exit_write_mutex.Lock(); + }); + batch.Put("key", "value"); + ASSERT_OK(dbfull()->Write(opts, &batch, &callback)); + written.fetch_add(1, std::memory_order_relaxed); + })); + // Flush will enter write thread and wait for pending writes. + threads.push_back(port::Thread([&] { + FlushOptions opts; + opts.wait = false; + can_flush_mutex.Lock(); + ASSERT_OK(dbfull()->Flush(opts)); + flushed.store(true, std::memory_order_relaxed); + })); + + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + ASSERT_EQ(written.load(std::memory_order_relaxed), 0); + ASSERT_EQ(flushed.load(std::memory_order_relaxed), false); + + the_first_can_exit_write_mutex.Unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + ASSERT_EQ(written.load(std::memory_order_relaxed), 2); + ASSERT_EQ(flushed.load(std::memory_order_relaxed), true); + + for (auto& t : threads) { + t.join(); + } +} + +TEST_P(DBWriteTest, PostWriteCallbackEmptyBatch) { + Options options = GetOptions(); + if (options.two_write_queues) { + // Not compatible. 
+ return; + } + Reopen(options); + WriteBatch batch; + WriteOptions opts; + opts.sync = false; + opts.disableWAL = true; + SequenceNumber seq = 0; + SimpleCallback callback([&](SequenceNumber s) { seq = s; }); + ASSERT_OK(dbfull()->Write(opts, &batch, &callback)); + ASSERT_NE(seq, 0); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/flush_job.cc b/db/flush_job.cc index a3e168823a6..4052c8b7940 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -169,7 +169,8 @@ void FlushJob::RecordFlushIOStats() { ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } -void FlushJob::PickMemTable() { +void FlushJob::PickMemTable(SequenceNumber* earliest_seqno, + SequenceNumber* largest_seqno) { db_mutex_->AssertHeld(); assert(!pick_memtable_called); pick_memtable_called = true; @@ -214,6 +215,14 @@ void FlushJob::PickMemTable() { base_ = cfd_->current(); base_->Ref(); // it is likely that we do not need this reference + if (earliest_seqno != nullptr) { + *earliest_seqno = m->GetEarliestSequenceNumber(); + } + if (largest_seqno != nullptr) { + *largest_seqno = mems_.back()->GetLargestSequenceNumber(); + } + assert(earliest_seqno == nullptr || largest_seqno == nullptr || + *earliest_seqno <= *largest_seqno); } Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, diff --git a/db/flush_job.h b/db/flush_job.h index aef33ef423a..dfe51e8366c 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -82,10 +82,13 @@ class FlushJob { // Require db_mutex held. // Once PickMemTable() is called, either Run() or Cancel() has to be called. - void PickMemTable(); + void PickMemTable(SequenceNumber* earliest_seqno = nullptr, + SequenceNumber* largest_seqno = nullptr); // @param skip_since_bg_error If not nullptr and if atomic_flush=false, // then it is set to true if flush installation is skipped and memtable // is rolled back due to existing background error. + // The earliest seqno and largest seqno will be returned through the + // parameters. Status Run(LogsWithPrepTracker* prep_tracker = nullptr, FileMetaData* file_meta = nullptr, bool* switched_to_mempurge = nullptr, diff --git a/db/memtable.cc b/db/memtable.cc index 0b8786bc2ff..e4f0804695f 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -78,13 +78,12 @@ MemTable::MemTable(const InternalKeyComparator& cmp, refs_(0), kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)), mem_tracker_(write_buffer_manager), - arena_(moptions_.arena_block_size, - (write_buffer_manager != nullptr && - (write_buffer_manager->enabled() || - write_buffer_manager->cost_to_cache())) - ? &mem_tracker_ - : nullptr, - mutable_cf_options.memtable_huge_page_size), + arena_( + moptions_.arena_block_size, + (write_buffer_manager != nullptr && (write_buffer_manager->enabled())) + ? &mem_tracker_ + : nullptr, + mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), ioptions.logger, column_family_id)), @@ -102,6 +101,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, file_number_(0), first_seqno_(0), earliest_seqno_(latest_seq), + largest_seqno_(latest_seq), creation_seq_(latest_seq), mem_next_logfile_number_(0), min_prep_log_referenced_(0), @@ -109,6 +109,12 @@ MemTable::MemTable(const InternalKeyComparator& cmp, ? 
moptions_.inplace_update_num_locks : 0), prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + needs_bloom_filter_( + (prefix_extractor_ || moptions_.memtable_whole_key_filtering) && + moptions_.memtable_prefix_bloom_bits > 0), + bloom_filter_ptr_(nullptr), + bloom_filter_(nullptr), + logger_(ioptions.logger), flush_state_(FLUSH_NOT_REQUESTED), clock_(ioptions.clock), insert_with_hint_prefix_extractor_( @@ -122,14 +128,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp, // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); - // use bloom_filter_ for both whole key and prefix bloom filter - if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) && - moptions_.memtable_prefix_bloom_bits > 0) { - bloom_filter_.reset( - new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, - 6 /* hard coded 6 probes */, - moptions_.memtable_huge_page_size, ioptions.logger)); - } // Initialize cached_range_tombstone_ here since it could // be read before it is constructed in MemTable::Add(), which could also lead // to a data race on the global mutex table backing atomic shared_ptr. @@ -361,8 +359,8 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator : public InternalIterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& read_options, - Arena* arena, bool use_range_del_table = false) + MemTableIterator(MemTable& mem, const ReadOptions& read_options, Arena* arena, + bool use_range_del_table = false) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), @@ -379,7 +377,7 @@ class MemTableIterator : public InternalIterator { } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && !read_options.auto_prefix_mode) { // Auto prefix mode is not implemented in memtable yet. - bloom_ = mem.bloom_filter_.get(); + bloom_ = mem.GetBloomFilter(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { iter_ = mem.table_->GetIterator(arena); @@ -772,12 +770,13 @@ Status MemTable::Add(SequenceNumber s, ValueType type, num_range_deletes_.store(val, std::memory_order_relaxed); } - if (bloom_filter_ && prefix_extractor_ && + auto bloom_filter = GetBloomFilter(); + if (bloom_filter && prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { - bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); + bloom_filter->Add(prefix_extractor_->Transform(key_without_ts)); } - if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key_without_ts); + if (bloom_filter && moptions_.memtable_whole_key_filtering) { + bloom_filter->Add(key_without_ts); } // The first sequence number inserted into the memtable @@ -791,6 +790,9 @@ Status MemTable::Add(SequenceNumber s, ValueType type, } assert(first_seqno_.load() >= earliest_seqno_.load()); } + if (s > largest_seqno_) { + largest_seqno_.store(s, std::memory_order_relaxed); + } assert(post_process_info == nullptr); // TODO(yuzhangyu): support updating newest UDT for when `allow_concurrent` // is true. 
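// Illustrative sketch (not part of the patch): the concurrent Add() path in
// the following hunk maintains the new largest_seqno_ with a lock-free
// compare-and-swap loop, while the non-concurrent path above uses a plain
// conditional store. A minimal, self-contained version of that "store max"
// pattern (atomic_store_max is a hypothetical helper name, not an API from
// this patch):
#include <atomic>
#include <cstdint>

inline void atomic_store_max(std::atomic<uint64_t>& target, uint64_t value) {
  uint64_t cur = target.load(std::memory_order_relaxed);
  // compare_exchange_weak reloads `cur` on failure, so the loop terminates
  // once the CAS succeeds or another thread has already stored a larger value.
  while (value > cur &&
         !target.compare_exchange_weak(cur, value, std::memory_order_relaxed)) {
  }
}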
@@ -811,16 +813,17 @@ Status MemTable::Add(SequenceNumber s, ValueType type, post_process_info->num_deletes++; } - if (bloom_filter_ && prefix_extractor_ && + auto bloom_filter = GetBloomFilter(); + if (bloom_filter && prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { - bloom_filter_->AddConcurrently( + bloom_filter->AddConcurrently( prefix_extractor_->Transform(key_without_ts)); } - if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key_without_ts); + if (bloom_filter && moptions_.memtable_whole_key_filtering) { + bloom_filter->AddConcurrently(key_without_ts); } - // atomically update first_seqno_ and earliest_seqno_. + // atomically update first_seqno_, earliest_seqno_ and largest_seqno_. uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed); while ((cur_seq_num == 0 || s < cur_seq_num) && !first_seqno_.compare_exchange_weak(cur_seq_num, s)) { @@ -831,6 +834,10 @@ Status MemTable::Add(SequenceNumber s, ValueType type, (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { } + uint64_t cur_largest_seqno = largest_seqno_.load(std::memory_order_acquire); + while (s > cur_largest_seqno && + !largest_seqno_.compare_exchange_weak(cur_largest_seqno, s)) { + } } if (type == kTypeRangeDeletion) { auto new_cache = std::make_shared(); @@ -1258,23 +1265,24 @@ bool MemTable::Get(const LookupKey& key, std::string* value, bool may_contain = true; Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz_); bool bloom_checked = false; - if (bloom_filter_) { + auto bloom_filter = GetBloomFilter(); + if (bloom_filter) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key_without_ts); + may_contain = bloom_filter->MayContain(user_key_without_ts); bloom_checked = true; } else { assert(prefix_extractor_); if (prefix_extractor_->InDomain(user_key_without_ts)) { - may_contain = bloom_filter_->MayContain( + may_contain = bloom_filter->MayContain( prefix_extractor_->Transform(user_key_without_ts)); bloom_checked = true; } } } - if (bloom_filter_ && !may_contain) { + if (bloom_filter && !may_contain) { // iter is null if prefix bloom says the key does not exist PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); *seq = kMaxSequenceNumber; @@ -1345,7 +1353,8 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, bool no_range_del = read_options.ignore_range_deletions || is_range_del_table_empty_.load(std::memory_order_relaxed); MultiGetRange temp_range(*range, range->begin(), range->end()); - if (bloom_filter_ && no_range_del) { + auto bloom_filter = GetBloomFilter(); + if (bloom_filter && no_range_del) { bool whole_key = !prefix_extractor_ || moptions_.memtable_whole_key_filtering; std::array bloom_keys; @@ -1362,7 +1371,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, range_indexes[num_keys++] = iter.index(); } } - bloom_filter_->MayContain(num_keys, bloom_keys.data(), may_match.data()); + bloom_filter->MayContain(num_keys, bloom_keys.data(), may_match.data()); for (int i = 0; i < num_keys; ++i) { if (!may_match[i]) { temp_range.SkipIndex(range_indexes[i]); diff --git a/db/memtable.h b/db/memtable.h index c55b34761ef..b2df0df816d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -425,6 +425,15 @@ class MemTable { 
return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed); } + // Returns the sequence number that is guaranteed to be larger than the + // sequence number of any key that could be inserted into this memtable. + // + // If the largest sequence number could not be determined, + // 0 will be returned. + SequenceNumber GetLargestSequenceNumber() { + return largest_seqno_.load(std::memory_order_relaxed); + } + // DB's latest sequence ID when the memtable is created. This number // may be updated to a more recent one before any key is inserted. SequenceNumber GetCreationSeq() const { return creation_seq_; } @@ -598,6 +607,9 @@ class MemTable { // if not set. std::atomic earliest_seqno_; + // The largest sequence number of writes in this memtable. + std::atomic largest_seqno_; + SequenceNumber creation_seq_; // The log files earlier than this number can be deleted. @@ -611,7 +623,14 @@ class MemTable { std::vector locks_; const SliceTransform* const prefix_extractor_; + // Bloom filter initialization is delayed to the actual read/write. This is to + // reduce memory footprint of empty memtable. + const bool needs_bloom_filter_; + std::atomic bloom_filter_ptr_; + SpinMutex bloom_filter_mutex_; std::unique_ptr bloom_filter_; + // Only used to initialize bloom filter. + Logger* logger_; std::atomic flush_state_; @@ -697,6 +716,25 @@ class MemTable { SequenceNumber s, char* checksum_ptr); void MaybeUpdateNewestUDT(const Slice& user_key); + + inline DynamicBloom* GetBloomFilter() { + if (needs_bloom_filter_) { + auto ptr = bloom_filter_ptr_.load(std::memory_order_relaxed); + if (UNLIKELY(ptr == nullptr)) { + std::lock_guard guard(bloom_filter_mutex_); + if (bloom_filter_ == nullptr) { + bloom_filter_.reset( + new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, + 6 /* hard coded 6 probes */, + moptions_.memtable_huge_page_size, logger_)); + } + ptr = bloom_filter_.get(); + bloom_filter_ptr_.store(ptr, std::memory_order_relaxed); + } + return ptr; + } + return nullptr; + } }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index dfa93461bb1..414b179e61c 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -433,6 +433,23 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, } } +void MemTableList::ExportMemtables(autovector* ret) { + const auto& memlist = current_->memlist_; + autovector tmp; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + tmp.push_back(m); + } + // For mempurge feature. + std::sort(tmp.begin(), tmp.end(), + [](const MemTable* m1, const MemTable* m2) -> bool { + return m1->GetID() < m2->GetID(); + }); + for (auto m : tmp) { + ret->push_back(m); + } +} + void MemTableList::RollbackMemtableFlush(const autovector& mems, bool rollback_succeeding_memtables) { TEST_SYNC_POINT("RollbackMemtableFlush"); diff --git a/db/memtable_list.h b/db/memtable_list.h index 81b60288d87..7abbbf1a3e3 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -269,6 +269,9 @@ class MemTableList { autovector* mems, uint64_t* max_next_log_number = nullptr); + // Returns all memtable ordered from the oldest to the newest. + void ExportMemtables(autovector* mems); + // Reset status of the given memtable list back to pending state so that // they can get picked up again on the next round of flush. 
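// Illustrative sketch (not part of the patch): the new MemTable::GetBloomFilter()
// above delays bloom filter construction until the first read/write, using an
// atomic pointer for the fast path and a mutex for one-time construction. The
// same idea in a generic, self-contained form (LazyValue is a hypothetical
// name; the patch itself uses a SpinMutex and relaxed ordering):
#include <atomic>
#include <memory>
#include <mutex>

template <typename T>
class LazyValue {
 public:
  template <typename Factory>
  T* GetOrCreate(Factory&& make) {
    T* ptr = ptr_.load(std::memory_order_acquire);  // fast path: no lock
    if (ptr == nullptr) {
      std::lock_guard<std::mutex> guard(mu_);       // slow path: build once
      if (value_ == nullptr) {
        value_.reset(make());
      }
      ptr = value_.get();
      ptr_.store(ptr, std::memory_order_release);
    }
    return ptr;
  }

 private:
  std::atomic<T*> ptr_{nullptr};
  std::unique_ptr<T> value_;
  std::mutex mu_;
};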
// diff --git a/db/write_batch.cc b/db/write_batch.cc index 0b55cb4aae5..338dafa32ee 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -2961,6 +2961,9 @@ Status WriteBatchInternal::InsertInto( if (!w->status.ok()) { return w->status; } + if (w->post_callback) { + w->post_callback->Callback(w->sequence); + } assert(!seq_per_batch || w->batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt); } @@ -2988,6 +2991,9 @@ Status WriteBatchInternal::InsertInto( inserter.set_log_number_ref(writer->log_ref); inserter.set_prot_info(writer->multi_batch.batches[0]->prot_info_.get()); Status s = writer->multi_batch.batches[0]->Iterate(&inserter); + if (writer->post_callback && s.ok()) { + writer->post_callback->Callback(sequence); + } assert(!seq_per_batch || batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); if (concurrent_memtable_writes) { diff --git a/db/write_thread.cc b/db/write_thread.cc index b24d3667af8..66f01a753c2 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -903,6 +903,8 @@ void WriteThread::Writer::ConsumeOne(size_t claimed) { if (!s.ok()) { std::lock_guard guard(this->status_lock); this->status = s; + } else if (post_callback) { + post_callback->Callback(sequence); } multi_batch.pending_wb_cnt.fetch_sub(1, std::memory_order_acq_rel); } diff --git a/db/write_thread.h b/db/write_thread.h index b0c8fb5c435..a7c9fc6ba28 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -21,6 +21,7 @@ #include "db/trim_history_scheduler.h" #include "db/write_callback.h" #include "monitoring/instrumented_mutex.h" +#include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -186,6 +187,7 @@ class WriteThread { size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; PostMemTableCallback* post_memtable_callback; + PostWriteCallback* post_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference WriteCallback* callback; @@ -215,6 +217,7 @@ class WriteThread { protection_bytes_per_key(0), pre_release_callback(nullptr), post_memtable_callback(nullptr), + post_callback(nullptr), log_used(0), log_ref(0), callback(nullptr), @@ -230,7 +233,8 @@ class WriteThread { WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable, size_t _batch_cnt = 0, PreReleaseCallback* _pre_release_callback = nullptr, - PostMemTableCallback* _post_memtable_callback = nullptr) + PostMemTableCallback* _post_memtable_callback = nullptr, + PostWriteCallback* _post_callback = nullptr) : sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), @@ -240,6 +244,7 @@ class WriteThread { protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), post_memtable_callback(_post_memtable_callback), + post_callback(_post_callback), log_used(0), log_ref(_log_ref), callback(_callback), @@ -257,7 +262,8 @@ class WriteThread { Writer(const WriteOptions& write_options, std::vector&& _batch, WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable, PreReleaseCallback* _pre_release_callback = nullptr, - PostMemTableCallback* _post_memtable_callback = nullptr) + PostMemTableCallback* _post_memtable_callback = nullptr, + PostWriteCallback* _post_callback = nullptr) : sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), @@ -266,6 +272,7 @@ class 
WriteThread { batch_cnt(0), pre_release_callback(_pre_release_callback), post_memtable_callback(_post_memtable_callback), + post_callback(_post_callback), log_used(0), log_ref(_log_ref), callback(_callback), diff --git a/encryption/encryption.cc b/encryption/encryption.cc index 02f7f1bdc7b..dd9f3ca0d4f 100644 --- a/encryption/encryption.cc +++ b/encryption/encryption.cc @@ -529,15 +529,24 @@ Status KeyManagedEncryptedEnv::RenameFile(const std::string& src_fname, } s = target()->RenameFile(src_fname, dst_fname); if (s.ok()) { - s = key_manager_->DeleteFile(src_fname); + s = key_manager_->DeleteFileExt(src_fname, dst_fname); } else { Status delete_status __attribute__((__unused__)) = - key_manager_->DeleteFile(dst_fname); + key_manager_->DeleteFileExt(dst_fname, src_fname); assert(delete_status.ok()); } return s; } +Status KeyManagedEncryptedEnv::DeleteDir(const std::string& dname) { + // We don't guarantee atomicity. Delete keys first. + Status s = key_manager_->DeleteFile(dname); + if (!s.ok()) { + return s; + } + return target()->DeleteDir(dname); +} + Env* NewKeyManagedEncryptedEnv(Env* base_env, std::shared_ptr& key_manager) { std::shared_ptr provider( diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 8148a29673d..30a7dbec8eb 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -2099,8 +2099,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_destroy( rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API bool rocksdb_write_buffer_manager_enabled( rocksdb_write_buffer_manager_t* wbm); -extern ROCKSDB_LIBRARY_API bool rocksdb_write_buffer_manager_cost_to_cache( - rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t rocksdb_write_buffer_manager_memory_usage(rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t @@ -2110,11 +2108,7 @@ extern ROCKSDB_LIBRARY_API size_t rocksdb_write_buffer_manager_dummy_entries_in_cache_usage( rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t -rocksdb_write_buffer_manager_buffer_size(rocksdb_write_buffer_manager_t* wbm); -extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_buffer_size( - rocksdb_write_buffer_manager_t* wbm, size_t new_size); -extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( - rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall); +rocksdb_write_buffer_manager_flush_size(rocksdb_write_buffer_manager_t* wbm); /* HyperClockCache */ diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 410ee4d3ab0..9592b8e4a7b 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -67,6 +67,7 @@ class CompactionFilter : public Customizable { kBlobIndex, // Wide-column entity kWideColumnEntity, + kDeletion, // used only by TiKV's region range filter. }; // Potential decisions that can be returned by the compaction filter's @@ -254,9 +255,13 @@ class CompactionFilter : public Customizable { case ValueType::kBlobIndex: return Decision::kKeep; - default: + case ValueType::kDeletion: + // Should not appear in this API. assert(false); return Decision::kKeep; + + default: + return Decision::kKeep; } } @@ -298,8 +303,23 @@ class CompactionFilter : public Customizable { return Decision::kKeep; } - return FilterV2(level, key, value_type, *existing_value, new_value, - skip_until); + return UnsafeFilter(level, key, value_type, *existing_value, new_value, + skip_until); + } + + // This interface is reserved for TiKV's region range filter. 
Only this + // interface can accept `value_type=kTypeDeletion`. + virtual Decision UnsafeFilter(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const { + if (value_type != ValueType::kDeletion) { + return FilterV2(level, key, value_type, existing_value, new_value, + skip_until); + } else { + return Decision::kKeep; + } } // Internal (BlobDB) use only. Do not override in application code. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 662522976b2..494f5af1865 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -149,6 +149,14 @@ struct GetMergeOperandsOptions { using TablePropertiesCollection = std::unordered_map>; +class PostWriteCallback { + public: + virtual ~PostWriteCallback() {} + + // Will be called while on the write thread after the write executes. + virtual void Callback(SequenceNumber seq) = 0; +}; + // A DB is a persistent, versioned ordered map from keys to values. // A DB is safe for concurrent access from multiple threads without // any external synchronization. @@ -323,6 +331,50 @@ class DB { // auto-resume is in progress, without waiting for it to complete. // See DBOptions::max_bgerror_resume_count and // EventListener::OnErrorRecoveryBegin + // Merge multiple DBs into this one. All DBs must have disjoint internal + // keys. + // + // # Tips + // + // The provided DBs must be disjoint: their internal key ranges don't overlap + // each other. Calling `CompactRange` on the complementary ranges can make + // sure user-visible key range consistent with internal key range. Caveats are + // (1) sometimes `bottommost_level_compaction` needs to be configured to avoid + // trivial move; (2) range tombstones are very tricky, they might be retained + // even if there's no out-of-ranges key. + // + // To avoid triggering L0 (or Memtable) stall conditions, user can consider + // dynamically decreasing the corresponding limits before entering merge. + // + // WAL merge is not supported. User must write with disableWAL=true, or wait + // for all WALs to be retired before merging. + // + // To have the best performance, use the same `block_cache` and + // `prefix_extractor` in DB options. + // + // # Safety + // + // Performing merge on DBs that are still undergoing writes results in + // undefined behavior. + // + // Using different implementations of user comparator results in undefined + // behavior as well. + // + // Concurrently apply several merge operations on the same instance can cause + // deadlock. + // + virtual Status MergeDisjointInstances( + const MergeInstanceOptions& /*merge_options*/, + const std::vector& /*instances*/) { + return Status::NotSupported("`MergeDisjointInstances` not implemented"); + } + + // Check all data written before this call is in the range [begin, end). + // Return InvalidArgument if not. + virtual Status CheckInRange(const Slice* /*begin*/, const Slice* /*end*/) { + return Status::NotSupported("`AssertInRange` not implemented"); + } + virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. This should be @@ -540,18 +592,30 @@ class DB { // options.sync=true. // Returns OK on success, non-OK on failure. // Note: consider setting options.sync = true. 
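// Illustrative sketch (not part of the patch): how a caller might drive
// MergeDisjointInstances following the Tips above. The flush step and option
// values are assumptions; instances must hold disjoint key ranges and must not
// be receiving writes while the merge runs. Assumes the rocksdb headers and
// namespace are in scope.
Status MergeShards(DB* target, const std::vector<DB*>& sources) {
  // WAL merge is not supported: flush the sources first (or only ever write
  // them with disableWAL=true). For brevity only the default CF is flushed.
  for (DB* src : sources) {
    Status s = src->Flush(FlushOptions());
    if (!s.ok()) {
      return s;
    }
  }
  MergeInstanceOptions mopts;
  mopts.merge_memtable = true;       // sources hold no WAL data after flush
  mopts.allow_source_write = false;  // unlocks additional optimizations
  mopts.max_preload_files = 16;      // default; negative means no limit
  return target->MergeDisjointInstances(mopts, sources);
}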
- virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + virtual Status Write(const WriteOptions& options, WriteBatch* updates, + PostWriteCallback* callback) = 0; + virtual Status Write(const WriteOptions& options, WriteBatch* updates) { + return Write(options, updates, nullptr); + } virtual Status MultiBatchWrite(const WriteOptions& /*options*/, - std::vector&& /*updates*/) { + std::vector&& /*updates*/, + PostWriteCallback* /*callback*/) { return Status::NotSupported(); } + virtual Status MultiBatchWrite(const WriteOptions& options, + std::vector&& updates) { + return MultiBatchWrite(options, std::move(updates), nullptr); + } + // If the column family specified by "column_family" contains an entry for // "key", return the corresponding value in "*value". If the entry is a plain // key-value, return the value as-is; if it is a wide-column entity, return // the value of its default anonymous column (see kDefaultWideColumnName) if // any, or an empty value otherwise. + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. // // If timestamp is enabled and a non-null timestamp pointer is passed in, // timestamp is returned. @@ -1407,6 +1471,10 @@ class DB { GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size); } + virtual void GetApproximateActiveMemTableStats( + ColumnFamilyHandle* /*column_family*/, uint64_t* const /*memory_bytes*/, + uint64_t* const /*oldest_key_time*/) {} + // Compact the underlying storage for the key range [*begin,*end]. // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, diff --git a/include/rocksdb/encryption.h b/include/rocksdb/encryption.h index b8f5e91e985..f1257d697ed 100644 --- a/include/rocksdb/encryption.h +++ b/include/rocksdb/encryption.h @@ -54,9 +54,17 @@ class KeyManager { FileEncryptionInfo* file_info) = 0; virtual Status NewFile(const std::string& fname, FileEncryptionInfo* file_info) = 0; + // Used with both file and directory. virtual Status DeleteFile(const std::string& fname) = 0; virtual Status LinkFile(const std::string& src_fname, const std::string& dst_fname) = 0; + // Provide additional hint of physical file when the key name doesn't map to + // one. A typical use case of this is atomically deleting a directory by + // renaming it first. + virtual Status DeleteFileExt(const std::string& fname, + const std::string& /*physical_fname*/) { + return DeleteFile(fname); + } }; // An Env with underlying files being encrypted. It holds a reference to an @@ -96,6 +104,8 @@ class KeyManagedEncryptedEnv : public EnvWrapper { Status RenameFile(const std::string& src_fname, const std::string& dst_fname) override; + Status DeleteDir(const std::string& dname) override; + private: const std::shared_ptr key_manager_; const std::shared_ptr provider_; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 787ed206ae8..063c51071cc 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -487,6 +487,8 @@ struct MemTableInfo { // memtable. It can then be assumed that any write with a larger(or equal) // sequence number will be present in this memtable or a later memtable. SequenceNumber earliest_seqno; + // The largest sequence number of writes in this memtable. 
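// Illustrative sketch (not part of the patch): a caller-side PostWriteCallback
// for the new Write() overload above. It records the sequence number assigned
// to the batch; the callback runs on the write thread after the write
// executes, mirroring the SimpleCallback helper in db_write_test.cc earlier in
// this patch.
#include <atomic>

class RecordSeqCallback : public PostWriteCallback {
 public:
  void Callback(SequenceNumber seq) override {
    seq_.store(seq, std::memory_order_relaxed);
  }
  SequenceNumber seq() const { return seq_.load(std::memory_order_relaxed); }

 private:
  std::atomic<SequenceNumber> seq_{0};
};

// Assumed usage with an open DB `db` and a populated WriteBatch `batch`:
//   RecordSeqCallback cb;
//   WriteOptions wo;
//   wo.disableWAL = true;                   // WAL-less writes, as in the tests
//   Status s = db->Write(wo, &batch, &cb);
//   // On success, cb.seq() holds the sequence assigned to the batch.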
+ SequenceNumber largest_seqno; // Total number of entries in memtable uint64_t num_entries; // Total number of deletes in memtable diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index a33f8eea4bb..fffbe8bd1ba 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -349,6 +349,13 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint32_t memtable_max_range_deletions = 0; + // Column family based write buffer manager, if this is set, this column + // facmily will not report memtable memory usage to the write buffer manager + // in DBImpl. + // + // Default: null + std::shared_ptr cf_write_buffer_manager = nullptr; + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -1885,7 +1892,23 @@ struct FlushOptions { // is performed by someone else (foreground call or background thread). // Default: false bool allow_write_stall; - FlushOptions() : wait(true), allow_write_stall(false) {} + // Only flush memtable if it has the expected oldest key time. + // This option is ignored for atomic flush. Zero means disabling the check. + // Default: 0 + uint64_t expected_oldest_key_time; + // Abort flush if compaction is disabled via `DisableManualCompaction`. + // Default: false + bool check_if_compaction_disabled; + // Used by RocksDB internally. + // Default: false + bool _write_stopped; + + FlushOptions() + : wait(true), + allow_write_stall(false), + expected_oldest_key_time(0), + check_if_compaction_disabled(false), + _write_stopped(false) {} }; // Create a Logger from provided DBOptions @@ -2208,4 +2231,15 @@ struct WaitForCompactOptions { std::chrono::microseconds timeout = std::chrono::microseconds::zero(); }; +struct MergeInstanceOptions { + // Whether to merge memtable. WAL must be empty to perform a memtable merge. + // Either write with disableWAL=true, or flush memtables before merge. + bool merge_memtable = false; + // Whether or not writes to source DBs are still allowed after the merge. + // Some optimizations are possible only with this flag set to false. + bool allow_source_write = true; + // No limit if negative. + int max_preload_files = 16; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 0d7eb59499e..b105bf3e7d8 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -145,6 +145,7 @@ class PinnableSlice : public Slice, public Cleanable { // No copy constructor and copy assignment allowed. 
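// Illustrative sketch (not part of the patch): opening a DB where one column
// family opts out of the DB-wide accounting via the new
// cf_write_buffer_manager option. The path, CF name and 64 MB threshold are
// assumptions; assumes the rocksdb headers and namespace are in scope.
Status OpenWithPerCfManager(DB** db, std::vector<ColumnFamilyHandle*>* handles) {
  Options db_opts;
  db_opts.create_if_missing = true;
  db_opts.create_missing_column_families = true;

  ColumnFamilyOptions raft_cf;
  // Memtables of this CF are accounted by a dedicated manager instead of the
  // DB-wide one set in DBOptions::write_buffer_manager.
  raft_cf.cf_write_buffer_manager =
      std::make_shared<WriteBufferManager>(64 << 20);

  std::vector<ColumnFamilyDescriptor> cfs;
  cfs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
  cfs.emplace_back("raft", raft_cf);
  return DB::Open(db_opts, "/tmp/per_cf_wbm_example", cfs, handles, db);
}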
PinnableSlice(PinnableSlice&) = delete; + PinnableSlice(const PinnableSlice&) = delete; PinnableSlice& operator=(PinnableSlice&) = delete; inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 86e1477a4f5..c5854e1ab78 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -258,8 +258,10 @@ class StackableDB : public DB { return db_->Merge(options, column_family, key, ts, value); } - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { - return db_->Write(opts, updates); + using DB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates, + PostWriteCallback* callback) override { + return db_->Write(opts, updates, callback); } using DB::NewIterator; diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index 61e75c8888e..4d840961cc4 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -22,6 +22,8 @@ namespace ROCKSDB_NAMESPACE { class CacheReservationManager; +class DB; +class ColumnFamilyHandle; // Interface to block and signal DB instances, intended for RocksDB // internal use only. Each DB instance contains ptr to StallInterface. @@ -37,34 +39,42 @@ class StallInterface { class WriteBufferManager final { public: // Parameters: - // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. - // memory_usage() won't be valid and ShouldFlush() will always return true. + // - flush_size: When the total size of mutable memtables exceeds this limit, + // the largest one will be frozen and scheduled for flush. Disabled when 0. // - // cache_: if `cache` is provided, we'll put dummy entries in the cache and - // cost the memory allocated to the cache. It can be used even if _buffer_size - // = 0. + // Immutable memtables are excluded for this reason: RocksDB always schedule + // a flush for newly created immutable memtable. We can consider them evicted + // from memory if flush bandwidth is sufficient. // - // allow_stall: if set true, it will enable stalling of writes when - // memory_usage() exceeds buffer_size. It will wait for flush to complete and - // memory usage to drop down. - explicit WriteBufferManager(size_t _buffer_size, + // It's an undefined behavior to enable/disable flush limit after the manager + // has been used by a DB instance. + // + // - stall_ratio: When the total size of memtables exceeds ratio*flush_size, + // user writes will be delayed. Disabled when smaller than 1. + // + // - flush_oldest_first: By default we freeze the largest mutable memtable + // when `flush_size` is triggered. By enabling this flag, the oldest mutable + // memtable will be frozen instead. + // + // - cache: if `cache` is provided, memtable memory will be charged as a + // dummy entry This is useful to keep the memory sum of both memtable and + // block cache under control. + explicit WriteBufferManager(size_t flush_size, std::shared_ptr cache = {}, - bool allow_stall = false); + float stall_ratio = 0.0, + bool flush_oldest_first = false); // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; ~WriteBufferManager(); - // Returns true if buffer_limit is passed to limit the total memory usage and - // is greater than 0. 
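// Illustrative sketch (not part of the patch): the main configurations of the
// reworked constructor documented above. Sizes and the cache are arbitrary;
// assumes the rocksdb headers and namespace are in scope.
auto cache = NewLRUCache(1 << 30);

// Flush-only: freeze the largest mutable memtable once mutable memtables
// exceed 512 MB in total; never stall user writes (stall_ratio < 1).
auto flush_only = std::make_shared<WriteBufferManager>(512 << 20);

// Same flush threshold, but additionally delay user writes once total
// memtable memory reaches 1.5 * flush_size.
auto with_stall =
    std::make_shared<WriteBufferManager>(512 << 20, nullptr, 1.5f);

// Freeze the oldest mutable memtable instead of the largest, and charge
// memtable memory to the block cache as dummy entries.
auto oldest_first = std::make_shared<WriteBufferManager>(
    512 << 20, cache, 0.0f, /*flush_oldest_first=*/true);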
- bool enabled() const { return buffer_size() > 0; } - - // Returns true if pointer to cache is passed. - bool cost_to_cache() const { return cache_res_mgr_ != nullptr; } + // Returns true if a non-zero buffer_limit is passed to limit the total + // memory usage or cache is provided to charge write buffer memory. + bool enabled() const { return flush_size() > 0 || cache_res_mgr_ != nullptr; } // Returns the total memory used by memtables. - // Only valid if enabled() + // Only valid if enabled(). size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); } @@ -76,45 +86,29 @@ class WriteBufferManager final { size_t dummy_entries_in_cache_usage() const; - // Returns the buffer_size. - size_t buffer_size() const { - return buffer_size_.load(std::memory_order_relaxed); + // Returns the flush_size. + size_t flush_size() const { + return flush_size_.load(std::memory_order_relaxed); } - // REQUIRED: `new_size` > 0 - void SetBufferSize(size_t new_size) { - assert(new_size > 0); - buffer_size_.store(new_size, std::memory_order_relaxed); - mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); - // Check if stall is active and can be ended. - MaybeEndWriteStall(); + size_t stall_size() const { + return static_cast(flush_size() * stall_ratio_); } - void SetAllowStall(bool new_allow_stall) { - allow_stall_.store(new_allow_stall, std::memory_order_relaxed); - MaybeEndWriteStall(); + void SetFlushSize(size_t new_size); + + void SetFlushOldestFirst(bool v) { + flush_oldest_first_.store(v, std::memory_order_relaxed); } // Below functions should be called by RocksDB internally. - // Should only be called from write thread - bool ShouldFlush() const { - if (enabled()) { - if (mutable_memtable_memory_usage() > - mutable_limit_.load(std::memory_order_relaxed)) { - return true; - } - size_t local_size = buffer_size(); - if (memory_usage() >= local_size && - mutable_memtable_memory_usage() >= local_size / 2) { - // If the memory exceeds the buffer size, we trigger more aggressive - // flush. But if already more than half memory is being flushed, - // triggering more flush may not help. We will hold it instead. - return true; - } - } - return false; - } + // This handle is the same as the one created by `DB::Open` or + // `DB::CreateColumnFamily`. + // Must be called not holding db mutex and not inside write thread. + // `UnregisterColumnFamily()` must be called by DB before the handle is + // destroyed. + void RegisterColumnFamily(DB* db, ColumnFamilyHandle* cf); // Returns true if total memory usage exceeded buffer_size. // We stall the writes untill memory_usage drops below buffer_size. When the @@ -124,22 +118,18 @@ class WriteBufferManager final { // // Should only be called by RocksDB internally . bool ShouldStall() const { - if (!allow_stall_.load(std::memory_order_relaxed) || !enabled()) { + if (!allow_stall_ || flush_size() == 0) { return false; } - - return IsStallActive() || IsStallThresholdExceeded(); + return is_stall_active() || is_stall_threshold_exceeded(); } + // Called during `DB::Close`. + // Must be called not holding db mutex and not inside write thread. + void UnregisterDB(DB* db); - // Returns true if stall is active. - bool IsStallActive() const { - return stall_active_.load(std::memory_order_relaxed); - } - - // Returns true if stalling condition is met. - bool IsStallThresholdExceeded() const { - return memory_usage() >= buffer_size_; - } + // Called during `DestroyColumnFamilyHandle`. 
+ // Must be called not holding db mutex and not inside write thread. + void UnregisterColumnFamily(ColumnFamilyHandle* cf); void ReserveMem(size_t mem); @@ -149,6 +139,36 @@ class WriteBufferManager final { void FreeMem(size_t mem); + // Whether the DB writer should call `MaybeFlush` before write. + bool ShouldFlush() { + size_t local_size = flush_size(); + return local_size > 0 && mutable_memtable_memory_usage() >= local_size; + } + + // Must be called without holding db mutex. When called in write thread, + // must pass in the pointer to the db. + void MaybeFlush(DB* this_db) { + if (sentinels_mu_.try_lock()) { + MaybeFlushLocked(this_db); + sentinels_mu_.unlock(); + } + } + + // Must ensure that the mutex of all dbs except this_db are not held. If + // this_db is not nullptr, the mutex of it must be held. + void MaybeFlushLocked(DB* this_db = nullptr); + + // Returns true if stall is active. + bool is_stall_active() const { + return stall_active_.load(std::memory_order_relaxed); + } + + // Returns true if stalling condition is met. Only valid if buffer_size_ is + // non-zero. + bool is_stall_threshold_exceeded() const { + return memory_usage() >= stall_size(); + } + // Add the DB instance to the queue and block the DB. // Should only be called by RocksDB internally. void BeginWriteStall(StallInterface* wbm_stall); @@ -157,26 +177,41 @@ class WriteBufferManager final { // signal them to continue. void MaybeEndWriteStall(); - void RemoveDBFromQueue(StallInterface* wbm_stall); + // Called when DB instance is closed. + void RemoveFromStallQueue(StallInterface* wbm_stall); private: - std::atomic buffer_size_; - std::atomic mutable_limit_; + struct WriteBufferSentinel { + DB* db; + ColumnFamilyHandle* cf; + }; + // Protected by `sentinels_mu_`. + std::list> sentinels_; + std::mutex sentinels_mu_; + + // Shared by flush_size limit and cache charging. + // When cache charging is enabled, this is updated under cache_res_mgr_mu_. std::atomic memory_used_; - // Memory that hasn't been scheduled to free. + + std::atomic flush_size_; + // Only used when flush_size is non-zero. std::atomic memory_active_; - std::shared_ptr cache_res_mgr_; - // Protects cache_res_mgr_ - std::mutex cache_res_mgr_mu_; + std::atomic flush_oldest_first_; + const bool allow_stall_; + const float stall_ratio_; std::list queue_; // Protects the queue_ and stall_active_. - std::mutex mu_; - std::atomic allow_stall_; - // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() - // while holding mu_, but it can be read without a lock. + std::mutex stall_mu_; + // Value should only be changed by BeginWriteStall() and + // MaybeEndWriteStall() while holding mu_, but it can be read without a + // lock. 
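// Illustrative sketch (not part of the patch): how the registration and flush
// hooks above fit together, per their comments. RocksDB calls these
// internally; the sequence is spelled out here only to make the contract
// explicit. `wbm`, `db` and `cf` are assumptions.
void IllustrateWbmContract(WriteBufferManager* wbm, DB* db,
                           ColumnFamilyHandle* cf) {
  // After DB::Open / CreateColumnFamily: make the CF a flush candidate.
  wbm->RegisterColumnFamily(db, cf);

  // On the write path: cheap threshold check first, then pick and schedule a
  // non-blocking flush of the best candidate. Passing `db` tells the manager
  // which instance is currently inside its write thread.
  if (wbm->ShouldFlush()) {
    wbm->MaybeFlush(db);
  }

  // Before destroying the handle, and during DB::Close respectively.
  wbm->UnregisterColumnFamily(cf);
  wbm->UnregisterDB(db);
}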
std::atomic stall_active_; + std::shared_ptr cache_res_mgr_; + // Protects cache_res_mgr_ + std::mutex cache_res_mgr_mu_; + void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); }; diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index d780df0bf3d..d7fdd40958a 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -25,8 +25,7 @@ AllocTracker::~AllocTracker() { FreeMem(); } void AllocTracker::Allocate(size_t bytes) { assert(write_buffer_manager_ != nullptr); - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + if (write_buffer_manager_->enabled()) { bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); write_buffer_manager_->ReserveMem(bytes); } @@ -34,8 +33,7 @@ void AllocTracker::Allocate(size_t bytes) { void AllocTracker::DoneAllocating() { if (write_buffer_manager_ != nullptr && !done_allocating_) { - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + if (write_buffer_manager_->enabled()) { write_buffer_manager_->ScheduleFreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { @@ -50,8 +48,7 @@ void AllocTracker::FreeMem() { DoneAllocating(); } if (write_buffer_manager_ != nullptr && !freed_) { - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + if (write_buffer_manager_->enabled()) { write_buffer_manager_->FreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index ce1789c20d6..2dae1b75531 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -14,20 +14,24 @@ #include "cache/cache_entry_roles.h" #include "cache/cache_reservation_manager.h" #include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/options.h" #include "rocksdb/status.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { -WriteBufferManager::WriteBufferManager(size_t _buffer_size, +WriteBufferManager::WriteBufferManager(size_t _flush_size, std::shared_ptr cache, - bool allow_stall) - : buffer_size_(_buffer_size), - mutable_limit_(buffer_size_ * 7 / 8), - memory_used_(0), + float stall_ratio, + bool flush_oldest_first) + : memory_used_(0), + flush_size_(_flush_size), memory_active_(0), - cache_res_mgr_(nullptr), - allow_stall_(allow_stall), - stall_active_(false) { + flush_oldest_first_(flush_oldest_first), + allow_stall_(stall_ratio >= 1.0), + stall_ratio_(stall_ratio), + stall_active_(false), + cache_res_mgr_(nullptr) { if (cache) { // Memtable's memory usage tends to fluctuate frequently // therefore we set delayed_decrease = true to save some dummy entry @@ -40,7 +44,7 @@ WriteBufferManager::WriteBufferManager(size_t _buffer_size, WriteBufferManager::~WriteBufferManager() { #ifndef NDEBUG - std::unique_lock lock(mu_); + std::unique_lock lock(stall_mu_); assert(queue_.empty()); #endif } @@ -53,13 +57,55 @@ std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const { } } +void WriteBufferManager::SetFlushSize(size_t new_size) { + if (flush_size_.exchange(new_size, std::memory_order_relaxed) > new_size) { + // Threshold is decreased. We must make sure all outstanding memtables + // are flushed. + std::lock_guard lock(sentinels_mu_); + auto max_retry = sentinels_.size(); + while ((max_retry--) && ShouldFlush()) { + MaybeFlushLocked(); + } + } else { + // Check if stall is active and can be ended. 
+ MaybeEndWriteStall(); + } +} + +void WriteBufferManager::RegisterColumnFamily(DB* db, ColumnFamilyHandle* cf) { + assert(db != nullptr); + auto sentinel = std::make_shared(); + sentinel->db = db; + sentinel->cf = cf; + std::lock_guard lock(sentinels_mu_); + MaybeFlushLocked(); + sentinels_.push_back(sentinel); +} + +void WriteBufferManager::UnregisterDB(DB* db) { + std::lock_guard lock(sentinels_mu_); + sentinels_.remove_if([=](const std::shared_ptr& s) { + return s->db == db; + }); + MaybeFlushLocked(); +} + +void WriteBufferManager::UnregisterColumnFamily(ColumnFamilyHandle* cf) { + std::lock_guard lock(sentinels_mu_); + sentinels_.remove_if([=](const std::shared_ptr& s) { + return s->cf == cf; + }); + MaybeFlushLocked(); +} + void WriteBufferManager::ReserveMem(size_t mem) { + size_t local_size = flush_size(); if (cache_res_mgr_ != nullptr) { ReserveMemWithCache(mem); - } else if (enabled()) { + } else if (local_size > 0) { memory_used_.fetch_add(mem, std::memory_order_relaxed); } - if (enabled()) { + if (local_size > 0) { memory_active_.fetch_add(mem, std::memory_order_relaxed); } } @@ -84,7 +130,7 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { } void WriteBufferManager::ScheduleFreeMem(size_t mem) { - if (enabled()) { + if (flush_size() > 0) { memory_active_.fetch_sub(mem, std::memory_order_relaxed); } } @@ -92,7 +138,7 @@ void WriteBufferManager::ScheduleFreeMem(size_t mem) { void WriteBufferManager::FreeMem(size_t mem) { if (cache_res_mgr_ != nullptr) { FreeMemWithCache(mem); - } else if (enabled()) { + } else if (flush_size() > 0) { memory_used_.fetch_sub(mem, std::memory_order_relaxed); } // Check if stall is active and can be ended. @@ -115,6 +161,87 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { s.PermitUncheckedError(); } +void WriteBufferManager::MaybeFlushLocked(DB* this_db) { + if (!ShouldFlush()) { + return; + } + // Have at least one candidate to flush with + // check_if_compaction_disabled=false when all others failed. + constexpr size_t kCandidateSize = 2; + // (score, age). + using Candidate = std::tuple; + auto cmp = [](const Candidate& a, const Candidate& b) { + return std::get<1>(a) <= std::get<1>(b); + }; + std::set candidates(cmp); + + for (auto& s : sentinels_) { + // TODO: move this calculation to a callback. + uint64_t current_score = 0; + uint64_t current_memory_bytes = std::numeric_limits::max(); + uint64_t oldest_time = std::numeric_limits::max(); + s->db->GetApproximateActiveMemTableStats(s->cf, ¤t_memory_bytes, + &oldest_time); + if (flush_oldest_first_.load(std::memory_order_relaxed)) { + // Convert oldest to highest score. + current_score = std::numeric_limits::max() - oldest_time; + } else { + current_score = current_memory_bytes; + } + // A very mild penalty for too many L0 files. + uint64_t level0; + // 3 is to optimize the frequency of getting options, which uses mutex. 
+ if (s->db->GetIntProperty(DB::Properties::kNumFilesAtLevelPrefix + "0", + &level0) && + level0 >= 3) { + auto opts = s->db->GetOptions(s->cf); + if (opts.level0_file_num_compaction_trigger > 0 && + level0 >= + static_cast(opts.level0_file_num_compaction_trigger)) { + auto diff = level0 - static_cast( + opts.level0_file_num_compaction_trigger); + // 0->2, +1->4, +2->8, +3->12, +4->18 + uint64_t factor = (diff + 2) * (diff + 2) / 2; + if (factor > 100) { + factor = 100; + } + current_score = current_score * (100 - factor) / factor; + } + } + candidates.insert({s.get(), current_score, oldest_time}); + if (candidates.size() > kCandidateSize) { + candidates.erase(candidates.begin()); + } + } + + // We only flush at most one column family at a time. + // This is enough to keep size under control except when flush_size is + // dynamically decreased. That case is managed in `SetFlushSize`. + auto candidate = candidates.rbegin(); + while (candidate != candidates.rend()) { + auto sentinel = std::get<0>(*candidate); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + flush_opts.wait = false; + flush_opts._write_stopped = (sentinel->db == this_db); + flush_opts.expected_oldest_key_time = std::get<2>(*candidate); + candidate++; + if (candidate != candidates.rend()) { + // Don't check it for the last candidate. Otherwise we could end up + // never progressing. + flush_opts.check_if_compaction_disabled = true; + } + auto s = sentinel->db->Flush(flush_opts, sentinel->cf); + if (s.ok()) { + return; + } + auto opts = sentinel->db->GetDBOptions(); + ROCKS_LOG_WARN(opts.info_log, "WriteBufferManager fails to flush: %s", + s.ToString().c_str()); + // Fallback to the next best candidate. + } +} + void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { assert(wbm_stall != nullptr); @@ -122,7 +249,7 @@ void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { std::list new_node = {wbm_stall}; { - std::unique_lock lock(mu_); + std::unique_lock lock(stall_mu_); // Verify if the stall conditions are stil active. if (ShouldStall()) { stall_active_.store(true, std::memory_order_relaxed); @@ -140,15 +267,14 @@ void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { // Called when memory is freed in FreeMem or the buffer size has changed. void WriteBufferManager::MaybeEndWriteStall() { // Stall conditions have not been resolved. - if (allow_stall_.load(std::memory_order_relaxed) && - IsStallThresholdExceeded()) { + if (allow_stall_ && is_stall_threshold_exceeded()) { return; } // Perform all deallocations outside of the lock. std::list cleanup; - std::unique_lock lock(mu_); + std::unique_lock lock(stall_mu_); if (!stall_active_.load(std::memory_order_relaxed)) { return; // Nothing to do. } @@ -163,14 +289,14 @@ void WriteBufferManager::MaybeEndWriteStall() { cleanup = std::move(queue_); } -void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { +void WriteBufferManager::RemoveFromStallQueue(StallInterface* wbm_stall) { assert(wbm_stall != nullptr); // Deallocate the removed nodes outside of the lock. 
std::list cleanup; - if (enabled() && allow_stall_.load(std::memory_order_relaxed)) { - std::unique_lock lock(mu_); + if (allow_stall_) { + std::unique_lock lock(stall_mu_); for (auto it = queue_.begin(); it != queue_.end();) { auto next = std::next(it); if (*it == wbm_stall) { diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index c992d2eabcb..9f4c5c0164b 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -24,57 +24,19 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { wbf->ReserveMem(8 * 1024 * 1024); ASSERT_FALSE(wbf->ShouldFlush()); - // 90% of the hard limit will hit the condition - wbf->ReserveMem(1 * 1024 * 1024); + wbf->ReserveMem(2 * 1024 * 1024); ASSERT_TRUE(wbf->ShouldFlush()); // Scheduling for freeing will release the condition wbf->ScheduleFreeMem(1 * 1024 * 1024); ASSERT_FALSE(wbf->ShouldFlush()); - wbf->ReserveMem(2 * 1024 * 1024); - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->ScheduleFreeMem(4 * 1024 * 1024); - // 11MB total, 6MB mutable. hard limit still hit + // change size: 8M limit. + wbf->SetFlushSize(8 * 1024 * 1024); + // 9MB mutable. ASSERT_TRUE(wbf->ShouldFlush()); wbf->ScheduleFreeMem(2 * 1024 * 1024); - // 11MB total, 4MB mutable. hard limit stills but won't flush because more - // than half data is already being flushed. - ASSERT_FALSE(wbf->ShouldFlush()); - - wbf->ReserveMem(4 * 1024 * 1024); - // 15 MB total, 8MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->FreeMem(7 * 1024 * 1024); - // 8MB total, 8MB mutable. - ASSERT_FALSE(wbf->ShouldFlush()); - - // change size: 8M limit, 7M mutable limit - wbf->SetBufferSize(8 * 1024 * 1024); - // 8MB total, 8MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->ScheduleFreeMem(2 * 1024 * 1024); - // 8MB total, 6MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->FreeMem(2 * 1024 * 1024); - // 6MB total, 6MB mutable. - ASSERT_FALSE(wbf->ShouldFlush()); - - wbf->ReserveMem(1 * 1024 * 1024); - // 7MB total, 7MB mutable. - ASSERT_FALSE(wbf->ShouldFlush()); - - wbf->ReserveMem(1 * 1024 * 1024); - // 8MB total, 8MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->ScheduleFreeMem(1 * 1024 * 1024); - wbf->FreeMem(1 * 1024 * 1024); - // 7MB total, 7MB mutable. + // 7MB mutable. 
ASSERT_FALSE(wbf->ShouldFlush()); } @@ -123,7 +85,6 @@ TEST_F(ChargeWriteBufferTest, Basic) { ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_FALSE(wbf->ShouldFlush()); // Allocate another 41MB, memory_used_ = 52045KB wbf->ReserveMem(41 * 1024 * 1024); @@ -131,19 +92,6 @@ TEST_F(ChargeWriteBufferTest, Basic) { ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 204 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_TRUE(wbf->ShouldFlush()); - - ASSERT_TRUE(wbf->ShouldFlush()); - - // Schedule free 20MB, memory_used_ = 52045KB - // It will not cause any change in memory_used and cache cost - wbf->ScheduleFreeMem(20 * 1024 * 1024); - ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); - ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), - 204 * 256 * 1024 + kMetaDataChargeOverhead); - // Still need flush as the hard limit hits - ASSERT_TRUE(wbf->ShouldFlush()); // Free 20MB, memory_used_ = 31565KB // It will releae 80 dummy entries from cache since @@ -156,8 +104,6 @@ TEST_F(ChargeWriteBufferTest, Basic) { ASSERT_LT(cache->GetPinnedUsage(), 124 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_FALSE(wbf->ShouldFlush()); - // Free 16KB, memory_used_ = 31549KB // It will not release any dummy entry since memory_used_ >= // dummy_entries_in_cache_usage * (3/4) @@ -214,8 +160,6 @@ TEST_F(ChargeWriteBufferTest, BasicWithNoBufferSizeLimit) { ASSERT_GE(cache->GetPinnedUsage(), 40 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 40 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_FALSE(wbf->ShouldFlush()); - // Free 9MB, memory_used_ = 1024KB // It will free 36 dummy entries wbf->FreeMem(9 * 1024 * 1024); diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index ced8597a9d6..f8a45d1f4aa 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -438,6 +438,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(std::shared_ptr)}, {offsetof(struct ColumnFamilyOptions, sst_partitioner_factory), sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, cf_write_buffer_manager), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(ColumnFamilyOptions)]; diff --git a/src.mk b/src.mk index 0bc1c2e398c..4a8e6fdf6ee 100644 --- a/src.mk +++ b/src.mk @@ -55,6 +55,7 @@ LIB_SOURCES = \ db/db_impl/db_impl_readonly.cc \ db/db_impl/db_impl_secondary.cc \ db/db_impl/db_impl_write.cc \ + db/db_impl/db_impl_merge.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ @@ -488,6 +489,7 @@ TEST_MAIN_SOURCES = \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ db/db_merge_operand_test.cc \ + db/db_merge_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ diff --git a/test_util/testutil.h b/test_util/testutil.h index 5a173ca40c0..6dfd649dfa1 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -85,6 +85,18 @@ class TestKeyManager : public encryption::KeyManager { Status DeleteFile(const std::string& fname) override { std::lock_guard l(mutex); file_set.erase(fname); + if (!fname.empty()) { + std::string copy = fname; + if (copy.back() != '/') { + copy.push_back('/'); + } + auto begin = file_set.lower_bound(copy); + auto end = begin; + while (end != file_set.end() && end->compare(0, copy.size(), copy) == 0) { 
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index e2f0b7bdbdd..c83260aff9e 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -196,8 +196,9 @@ class BlobDB : public StackableDB {
     return Status::NotSupported("Not supported operation in blob db.");
   }
 
-  virtual Status Write(const WriteOptions& opts,
-                       WriteBatch* updates) override = 0;
+  using rocksdb::StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates,
+                       PostWriteCallback* callback) override = 0;
 
   using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 2fa7ae898f5..1e73b42ddd9 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1003,7 +1003,8 @@ class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
   void LogData(const Slice& blob) override { batch_.PutLogData(blob); }
 };
 
-Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates,
+                         PostWriteCallback* callback) {
   StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
   RecordTick(statistics_, BLOB_DB_NUM_WRITE);
   uint32_t default_cf_id =
@@ -1021,7 +1022,7 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
   if (!s.ok()) {
     return s;
   }
-  return db_->Write(options, blob_inserter.batch());
+  return db_->Write(options, blob_inserter.batch(), callback);
 }
 
 Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index d491108d3e6..205c07e9c92 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -128,7 +128,8 @@ class BlobDBImpl : public BlobDB {
                  std::vector<std::string>* values) override;
 
   using BlobDB::Write;
-  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates,
+                       PostWriteCallback* callback) override;
 
   virtual Status Close() override;
diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index e1f09451309..48f6e159e2f 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -311,7 +311,11 @@ Status CheckpointImpl::ExportColumnFamily(
   s = db_->GetEnv()->CreateDir(tmp_export_dir);
 
   if (s.ok()) {
-    s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
+    auto opts = ROCKSDB_NAMESPACE::FlushOptions();
+    // In TiKV context: If tablet is to be destroyed, its background work will
+    // be paused. Manual flush can never make progress.
+    opts.check_if_compaction_disabled = true;
+    s = db_->Flush(opts, handle);
   }
 
   ColumnFamilyMetaData db_metadata;
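The checkpoint hunk above uses the FlushOptions::check_if_compaction_disabled field introduced elsewhere in this patch. A hedged sketch of the calling pattern follows: the intent, per the comment in the hunk, is that a manual flush gives up rather than waits forever once a tablet's background work has been paused. The helper name and error handling are illustrative only, and the exact status returned in that case is not shown in this patch.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch, not part of the patch: flush one column family, but ask the DB to
// check first whether manual compaction/flush has been disabled for it
// (e.g. a TiKV tablet that is about to be destroyed) instead of stalling.
rocksdb::Status FlushIfPossible(rocksdb::DB* db,
                                rocksdb::ColumnFamilyHandle* handle) {
  rocksdb::FlushOptions opts;
  opts.wait = true;
  // Field added by this patch; see the checkpoint hunk above for its use.
  opts.check_if_compaction_disabled = true;
  return db->Flush(opts, handle);
}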
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h
index 7bc718e9bdc..3bf6ff9a422 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.h
+++ b/utilities/transactions/optimistic_transaction_db_impl.h
@@ -81,12 +81,13 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
   // Range deletions also must not be snuck into `WriteBatch`es as they are
   // incompatible with `OptimisticTransactionDB`.
-  virtual Status Write(const WriteOptions& write_opts,
-                       WriteBatch* batch) override {
+  using OptimisticTransactionDB::Write;
+  virtual Status Write(const WriteOptions& write_opts, WriteBatch* batch,
+                       PostWriteCallback* callback) override {
     if (batch->HasDeleteRange()) {
       return Status::NotSupported();
     }
-    return OptimisticTransactionDB::Write(write_opts, batch);
+    return OptimisticTransactionDB::Write(write_opts, batch, callback);
   }
 
   OccValidationPolicy GetValidatePolicy() const { return validate_policy_; }
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index e4bff782658..089849cc1b5 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -547,7 +547,8 @@ Status DBWithTTLImpl::Merge(const WriteOptions& options,
   return st;
 }
 
-Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates,
+                            PostWriteCallback* callback) {
   class Handler : public WriteBatch::Handler {
    public:
     explicit Handler(SystemClock* clock) : clock_(clock) {}
@@ -590,7 +591,7 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
   if (!st.ok()) {
     return st;
   } else {
-    return db_->Write(opts, &(handler.updates_ttl));
+    return db_->Write(opts, &(handler.updates_ttl), callback);
   }
 }
 
diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h
index b125d79b067..fbc93cc11f8 100644
--- a/utilities/ttl/db_ttl_impl.h
+++ b/utilities/ttl/db_ttl_impl.h
@@ -75,7 +75,9 @@ class DBWithTTLImpl : public DBWithTTL {
                        ColumnFamilyHandle* column_family, const Slice& key,
                        const Slice& value) override;
 
-  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+  using StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates,
+                       PostWriteCallback* callback) override;
 
   using StackableDB::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions& _read_options,