From 405de0ec3dbb9007014d118ef2a973f3f4925239 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Mon, 7 Oct 2024 19:22:27 -0700 Subject: [PATCH] Raftstore v2 (#389) Signed-off-by: Spade A Signed-off-by: Yang Zhang Signed-off-by: SpadeA-Tang Co-authored-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- CMakeLists.txt | 2 + Makefile | 3 + TARGETS | 7 + db/c.cc | 18 +- db/c_test.c | 8 +- db/column_family.cc | 107 ++- db/column_family.h | 8 + db/compaction/compaction_iterator.cc | 15 +- db/compaction/compaction_iterator_test.cc | 34 + db/db_filesnapshot.cc | 6 +- db/db_flush_test.cc | 32 +- db/db_impl/compacted_db_impl.h | 5 +- db/db_impl/db_impl.cc | 48 +- db/db_impl/db_impl.h | 56 +- db/db_impl/db_impl_compaction_flush.cc | 62 +- db/db_impl/db_impl_debug.cc | 14 + db/db_impl/db_impl_merge.cc | 396 +++++++++++ db/db_impl/db_impl_open.cc | 43 ++ db/db_impl/db_impl_readonly.h | 5 +- db/db_impl/db_impl_secondary.h | 5 +- db/db_impl/db_impl_write.cc | 181 ++--- db/db_merge_test.cc | 647 ++++++++++++++++++ db/db_properties_test.cc | 8 +- db/db_test.cc | 4 +- db/db_test2.cc | 157 ++++- db/db_test_util.cc | 16 + db/db_test_util.h | 5 + db/db_write_buffer_manager_test.cc | 533 ++++++++++++--- db/db_write_test.cc | 97 +++ db/flush_job.cc | 11 +- db/flush_job.h | 5 +- db/memtable.cc | 75 +- db/memtable.h | 38 + db/memtable_list.cc | 17 + db/memtable_list.h | 3 + db/write_batch.cc | 6 + db/write_thread.cc | 2 + db/write_thread.h | 11 +- encryption/encryption.cc | 13 +- include/rocksdb/c.h | 8 +- include/rocksdb/compaction_filter.h | 26 +- include/rocksdb/db.h | 72 +- include/rocksdb/encryption.h | 10 + include/rocksdb/listener.h | 2 + include/rocksdb/options.h | 36 +- include/rocksdb/slice.h | 1 + include/rocksdb/utilities/stackable_db.h | 6 +- include/rocksdb/write_buffer_manager.h | 177 +++-- memtable/alloc_tracker.cc | 9 +- memtable/write_buffer_manager.cc | 166 ++++- memtable/write_buffer_manager_test.cc | 66 +- options/options_settable_test.cc | 2 + src.mk | 2 + test_util/testutil.h | 12 + utilities/blob_db/blob_db.h | 5 +- utilities/blob_db/blob_db_impl.cc | 5 +- utilities/blob_db/blob_db_impl.h | 3 +- utilities/checkpoint/checkpoint_impl.cc | 6 +- .../optimistic_transaction_db_impl.h | 7 +- utilities/ttl/db_ttl_impl.cc | 5 +- utilities/ttl/db_ttl_impl.h | 4 +- 61 files changed, 2808 insertions(+), 525 deletions(-) create mode 100644 db/db_impl/db_impl_merge.cc create mode 100644 db/db_merge_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index b913d921a0b..d17b07306bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -684,6 +684,7 @@ set(SOURCES db/db_impl/db_impl_experimental.cc db/db_impl/db_impl_readonly.cc db/db_impl/db_impl_secondary.cc + db/db_impl/db_impl_merge.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc @@ -1327,6 +1328,7 @@ if(WITH_TESTS) db/db_memtable_test.cc db/db_merge_operator_test.cc db/db_merge_operand_test.cc + db/db_merge_test.cc db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc diff --git a/Makefile b/Makefile index 90a394cb0df..8f393988548 100644 --- a/Makefile +++ b/Makefile @@ -1511,6 +1511,9 @@ db_merge_operator_test: $(OBJ_DIR)/db/db_merge_operator_test.o $(TEST_LIBRARY) $ db_merge_operand_test: $(OBJ_DIR)/db/db_merge_operand_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_merge_test: $(OBJ_DIR)/db/db_merge_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_options_test: $(OBJ_DIR)/db/db_options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 9a314821518..6ca67ffebea 100644 --- a/TARGETS 
+++ b/TARGETS @@ -58,6 +58,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/db_impl/db_impl_debug.cc", "db/db_impl/db_impl_experimental.cc", "db/db_impl/db_impl_files.cc", + "db/db_impl/db_impl_merge.cc", "db/db_impl/db_impl_open.cc", "db/db_impl/db_impl_readonly.cc", "db/db_impl/db_impl_secondary.cc", @@ -4862,6 +4863,12 @@ cpp_unittest_wrapper(name="db_merge_operator_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_merge_test", + srcs=["db/db_merge_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_options_test", srcs=["db/db_options_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/c.cc b/db/c.cc index f1597fe4750..4ddb4676095 100644 --- a/db/c.cc +++ b/db/c.cc @@ -4944,11 +4944,6 @@ bool rocksdb_write_buffer_manager_enabled(rocksdb_write_buffer_manager_t* wbm) { return wbm->rep->enabled(); } -bool rocksdb_write_buffer_manager_cost_to_cache( - rocksdb_write_buffer_manager_t* wbm) { - return wbm->rep->cost_to_cache(); -} - size_t rocksdb_write_buffer_manager_memory_usage( rocksdb_write_buffer_manager_t* wbm) { return wbm->rep->memory_usage(); @@ -4963,17 +4958,10 @@ size_t rocksdb_write_buffer_manager_dummy_entries_in_cache_usage( rocksdb_write_buffer_manager_t* wbm) { return wbm->rep->dummy_entries_in_cache_usage(); } -size_t rocksdb_write_buffer_manager_buffer_size( + +size_t rocksdb_write_buffer_manager_flush_size( rocksdb_write_buffer_manager_t* wbm) { - return wbm->rep->buffer_size(); -} -void rocksdb_write_buffer_manager_set_buffer_size( - rocksdb_write_buffer_manager_t* wbm, size_t new_size) { - wbm->rep->SetBufferSize(new_size); -} -ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( - rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall) { - wbm->rep->SetAllowStall(new_allow_stall); + return wbm->rep->flush_size(); } rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, diff --git a/db/c_test.c b/db/c_test.c index 66722049692..b9bee287f0f 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -3792,14 +3792,8 @@ int main(int argc, char** argv) { CheckCondition(true == rocksdb_write_buffer_manager_enabled(write_buffer_manager)); - CheckCondition(true == rocksdb_write_buffer_manager_cost_to_cache( - write_buffer_manager)); CheckCondition( - 200 == rocksdb_write_buffer_manager_buffer_size(write_buffer_manager)); - - rocksdb_write_buffer_manager_set_buffer_size(write_buffer_manager, 300); - CheckCondition( - 300 == rocksdb_write_buffer_manager_buffer_size(write_buffer_manager)); + 200 == rocksdb_write_buffer_manager_flush_size(write_buffer_manager)); rocksdb_write_buffer_manager_destroy(write_buffer_manager); rocksdb_cache_destroy(lru); diff --git a/db/column_family.cc b/db/column_family.cc index 3ac603da71d..bb9cb87796f 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -64,6 +64,9 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { for (auto& listener : cfd_->ioptions()->listeners) { listener->OnColumnFamilyHandleDeletionStarted(this); } + if (cfd_->write_buffer_mgr()) { + cfd_->write_buffer_mgr()->UnregisterColumnFamily(this); + } // Job id == 0 means that this is not our background process, but rather // user thread // Need to hold some shared pointers owned by the initial_cf_options @@ -1246,6 +1249,105 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( return status; } +Status ColumnFamilyData::GetMemtablesUserKeyRange(PinnableSlice* smallest, + PinnableSlice* largest, + bool* found) { + assert(smallest && largest && found); + Status s; + auto* ucmp = 
user_comparator(); + Arena arena; + ReadOptions read_opts; + read_opts.total_order_seek = true; + MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); + merge_iter_builder.AddIterator(mem_->NewIterator(read_opts, &arena)); + imm_.current()->AddIterators(read_opts, &merge_iter_builder, false); + ScopedArenaIterator mem_iter(merge_iter_builder.Finish()); + mem_iter->SeekToFirst(); + if (mem_iter->Valid()) { + auto ukey = mem_iter->user_key(); + if (!(*found) || ucmp->Compare(ukey, *smallest) < 0) { + smallest->PinSelf(ukey); + } + mem_iter->SeekToLast(); + assert(mem_iter->Valid()); + ukey = mem_iter->user_key(); + if (!(*found) || ucmp->Compare(*largest, ukey) < 0) { + largest->PinSelf(ukey); + } + *found = true; + } + + if (s.ok()) { + autovector memtables{mem_}; + imm_.ExportMemtables(&memtables); + for (auto* mem : memtables) { + auto* iter = + mem->NewRangeTombstoneIterator(read_opts, kMaxSequenceNumber, false); + if (iter != nullptr) { + iter->SeekToFirst(); + if (iter->Valid()) { + // It's already a user key. + auto ukey = iter->start_key(); + if (!(*found) || ucmp->Compare(ukey, *smallest) < 0) { + smallest->PinSelf(ukey); + } + iter->SeekToLast(); + assert(iter->Valid()); + // Get the end_key of all tombstones. + ukey = iter->end_key(); + if (!(*found) || ucmp->Compare(*largest, ukey) < 0) { + largest->PinSelf(ukey); + } + *found = true; + } + } + } + } + + return s; +} + +Status ColumnFamilyData::GetUserKeyRange(PinnableSlice* smallest, + PinnableSlice* largest, bool* found) { + assert(smallest && largest && found); + if (ioptions_.compaction_style != CompactionStyle::kCompactionStyleLevel) { + return Status::NotSupported("Unexpected compaction style"); + } + Status s = GetMemtablesUserKeyRange(smallest, largest, found); + if (!s.ok()) { + return s; + } + + VersionStorageInfo& vsi = *current()->storage_info(); + auto* ucmp = user_comparator(); + for (const auto& f : vsi.LevelFiles(0)) { + Slice start = f->smallest.user_key(); + Slice end = f->largest.user_key(); + if (!(*found) || ucmp->Compare(start, *smallest) < 0) { + smallest->PinSelf(start); + } + if (!(*found) || ucmp->Compare(*largest, end) < 0) { + largest->PinSelf(end); + } + *found = true; + } + for (int level = 1; level < vsi.num_levels(); ++level) { + const auto& level_files = vsi.LevelFiles(level); + if (level_files.size() > 0) { + Slice start = level_files.front()->smallest.user_key(); + Slice end = level_files.back()->largest.user_key(); + if (!(*found) || ucmp->Compare(start, *smallest) < 0) { + smallest->PinSelf(start); + } + if (!(*found) || ucmp->Compare(*largest, end) < 0) { + largest->PinSelf(end); + } + *found = true; + } + } + return s; +} + const int ColumnFamilyData::kCompactAllLevels = -1; const int ColumnFamilyData::kCompactToBaseLevel = -2; @@ -1733,8 +1835,11 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, const ColumnFamilyOptions& options) { assert(column_families_.find(name) == column_families_.end()); + auto* write_buffer_manager = options.cf_write_buffer_manager != nullptr + ? 
options.cf_write_buffer_manager.get() + : write_buffer_manager_; ColumnFamilyData* new_cfd = new ColumnFamilyData( - id, name, dummy_versions, table_cache_, write_buffer_manager_, options, + id, name, dummy_versions, table_cache_, write_buffer_manager, options, *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_, db_id_, db_session_id_); column_families_.insert({name, id}); diff --git a/db/column_family.h b/db/column_family.h index c0b85fede03..7a0a75ace91 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -406,6 +406,14 @@ class ColumnFamilyData { SuperVersion* super_version, bool allow_data_in_errors, bool* overlap); + // Get user key range of memtables. Tombstones are counted. + Status GetMemtablesUserKeyRange(PinnableSlice* smallest, + PinnableSlice* largest, bool* found); + + // Get user key range of all data. Tombstones are counted. + Status GetUserKeyRange(PinnableSlice* smallest, PinnableSlice* largest, + bool* found); + // A flag to tell a manual compaction is to compact all levels together // instead of a specific level. static const int kCompactAllLevels; diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 85d1c039bd3..81e38be352b 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -229,17 +229,20 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, } if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && - ikey_.type != kTypeWideColumnEntity) { + ikey_.type != kTypeWideColumnEntity && ikey_.type != kTypeDeletion) { return true; } CompactionFilter::Decision decision = CompactionFilter::Decision::kUndetermined; - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : ikey_.type == kTypeBlobIndex - ? CompactionFilter::ValueType::kBlobIndex - : CompactionFilter::ValueType::kWideColumnEntity; + CompactionFilter::ValueType value_type = CompactionFilter::ValueType::kValue; + if (ikey_.type == kTypeBlobIndex) { + value_type = CompactionFilter::ValueType::kBlobIndex; + } else if (ikey_.type == kTypeWideColumnEntity) { + value_type = CompactionFilter::ValueType::kWideColumnEntity; + } else if (ikey_.type == kTypeDeletion) { + value_type = CompactionFilter::ValueType::kDeletion; + } // Hack: pass internal key to BlobIndexCompactionFilter since it needs // to get sequence number. 
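
As a rough illustration of the compaction_iterator.cc change above (which now forwards kTypeDeletion entries to the compaction filter as CompactionFilter::ValueType::kDeletion): the sketch below reuses the UnsafeFilter()/Decision::kRemoveAndSkipUntil hooks exactly as exercised by the RemoveAllSingleDeletes test added further down; the GcDeletionFilter name is purely hypothetical and the class is not part of this patch.

#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Drops point-deletion tombstones and skips any older versions of the same
// user key, mirroring the test filter introduced by this patch.
class GcDeletionFilter : public rocksdb::CompactionFilter {
 public:
  Decision UnsafeFilter(int /*level*/, const rocksdb::Slice& key,
                        ValueType type,
                        const rocksdb::Slice& /*existing_value*/,
                        std::string* /*new_value*/,
                        std::string* skip_until) const override {
    if (type == ValueType::kDeletion) {
      // Skip until just past this user key so the tombstone and all older
      // entries for the same key are dropped together.
      *skip_until = key.ToString();
      skip_until->back() += 1;
      return Decision::kRemoveAndSkipUntil;
    }
    return Decision::kKeep;
  }

  const char* Name() const override { return "GcDeletionFilter"; }
};
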
diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 699e629693d..7b4e8985024 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -719,6 +719,40 @@ TEST_P(CompactionIteratorTest, SingleMergeOperand) { ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); } +TEST_P(CompactionIteratorTest, RemoveAllSingleDeletes) { + struct Filter : public CompactionFilter { + Decision UnsafeFilter(int /*level*/, const Slice& key, ValueType t, + const Slice& /*existing_value*/, + std::string* /*new_value*/, + std::string* skip_until) const override { + if (t == ValueType::kDeletion) { + *skip_until = key.ToString(); + skip_until->back() += 1; + filtered += 1; + return Decision::kRemoveAndSkipUntil; + } + return Decision::kKeep; + } + + const char* Name() const override { + return "CompactionIteratorTest.SingleDelete::Filter"; + } + mutable size_t filtered = 0; + }; + + Filter filter; + InitIterators( + {test::KeyStr("a", 70, kTypeDeletion), test::KeyStr("a", 50, kTypeValue), + test::KeyStr("c", 70, kTypeDeletion), + test::KeyStr("c", 50, kTypeDeletion)}, + {"", "a", "", ""}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + nullptr, &filter); + + c_iter_->SeekToFirst(); + ASSERT_TRUE(!c_iter_->Valid()); + ASSERT_EQ(filter.filtered, 2); +} + // In bottommost level, values earlier than earliest snapshot can be output // with sequence = 0. TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 40e7ac15548..988996ba96d 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // - #include #include #include @@ -29,7 +28,10 @@ namespace ROCKSDB_NAMESPACE { Status DBImpl::FlushForGetLiveFiles() { - return DBImpl::FlushAllColumnFamilies(FlushOptions(), + FlushOptions flush_options; + flush_options.allow_write_stall = true; + flush_options.check_if_compaction_disabled = true; + return DBImpl::FlushAllColumnFamilies(flush_options, FlushReason::kGetLiveFiles); } diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index b2c9f4e67c3..3b943cfd4fa 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -740,9 +740,11 @@ class TestFlushListener : public EventListener { DBFlushTest* test_; }; +// Disabled, because of +// https://github.com/tikv/rocksdb/pull/389/commits/cc433939ed937a82d0a0ccad1280d5907b048654 TEST_F( DBFlushTest, - FixUnrecoverableWriteDuringAtomicFlushWaitUntilFlushWouldNotStallWrites) { + DISABLED_FixUnrecoverableWriteDuringAtomicFlushWaitUntilFlushWouldNotStallWrites) { Options options = CurrentOptions(); options.atomic_flush = true; @@ -2012,6 +2014,13 @@ TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { } } + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& info) override { + ASSERT_LE(info.smallest_seqno, info.largest_seqno); + if (info.largest_seqno != seq1) { + ASSERT_EQ(info.largest_seqno, seq2); + } + } + void CheckFlushResultCommitted(DB* db, SequenceNumber seq) { DBImpl* db_impl = static_cast_with_check(db); InstrumentedMutex* mutex = db_impl->mutex(); @@ -3189,6 +3198,27 @@ TEST_P(DBAtomicFlushTest, NoWaitWhenWritesStopped) { SyncPoint::GetInstance()->DisableProcessing(); } +TEST_P(DBAtomicFlushTest, DisableManualCompaction) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + 
ASSERT_OK(dbfull()->PauseBackgroundWork()); + ASSERT_OK(Put(0, "key00", "value00")); + ASSERT_OK(Put(1, "key10", "value10")); + dbfull()->DisableManualCompaction(); + FlushOptions flush_opts; + flush_opts.wait = true; + flush_opts.check_if_compaction_disabled = true; + ASSERT_TRUE(dbfull()->Flush(flush_opts, handles_).IsIncomplete()); + ASSERT_OK(Put(0, "key01", "value01")); + ASSERT_OK(db_->ContinueBackgroundWork()); + dbfull()->EnableManualCompaction(); + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + Close(); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index e1c605e420b..cf8702895f7 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -77,8 +77,9 @@ class CompactedDBImpl : public DBImpl { const Slice& /*key*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + using DBImpl::Write; + virtual Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/, + PostWriteCallback* /*callback*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::CompactRange; diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 297c6aceb76..417304e1f17 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -395,6 +395,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { FlushOptions flush_opts; // We allow flush to stall write since we are trying to resume from error. flush_opts.allow_write_stall = true; + flush_opts.check_if_compaction_disabled = true; s = FlushAllColumnFamilies(flush_opts, context.flush_reason); } if (!s.ok()) { @@ -491,7 +492,10 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { if (!shutting_down_.load(std::memory_order_acquire) && has_unpersisted_data_.load(std::memory_order_relaxed) && !mutable_db_options_.avoid_flush_during_shutdown) { - s = DBImpl::FlushAllColumnFamilies(FlushOptions(), FlushReason::kShutDown); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + flush_opts.check_if_compaction_disabled = true; + s = DBImpl::FlushAllColumnFamilies(flush_opts, FlushReason::kShutDown); s.PermitUncheckedError(); //**TODO: What to do on error? } @@ -655,6 +659,14 @@ Status DBImpl::CloseHelper() { delete txn_entry.second; } + mutex_.Unlock(); + // We can only access cf_based_write_buffer_manager_ before versions_.reset(), + // after which all cf write buffer managers will be freed. + for (auto m : cf_based_write_buffer_manager_) { + m->UnregisterDB(this); + } + mutex_.Lock(); + // versions need to be destroyed before table_cache since it can hold // references to table_cache. 
versions_.reset(); @@ -684,7 +696,10 @@ Status DBImpl::CloseHelper() { } if (write_buffer_manager_ && wbm_stall_) { - write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + write_buffer_manager_->RemoveFromStallQueue(wbm_stall_.get()); + } + if (write_buffer_manager_) { + write_buffer_manager_->UnregisterDB(this); } IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */); @@ -3647,6 +3662,22 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, if (s.ok()) { NewThreadStatusCfInfo( static_cast_with_check(*handle)->cfd()); + if (cf_options.cf_write_buffer_manager != nullptr) { + auto* write_buffer_manager = cf_options.cf_write_buffer_manager.get(); + bool exist = false; + for (auto m : cf_based_write_buffer_manager_) { + if (m == write_buffer_manager) { + exist = true; + } + } + if (!exist) { + return Status::NotSupported( + "New cf write buffer manager is not supported after Open"); + } + write_buffer_manager->RegisterColumnFamily(this, *handle); + } else if (write_buffer_manager_ != nullptr) { + write_buffer_manager_->RegisterColumnFamily(this, *handle); + } } return s; } @@ -4635,6 +4666,18 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, ReturnAndCleanupSuperVersion(cfd, sv); } +void DBImpl::GetApproximateActiveMemTableStats( + ColumnFamilyHandle* column_family, uint64_t* const memory_bytes, + uint64_t* const oldest_key_time) { + auto* cf_impl = static_cast(column_family); + if (memory_bytes) { + *memory_bytes = cf_impl->cfd()->mem()->ApproximateMemoryUsageFast(); + } + if (oldest_key_time) { + *oldest_key_time = cf_impl->cfd()->mem()->ApproximateOldestKeyTime(); + } +} + Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes) { @@ -5844,6 +5887,7 @@ Status DBImpl::IngestExternalFiles( if (status.ok() && at_least_one_cf_need_flush) { FlushOptions flush_opts; flush_opts.allow_write_stall = true; + flush_opts.check_if_compaction_disabled = true; if (immutable_db_options_.atomic_flush) { mutex_.Unlock(); status = AtomicFlushMemTables( diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index ed771324827..c75c8c33a77 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -82,6 +82,7 @@ class WriteCallback; struct JobContext; struct ExternalSstFileInfo; struct MemTableInfo; +class WriteBlocker; // Class to maintain directories for all database paths other than main one. 
class Directories { @@ -229,12 +230,13 @@ class DBImpl : public DB { const Slice& end_key, const Slice& ts) override; using DB::Write; - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override; + virtual Status Write(const WriteOptions& options, WriteBatch* updates, + PostWriteCallback* callback) override; using DB::MultiBatchWrite; virtual Status MultiBatchWrite(const WriteOptions& options, - std::vector&& updates) override; + std::vector&& updates, + PostWriteCallback* callback) override; using DB::Get; virtual Status Get(const ReadOptions& options, @@ -393,6 +395,12 @@ class DBImpl : public DB { const Range& range, uint64_t* const count, uint64_t* const size) override; + + using DB::GetApproximateActiveMemTableStats; + virtual void GetApproximateActiveMemTableStats( + ColumnFamilyHandle* column_family, uint64_t* const memory_bytes, + uint64_t* const oldest_key_time) override; + using DB::CompactRange; virtual Status CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, @@ -493,8 +501,7 @@ class DBImpl : public DB { virtual Status GetSortedWalFiles(VectorLogPtr& files) override; virtual Status GetCurrentWalFile( std::unique_ptr* current_log_file) override; - virtual Status GetCreationTimeOfOldestFile( - uint64_t* creation_time) override; + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override; virtual Status GetUpdatesSince( SequenceNumber seq_number, std::unique_ptr* iter, @@ -616,7 +623,6 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override; - // ---- End of implementations of the DB interface ---- SystemClock* GetSystemClock() const; @@ -1061,6 +1067,15 @@ class DBImpl : public DB { std::vector* handles, DB** dbptr, const bool seq_per_batch, const bool batch_per_txn); + // Validate `rhs` can be merged into this DB with given merge options. 
+ Status ValidateForMerge(const MergeInstanceOptions& merge_options, + DBImpl* rhs); + + Status CheckInRange(const Slice* begin, const Slice* end) override; + + Status MergeDisjointInstances(const MergeInstanceOptions& merge_options, + const std::vector& instances) override; + static IOStatus CreateAndNewDirectory( FileSystem* fs, const std::string& dirname, std::unique_ptr* directory); @@ -1196,6 +1211,7 @@ class DBImpl : public DB { SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const; const autovector& TEST_GetFilesToQuarantine() const; size_t TEST_EstimateInMemoryStatsHistorySize() const; + void TEST_ClearBackgroundJobs(); uint64_t TEST_GetCurrentLogNumber() const { InstrumentedMutexLock l(mutex()); @@ -1425,7 +1441,9 @@ class DBImpl : public DB { void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, FlushReason flush_reason); + int job_id, FlushReason flush_reason, + SequenceNumber earliest_seqno, + SequenceNumber largest_seqno); void NotifyOnFlushCompleted( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, @@ -1477,26 +1495,30 @@ class DBImpl : public DB { bool disable_memtable = false, uint64_t* seq_used = nullptr, size_t batch_cnt = 0, PreReleaseCallback* pre_release_callback = nullptr, - PostMemTableCallback* post_memtable_callback = nullptr); + PostMemTableCallback* post_memtable_callback = nullptr, + PostWriteCallback* post_callback = nullptr); Status MultiBatchWriteImpl(const WriteOptions& write_options, std::vector&& my_batch, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, - uint64_t* seq_used = nullptr); + uint64_t* seq_used = nullptr, + PostWriteCallback* post_callback = nullptr); void MultiBatchWriteCommit(CommitRequest* request); Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, bool disable_memtable = false, - uint64_t* seq_used = nullptr); + uint64_t* seq_used = nullptr, + PostWriteCallback* post_callback = nullptr); // Write only to memtables without joining any write queue Status UnorderedWriteMemtable(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t log_ref, SequenceNumber seq, - const size_t sub_batch_cnt); + const size_t sub_batch_cnt, + PostWriteCallback* post_callback = nullptr); // Whether the batch requires to be assigned with an order enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder }; @@ -1612,6 +1634,7 @@ class DBImpl : public DB { friend class WriteBatchWithIndex; friend class WriteUnpreparedTxnDB; friend class WriteUnpreparedTxn; + friend class WriteBlocker; friend class ForwardIterator; friend struct SuperVersion; @@ -1797,8 +1820,8 @@ class DBImpl : public DB { const InternalKey* begin = nullptr; // nullptr means beginning of key range const InternalKey* end = nullptr; // nullptr means end of key range InternalKey* manual_end = nullptr; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress // When the user provides a canceled pointer in CompactRangeOptions, the // above varaibe is the reference of the user-provided @@ -2056,9 +2079,6 @@ class DBImpl : public DB { // REQUIRES: mutex locked and in write 
thread. Status SwitchWAL(WriteContext* write_context); - // REQUIRES: mutex locked and in write thread. - Status HandleWriteBufferManagerFlush(WriteContext* write_context); - // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, LogContext* log_context, WriteContext* write_context); @@ -2578,6 +2598,10 @@ class DBImpl : public DB { Directories directories_; WriteBufferManager* write_buffer_manager_; + // For simplicity, CF based write buffer manager does not support stall the + // write. + // Note: It's only modifed in Open, so mutex is not needed. + autovector cf_based_write_buffer_manager_; WriteThread write_thread_; WriteBatch tmp_batch_; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 71c23de95a5..c2bd7af0476 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -306,8 +306,10 @@ Status DBImpl::FlushMemTableToOutputFile( job_context->job_id, s.ToString().c_str()); } + SequenceNumber earliest_seqno = 0; + SequenceNumber largest_seqno = 0; if (s.ok()) { - flush_job.PickMemTable(); + flush_job.PickMemTable(&earliest_seqno, &largest_seqno); need_cancel = true; } TEST_SYNC_POINT_CALLBACK( @@ -315,7 +317,7 @@ Status DBImpl::FlushMemTableToOutputFile( // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id, - flush_reason); + flush_reason, earliest_seqno, largest_seqno); bool switched_to_mempurge = false; // Within flush_job.Run, rocksdb may call event listener to notify @@ -538,14 +540,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); - for (int i = 0; i != num_cfs; ++i) { - const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); - // may temporarily unlock and lock the mutex. - FlushReason flush_reason = bg_flush_args[i].flush_reason_; - NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, - job_context->job_id, flush_reason); - } - if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. @@ -598,13 +592,24 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( job_context->job_id, s.ToString().c_str()); } + std::vector earliest_seqnos(num_cfs, 0); + std::vector largest_seqnos(num_cfs, 0); if (s.ok()) { for (int i = 0; i != num_cfs; ++i) { - jobs[i]->PickMemTable(); + jobs[i]->PickMemTable(&earliest_seqnos[i], &largest_seqnos[i]); pick_status[i] = true; } } + for (int i = 0; i != num_cfs; ++i) { + const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); + // may temporarily unlock and lock the mutex. 
+ FlushReason flush_reason = bg_flush_args[i].flush_reason_; + NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, + job_context->job_id, flush_reason, earliest_seqnos[i], + largest_seqnos[i]); + } + if (s.ok()) { assert(switched_to_mempurge.size() == static_cast(num_cfs)); @@ -914,7 +919,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id, FlushReason flush_reason) { + int job_id, FlushReason flush_reason, + SequenceNumber smallest_seqno, + SequenceNumber largest_seqno) { if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -944,8 +951,10 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->fd.smallest_seqno; - info.largest_seqno = file_meta->fd.largest_seqno; + // This sequence number is actually smaller than or equal to the sequence + // number of any key that be inserted into the flushed memtable. + info.smallest_seqno = smallest_seqno; + info.largest_seqno = largest_seqno; info.flush_reason = flush_reason; for (auto listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); @@ -1126,6 +1135,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, if (s.ok() && flush_needed) { FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; + fo.check_if_compaction_disabled = true; if (immutable_db_options_.atomic_flush) { s = AtomicFlushMemTables(fo, FlushReason::kManualCompaction); } else { @@ -1951,6 +1961,7 @@ Status DBImpl::Flush(const FlushOptions& flush_options, ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", cfh->GetName().c_str()); Status s; + TEST_SYNC_POINT_CALLBACK("DBImpl::Flush:ScheduleFlushReq", column_family); if (immutable_db_options_.atomic_flush) { s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, {cfh->cfd()}); @@ -2264,12 +2275,28 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } } - const bool needs_to_join_write_thread = !entered_write_thread; + const bool needs_to_join_write_thread = + !entered_write_thread && !flush_options._write_stopped; + autovector flush_reqs; autovector memtable_ids_to_wait; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); + // Need to check inside lock to avoid [flush()] -> [disable] -> [schedule]. + if (flush_options.check_if_compaction_disabled && + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + if (flush_options.expected_oldest_key_time != 0 && + cfd->mem()->ApproximateOldestKeyTime() != + flush_options.expected_oldest_key_time) { + std::ostringstream oss; + oss << "Oldest key time doesn't match. expected=" + << flush_options.expected_oldest_key_time + << ", actual=" << cfd->mem()->ApproximateOldestKeyTime(); + return Status::Incomplete(oss.str()); + } WriteThread::Writer w; WriteThread::Writer nonmem_w; @@ -2444,6 +2471,11 @@ Status DBImpl::AtomicFlushMemTables( { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); + // Need to check inside lock to avoid [flush()] -> [disable] -> [schedule]. 
+ if (flush_options.check_if_compaction_disabled && + manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } WriteThread::Writer w; WriteThread::Writer nonmem_w; diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 17050e4651f..ac566829fbc 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -318,5 +318,19 @@ size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { InstrumentedMutexLock l(&const_cast(this)->stats_history_mutex_); return EstimateInMemoryStatsHistorySize(); } + +void DBImpl::TEST_ClearBackgroundJobs() { + // Matching `CloseHelper()`. + while (!flush_queue_.empty()) { + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { + iter.first->UnrefAndTryDelete(); + } + } + while (!compaction_queue_.empty()) { + auto cfd = PopFirstFromCompactionQueue(); + cfd->UnrefAndTryDelete(); + } +} } // namespace ROCKSDB_NAMESPACE #endif // NDEBUG diff --git a/db/db_impl/db_impl_merge.cc b/db/db_impl/db_impl_merge.cc new file mode 100644 index 00000000000..e6e01136c99 --- /dev/null +++ b/db/db_impl/db_impl_merge.cc @@ -0,0 +1,396 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_impl/db_impl.h" + +namespace ROCKSDB_NAMESPACE { + +/// A RAII-style helper used to block DB writes. 
+class WriteBlocker { + public: + WriteBlocker(DBImpl* db) : db_(db), writer_(new WriteThread::Writer()) { + db_->mutex_.Lock(); + db_->write_thread_.EnterUnbatched(writer_.get(), &db_->mutex_); + db_->WaitForPendingWrites(); + } + + ~WriteBlocker() { + db_->write_thread_.ExitUnbatched(writer_.get()); + db_->mutex_.Unlock(); + } + + private: + DBImpl* db_; + std::unique_ptr writer_; +}; + +Status DBImpl::ValidateForMerge(const MergeInstanceOptions& mopts, + DBImpl* rhs) { + if (rhs->two_write_queues_) { + return Status::NotSupported("two_write_queues == true"); + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto rhs_cfd = + rhs->versions_->GetColumnFamilySet()->GetColumnFamily(cfd->GetName()); + if (rhs_cfd != nullptr) { + if (strcmp(cfd->ioptions()->table_factory->Name(), + rhs_cfd->ioptions()->table_factory->Name()) != 0) { + return Status::InvalidArgument( + "table_factory must be of the same type"); + } + } + } + if (mopts.merge_memtable) { + if (rhs->total_log_size_ > 0) { + return Status::InvalidArgument("DB WAL is not empty"); + } + } + if (rhs->table_cache_ == table_cache_) { + return Status::InvalidArgument("table_cache must not be shared"); + } + return Status::OK(); +} + +Status DBImpl::CheckInRange(const Slice* begin, const Slice* end) { + Status s; + if (begin == nullptr && end == nullptr) { + return s; + } + for (auto cfd : *versions_->GetColumnFamilySet()) { + assert(cfd != nullptr); + auto* comparator = cfd->user_comparator(); + PinnableSlice smallest, largest; + bool found = false; + s = cfd->GetUserKeyRange(&smallest, &largest, &found); + if (!s.ok()) { + return s; + } + if (!found) { + continue; + } + if (begin != nullptr && comparator->Compare(smallest, *begin) < 0) { + return Status::InvalidArgument("Has data smaller than left boundary"); + } else if (end != nullptr && comparator->Compare(largest, *end) >= 0) { + return Status::InvalidArgument("Has data larger than right boundary"); + } + } + return s; +} + +Status DBImpl::MergeDisjointInstances(const MergeInstanceOptions& merge_options, + const std::vector& instances) { + Status s; + autovector this_cfds; + for (auto cfd : *versions_->GetColumnFamilySet()) { + assert(cfd != nullptr); + if (!cfd->IsDropped()) { + this_cfds.push_back(cfd); + } + } + const size_t num_cfs = this_cfds.size(); + + // # Sanity checks + // Check target instance (`this`). + if (two_write_queues_) { + return Status::NotSupported("target instance two_write_queues == true"); + } + autovector db_impls; + autovector all_db_impls{this}; + // A list of source db super versions grouped by cf. nullptr if the cf is + // missing. + autovector> cf_db_super_versions; + std::shared_ptr _defer(nullptr, [&](...) { + for (auto& db_super_versions : cf_db_super_versions) { + for (auto* super_version : db_super_versions) { + if (super_version != nullptr && super_version->Unref()) { + super_version->Cleanup(); + } + } + } + }); + // Check source instances. + for (size_t i = 0; i < instances.size(); i++) { + auto* db_impl = static_cast(instances[i]); + s = ValidateForMerge(merge_options, db_impl); + if (s.ok()) { + db_impls.push_back(db_impl); + all_db_impls.push_back(db_impl); + } else { + return s; + } + } + + // Block all writes. 
+ autovector> write_blockers; + for (auto* db : all_db_impls) { + write_blockers.emplace_back(new WriteBlocker(db)); + } + + // # Internal key range check + assert(s.ok()); + for (auto* this_cfd : this_cfds) { + auto& name = this_cfd->GetName(); + auto* comparator = this_cfd->user_comparator(); + using CfRange = std::pair; + std::vector db_ranges; + auto process_cf = [&](ColumnFamilyData* cfd) { + assert(cfd && s.ok()); + PinnableSlice smallest, largest; + bool found = false; + s = cfd->GetUserKeyRange(&smallest, &largest, &found); + if (s.ok() && found) { + db_ranges.emplace_back( + std::make_pair(std::move(smallest), std::move(largest))); + } + }; + process_cf(this_cfd); + if (!s.ok()) { + return s; + } + for (auto* db : db_impls) { + auto cfd = db->versions_->GetColumnFamilySet()->GetColumnFamily(name); + if (cfd && !cfd->IsDropped()) { + process_cf(cfd); + if (!s.ok()) { + return s; + } + } + } + std::sort(db_ranges.begin(), db_ranges.end(), + [=](const CfRange& a, const CfRange& b) { + return comparator->Compare(a.first, b.first) < 0; + }); + Slice last_largest; + for (auto& range : db_ranges) { + if (last_largest.size() == 0 || + comparator->Compare(last_largest, range.first) < 0) { + last_largest = range.second; + } else { + return Status::InvalidArgument("Source DBs have overlapping range"); + } + } + } + + // # Handle transient states + // + // - Acquire snapshots of table files (`SuperVersion`). + // + // - Do memtable merge if needed. We do this together with acquiring + // snapshot + // to avoid the case where a memtable is flushed shortly after being + // merged, and the resulting L0 data is merged again as a table file. + assert(s.ok()); + autovector to_delete; // not used. + // Key-value freshness is determined by its sequence number. To avoid + // incoming writes being shadowed by history data from other instances, we + // must increment target instance's sequence number to be larger than all + // source data. See [A]. + uint64_t max_seq_number = 0; + // RocksDB's recovery is heavily dependent on the one-on-one mapping between + // memtable and WAL (even when WAL is empty). Each memtable keeps a record + // of `next_log_number` to mark its position within a series of WALs. This + // counter must be monotonic. We work around this issue by setting the + // counters of all involved memtables to the same maximum value. See [B]. + uint64_t max_log_number = 0; + for (auto* db : all_db_impls) { + max_seq_number = std::max(max_seq_number, db->versions_->LastSequence()); + max_log_number = std::max(max_log_number, db->logfile_number_); + } + // [A] Bump sequence number. + versions_->SetLastAllocatedSequence(max_seq_number); + versions_->SetLastSequence(max_seq_number); + cf_db_super_versions.resize(num_cfs); + for (size_t cf_i = 0; cf_i < num_cfs; cf_i++) { + cf_db_super_versions[cf_i].resize(db_impls.size()); + auto* this_cfd = this_cfds[cf_i]; + auto& cf_name = this_cfd->GetName(); + autovector mems; + for (size_t db_i = 0; db_i < db_impls.size(); db_i++) { + auto& db = db_impls[db_i]; + auto cfd = db->versions_->GetColumnFamilySet()->GetColumnFamily(cf_name); + if (cfd == nullptr || cfd->IsDropped()) { + cf_db_super_versions[cf_i][db_i] = nullptr; + continue; + } + + if (merge_options.merge_memtable) { + if (!cfd->mem()->IsEmpty()) { + WriteContext write_context; + assert(log_empty_); + s = SwitchMemtable(cfd, &write_context); + if (!s.ok()) { + return s; + } + } + assert(cfd->mem()->IsEmpty()); + + // [B] Bump log number for active memtable. 
Even though it's not + // shared, it must still be larger than other shared immutable + // memtables. + cfd->mem()->SetNextLogNumber(max_log_number); + cfd->imm()->ExportMemtables(&mems); + } + + // Acquire super version. + cf_db_super_versions[cf_i][db_i] = cfd->GetSuperVersion()->Ref(); + } + for (auto mem : mems) { + assert(mem != nullptr); + mem->Ref(); + // [B] Bump log number for shared memtables. + mem->SetNextLogNumber(max_log_number); + this_cfd->imm()->Add(mem, &to_delete); + } + this_cfd->mem()->SetNextLogNumber(max_log_number); + } + for (size_t i = 0; i < all_db_impls.size(); i++) { + auto* db = all_db_impls[i]; + bool check_log_number = (i == 0 || merge_options.allow_source_write) && + merge_options.merge_memtable; + if (check_log_number && max_log_number != db->logfile_number_) { + assert(max_log_number > db->logfile_number_); + // [B] Create a new WAL so that future memtable will use the correct log + // number as well. + log::Writer* new_log = nullptr; + s = db->CreateWAL(max_log_number, 0 /*recycle_log_number*/, + 0 /*preallocate_block_size*/, &new_log); + if (!s.ok()) { + return s; + } + db->logfile_number_ = max_log_number; + assert(new_log != nullptr); + db->logs_.emplace_back(max_log_number, new_log); + auto current = db->versions_->current_next_file_number(); + if (current <= max_log_number) { + db->versions_->FetchAddFileNumber(max_log_number - current + 1); + } + } + } + + // Unblock writes. + write_blockers.clear(); + + TEST_SYNC_POINT("DBImpl::MergeDisjointInstances:AfterMergeMemtable:1"); + + // # Merge table files + assert(s.ok()); + autovector cf_edits; + cf_edits.resize(num_cfs); + for (size_t cf_i = 0; cf_i < num_cfs; cf_i++) { + auto* this_cfd = this_cfds[cf_i]; + auto& edit = cf_edits[cf_i]; + edit.SetColumnFamily(this_cfd->GetID()); + for (size_t db_i = 0; db_i < db_impls.size(); db_i++) { + auto* super_version = cf_db_super_versions[cf_i][db_i]; + if (super_version == nullptr) { + continue; + } + VersionStorageInfo& vsi = *super_version->current->storage_info(); + auto& cf_paths = super_version->cfd->ioptions()->cf_paths; + auto SourcePath = [&](size_t path_id) { + // Matching `TableFileName()`. 
+ if (path_id >= cf_paths.size()) { + assert(false); + return cf_paths.back().path; + } else { + return cf_paths[path_id].path; + } + }; + const auto& target_path = this_cfd->ioptions()->cf_paths.front().path; + const uint64_t target_path_id = 0; + for (int level = 0; level < vsi.num_levels(); ++level) { + for (const auto& f : vsi.LevelFiles(level)) { + assert(f != nullptr); + const uint64_t source_file_number = f->fd.GetNumber(); + const uint64_t target_file_number = versions_->FetchAddFileNumber(1); + std::string src = MakeTableFileName(SourcePath(f->fd.GetPathId()), + source_file_number); + std::string target = + MakeTableFileName(target_path, target_file_number); + s = GetEnv()->LinkFile(src, target); + if (!s.ok()) { + return s; + } + edit.AddFile(level, target_file_number, target_path_id, + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, f->temperature, + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size, f->tail_size, + f->user_defined_timestamps_persisted); + } + } + vsi.RecoverEpochNumbers(this_cfd); + } + } + + // # Apply version edits + assert(s.ok()); + { + autovector> edit_ptrs; + autovector cf_mopts; + for (size_t i = 0; i < num_cfs; i++) { + edit_ptrs.push_back({&cf_edits[i]}); + cf_mopts.push_back(this_cfds[i]->GetLatestMutableCFOptions()); + } + + auto old_capacity = table_cache_->GetCapacity(); + if (merge_options.max_preload_files >= 0) { + // Refer to `LoadTableHandlers` for calculation details. + // This trick will be wrong if table_cache is shared. + table_cache_->SetCapacity( + (table_cache_->GetUsage() + merge_options.max_preload_files) * 4); + } + + InstrumentedMutexLock lock(&mutex_); + s = versions_->LogAndApply(this_cfds, cf_mopts, ReadOptions(), edit_ptrs, + &mutex_, directories_.GetDbDir(), false); + if (!s.ok()) { + return s; + } + for (size_t i = 0; i < num_cfs; i++) { + SuperVersionContext sv_context(/* create_superversion */ true); + InstallSuperVersionAndScheduleWork(this_cfds[i], &sv_context, + *cf_mopts[i]); + sv_context.Clean(); + } + + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(this_cfds); + } + for (auto cfd : this_cfds) { + cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req); + SchedulePendingFlush(flush_req); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(this_cfds, FlushReason::kWriteBufferFull, + &flush_req); + SchedulePendingFlush(flush_req); + } + for (auto cfd : this_cfds) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + + if (merge_options.max_preload_files >= 0) { + table_cache_->SetCapacity(old_capacity); + } + } + + assert(s.ok()); + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 086e014e581..d41cd5b3a70 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1969,6 +1969,22 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + for (auto cf : column_families) { + if (cf.options.cf_write_buffer_manager != nullptr) { + auto* write_buffer_manager = cf.options.cf_write_buffer_manager.get(); + bool already_exist 
= false; + for (auto m : impl->cf_based_write_buffer_manager_) { + if (m == write_buffer_manager) { + already_exist = true; + break; + } + } + if (!already_exist) { + impl->cf_based_write_buffer_manager_.push_back(write_buffer_manager); + } + } + } + if (!impl->immutable_db_options_.info_log) { s = impl->init_logger_creation_s_; delete impl; @@ -2265,6 +2281,33 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { s = impl->StartPeriodicTaskScheduler(); + if (impl->write_buffer_manager_) { + impl->write_buffer_manager_->UnregisterDB(impl); + } + for (auto m : impl->cf_based_write_buffer_manager_) { + m->UnregisterDB(impl); + } + + for (size_t i = 0; i < (*handles).size(); ++i) { + auto cf_opt = column_families[i].options; + + auto* cf = (*handles)[i]; + std::string cf_name = cf->GetName(); + auto* write_buffer_manager = cf_opt.cf_write_buffer_manager != nullptr + ? cf_opt.cf_write_buffer_manager.get() + : impl->write_buffer_manager_; + if (write_buffer_manager) { + if (cf->GetName() == kDefaultColumnFamilyName) { + write_buffer_manager->RegisterColumnFamily(impl, + impl->default_cf_handle_); + } else if (cf->GetName() == kPersistentStatsColumnFamilyName) { + write_buffer_manager->RegisterColumnFamily( + impl, impl->persist_stats_cf_handle_); + } else { + write_buffer_manager->RegisterColumnFamily(impl, cf); + } + } + } } if (s.ok()) { s = impl->RegisterRecordSeqnoTimeWorker(recovery_ctx.is_new_db_); diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 32bc8560706..93103d120a7 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -76,8 +76,9 @@ class DBImplReadOnly : public DBImpl { const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + using DBImpl::Write; + virtual Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/, + PostWriteCallback* /*callback*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 12a8bbdd707..00bea3a28c1 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -162,8 +162,9 @@ class DBImplSecondary : public DBImpl { return Status::NotSupported("Not supported operation in secondary mode."); } - Status Write(const WriteOptions& /*options*/, - WriteBatch* /*updates*/) override { + using DBImpl::Write; + Status Write(const WriteOptions& /*options*/, WriteBatch* /*updates*/, + PostWriteCallback* /*callback*/) override { return Status::NotSupported("Not supported operation in secondary mode."); } diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 536c514a2ec..c74ec8dba93 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -147,7 +147,8 @@ void DBImpl::SetRecoverableStatePreReleaseCallback( recoverable_state_pre_release_callback_.reset(callback); } -Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch, + PostWriteCallback* callback) { Status s; if (write_options.protection_bytes_per_key > 0) { s = WriteBatchInternal::UpdateProtectionInfo( @@ -155,7 +156,10 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { } if (s.ok()) { s = WriteImpl(write_options, 
my_batch, /*callback=*/nullptr, - /*log_used=*/nullptr); + /*log_used=*/nullptr, /*log_ref=*/0, + /*disable_memtable=*/false, /*seq=*/nullptr, /*batch_cnt=*/0, + /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, callback); } return s; } @@ -188,9 +192,12 @@ void DBImpl::MultiBatchWriteCommit(CommitRequest* request) { } Status DBImpl::MultiBatchWrite(const WriteOptions& options, - std::vector&& updates) { + std::vector&& updates, + PostWriteCallback* callback) { if (immutable_db_options_.enable_multi_batch_write) { - return MultiBatchWriteImpl(options, std::move(updates), nullptr, nullptr); + return MultiBatchWriteImpl(options, std::move(updates), + /*callback=*/nullptr, /*log_used=*/nullptr, + /*log_ref=*/0, /*seq=*/nullptr, callback); } else { return Status::NotSupported(); } @@ -239,12 +246,15 @@ Status DBImpl::MultiBatchWrite(const WriteOptions& options, Status DBImpl::MultiBatchWriteImpl(const WriteOptions& write_options, std::vector&& my_batch, WriteCallback* callback, uint64_t* log_used, - uint64_t log_ref, uint64_t* seq_used) { + uint64_t log_ref, uint64_t* seq_used, + PostWriteCallback* post_callback) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.statistics.get(), DB_WRITE); WriteThread::Writer writer(write_options, std::move(my_batch), callback, - log_ref, false /*disable_memtable*/); + log_ref, false /*disable_memtable*/, + /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, post_callback); CommitRequest request(&writer); writer.request = &request; write_thread_.JoinBatchGroup(&writer); @@ -294,6 +304,8 @@ Status DBImpl::MultiBatchWriteImpl(const WriteOptions& write_options, next_sequence += count; total_count += count; memtable_write_cnt++; + } else if (w->post_callback) { + w->post_callback->Callback(w->sequence); } } total_byte_size = WriteBatchInternal::AppendedByteSize( @@ -406,7 +418,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, bool disable_memtable, uint64_t* seq_used, size_t batch_cnt, PreReleaseCallback* pre_release_callback, - PostMemTableCallback* post_memtable_callback) { + PostMemTableCallback* post_memtable_callback, + PostWriteCallback* post_callback) { assert(!seq_per_batch_ || batch_cnt != 0); assert(my_batch == nullptr || my_batch->Count() == 0 || write_options.protection_bytes_per_key == 0 || @@ -467,6 +480,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, return Status::NotSupported( "pipelined_writes is not compatible with concurrent prepares"); } + if (two_write_queues_ && post_callback) { + return Status::NotSupported( + "post write callback is not compatible with concurrent prepares"); + } + if (disable_memtable && post_callback) { + return Status::NotSupported( + "post write callback is not compatible with disabling memtable"); + } if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt return Status::NotSupported( @@ -529,8 +550,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (!disable_memtable) { TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"); - status = UnorderedWriteMemtable(write_options, my_batch, callback, - log_ref, seq, sub_batch_cnt); + status = + UnorderedWriteMemtable(write_options, my_batch, callback, log_ref, + seq, sub_batch_cnt, post_callback); } return status; } @@ -539,18 +561,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, std::vector 
updates(1); updates[0] = my_batch; return MultiBatchWriteImpl(write_options, std::move(updates), callback, - log_used, log_ref, seq_used); + log_used, log_ref, seq_used, post_callback); } if (immutable_db_options_.enable_pipelined_write) { return PipelinedWriteImpl(write_options, my_batch, callback, log_used, - log_ref, disable_memtable, seq_used); + log_ref, disable_memtable, seq_used, + post_callback); } PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, batch_cnt, pre_release_callback, - post_memtable_callback); + post_memtable_callback, post_callback); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); write_thread_.JoinBatchGroup(&w); @@ -908,7 +931,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, uint64_t log_ref, - bool disable_memtable, uint64_t* seq_used) { + bool disable_memtable, uint64_t* seq_used, + PostWriteCallback* post_callback) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); @@ -916,7 +940,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, /*_batch_cnt=*/0, - /*_pre_release_callback=*/nullptr); + /*_pre_release_callback=*/nullptr, + /*_post_memtable_callback=*/nullptr, post_callback); write_thread_.JoinBatchGroup(&w); TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"); if (w.state == WriteThread::STATE_GROUP_LEADER) { @@ -1086,12 +1111,15 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t log_ref, SequenceNumber seq, - const size_t sub_batch_cnt) { + const size_t sub_batch_cnt, + PostWriteCallback* post_callback) { PERF_TIMER_GUARD(write_pre_and_post_process_time); StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - false /*disable_memtable*/); + false /*disable_memtable*/, 0, + /*pre_release_callback=*/nullptr, + /*post_memtable_callback=*/nullptr, post_callback); if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) { w.sequence = seq; @@ -1425,15 +1453,14 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, } } + // Ordering: before write delay. if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) { - // Before a new memtable is added in SwitchMemtable(), - // write_buffer_manager_->ShouldFlush() will keep returning true. If another - // thread is writing to another DB with the same write buffer, they may also - // be flushed. We may end up with flushing much more DBs than needed. It's - // suboptimal but still correct. 
- InstrumentedMutexLock l(&mutex_); - WaitForPendingWrites(); - status = HandleWriteBufferManagerFlush(write_context); + write_buffer_manager_->MaybeFlush(this); + } + for (auto write_buffer_manager : cf_based_write_buffer_manager_) { + if (UNLIKELY(status.ok() && write_buffer_manager->ShouldFlush())) { + write_buffer_manager->MaybeFlush(this); + } } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { @@ -1966,98 +1993,6 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { return status; } -Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { - mutex_.AssertHeld(); - assert(write_context != nullptr); - Status status; - - // Before a new memtable is added in SwitchMemtable(), - // write_buffer_manager_->ShouldFlush() will keep returning true. If another - // thread is writing to another DB with the same write buffer, they may also - // be flushed. We may end up with flushing much more DBs than needed. It's - // suboptimal but still correct. - // no need to refcount because drop is happening in write thread, so can't - // happen while we're in the write thread - autovector cfds; - if (immutable_db_options_.atomic_flush) { - SelectColumnFamiliesForAtomicFlush(&cfds); - } else { - ColumnFamilyData* cfd_picked = nullptr; - SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; - - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) { - // We only consider flush on CFs with bytes in the mutable memtable, - // and no immutable memtables for which flush has yet to finish. If - // we triggered flush on CFs already trying to flush, we would risk - // creating too many immutable memtables leading to write stalls. - uint64_t seq = cfd->mem()->GetCreationSeq(); - if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { - cfd_picked = cfd; - seq_num_for_cf_picked = seq; - } - } - } - if (cfd_picked != nullptr) { - cfds.push_back(cfd_picked); - } - MaybeFlushStatsCF(&cfds); - } - if (!cfds.empty()) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "Flushing triggered to alleviate write buffer memory usage. 
Write " - "buffer is using %" ROCKSDB_PRIszt - " bytes out of a total of %" ROCKSDB_PRIszt ".", - write_buffer_manager_->memory_usage(), - write_buffer_manager_->buffer_size()); - } - - WriteThread::Writer nonmem_w; - if (two_write_queues_) { - nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); - } - for (const auto cfd : cfds) { - if (cfd->mem()->IsEmpty()) { - continue; - } - cfd->Ref(); - status = SwitchMemtable(cfd, write_context); - cfd->UnrefAndTryDelete(); - if (!status.ok()) { - break; - } - } - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } - - if (status.ok()) { - if (immutable_db_options_.atomic_flush) { - AssignAtomicFlushSeq(cfds); - } - for (const auto cfd : cfds) { - cfd->imm()->FlushRequested(); - if (!immutable_db_options_.atomic_flush) { - FlushRequest flush_req; - GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager, - &flush_req); - SchedulePendingFlush(flush_req); - } - } - if (immutable_db_options_.atomic_flush) { - FlushRequest flush_req; - GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req); - SchedulePendingFlush(flush_req); - } - MaybeScheduleFlushOrCompaction(); - } - return status; -} - uint64_t DBImpl::GetMaxTotalWalSize() const { uint64_t max_total_wal_size = max_total_wal_size_.load(std::memory_order_acquire); @@ -2400,6 +2335,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { memtable_info.cf_name = cfd->GetName(); memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); + memtable_info.largest_seqno = cfd->mem()->GetLargestSequenceNumber(); memtable_info.num_entries = cfd->mem()->num_entries(); memtable_info.num_deletes = cfd->mem()->num_deletes(); // Log this later after lock release. It may be outdated, e.g., if background @@ -2583,10 +2519,15 @@ size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { if (immutable_db_options_.db_write_buffer_size > 0) { bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size); } - if (immutable_db_options_.write_buffer_manager && - immutable_db_options_.write_buffer_manager->enabled()) { - bsize = std::min( - bsize, immutable_db_options_.write_buffer_manager->buffer_size()); + if (immutable_db_options_.write_buffer_manager) { + size_t buffer_size = + immutable_db_options_.write_buffer_manager->flush_size(); + for (auto manager : cf_based_write_buffer_manager_) { + buffer_size += manager->flush_size(); + } + if (buffer_size > 0) { + bsize = std::min(bsize, buffer_size); + } } return bsize; diff --git a/db/db_merge_test.cc b/db/db_merge_test.cc new file mode 100644 index 00000000000..e55239bf139 --- /dev/null +++ b/db/db_merge_test.cc @@ -0,0 +1,647 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +const uint32_t default_cf = 0; +uint32_t operator"" _db(unsigned long long int i) { return uint32_t(i); } +uint32_t operator"" _cf(unsigned long long int i) { + assert(i > 0); + return uint32_t(i); +} + +class DBMergeTest : public testing::Test { + struct DBHandles { + std::string path; + DBImpl* db; + std::unordered_map cfs; + }; + + public: + DBMergeTest() { + options_.create_if_missing = true; + options_.write_buffer_manager.reset( + new WriteBufferManager(options_.db_write_buffer_size)); + // avoid stalling the tests. + options_.disable_write_stall = true; + options_.avoid_flush_during_shutdown = true; + // avoid background flush/compaction. + options_.level0_file_num_compaction_trigger = 10; + options_.level0_slowdown_writes_trigger = 10; + options_.level0_stop_writes_trigger = 10; + options_.max_write_buffer_number = 10; + } + + ~DBMergeTest() { DestroyAll(); } + + void IsOverlapError(Status s) { + ASSERT_EQ(s.ToString(), + "Invalid argument: Source DBs have overlapping range"); + } + + void IsWALNotEmpty(Status s) { + ASSERT_EQ(s.ToString(), "Invalid argument: DB WAL is not empty"); + } + + // 0 for default cf. + std::vector GenColumnFamilyDescriptors( + const std::vector& cf_ids) { + std::vector column_families; + for (auto cf_id : cf_ids) { + if (cf_id == 0) { + column_families.push_back( + ColumnFamilyDescriptor(ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, + ColumnFamilyOptions(options_))); + } else { + column_families.push_back(ColumnFamilyDescriptor( + std::to_string(cf_id), ColumnFamilyOptions(options_))); + } + } + return column_families; + } + + std::string GenDBPath(uint32_t db_id) { + return test::PerThreadDBPath(env_, std::to_string(db_id)); + } + + void AddDB(uint32_t db_id, DB* db, + std::vector cf_handles) { + assert(dbs_.count(db_id) == 0); + DBHandles db_handles; + db_handles.path = GenDBPath(db_id); + db_handles.db = static_cast(db); + for (auto* handle : cf_handles) { + uint32_t id = 0; + if (handle->GetName() != "default") { + id = uint32_t(stoul(handle->GetName())); + } + db_handles.cfs[id] = handle; + } + dbs_[db_id] = db_handles; + } + + void Open(uint32_t db_id, const std::vector& cf_ids, + bool reopen = false) { + if (dbs_.count(db_id) > 0) { + if (reopen) { + auto& db_handles = dbs_[db_id]; + auto* db = db_handles.db; + for (auto& cf : db_handles.cfs) { + ASSERT_OK(db->DestroyColumnFamilyHandle(cf.second)); + } + delete db; + dbs_.erase(db_id); + } else { + Destroy(db_id); + } + } + std::vector column_families = + GenColumnFamilyDescriptors(cf_ids); + auto path = GenDBPath(db_id); + DB* db = nullptr; + if (!reopen) { + ASSERT_OK(DB::Open(options_, path, &db)); + for (auto& cf : column_families) { + if (cf.name != "default") { + ColumnFamilyHandle* cf_handle; + ASSERT_OK(db->CreateColumnFamily(cf.options, cf.name, &cf_handle)); + ASSERT_OK(db->DestroyColumnFamilyHandle(cf_handle)); + } + } + delete db; + db = nullptr; + } + std::vector handles; + ASSERT_OK(DB::Open(options_, path, column_families, &handles, &db)); + AddDB(db_id, db, handles); + } + + void Destroy(uint32_t db_id) { + DestroyImpl(dbs_[db_id]); + dbs_.erase(db_id); + } + + void DestroyAll() { + for (auto& db_handles : dbs_) { + DestroyImpl(db_handles.second); + } + dbs_.clear(); + } + + void DestroyImpl(DBHandles& db_handles) { + auto* db = db_handles.db; + for (auto& cf : db_handles.cfs) { + ASSERT_OK(db->DestroyColumnFamilyHandle(cf.second)); + } + delete db; + 
ASSERT_OK(DestroyDB(db_handles.path, options_)); + } + + // cfs are ignored if target already exists + Status Merge(const MergeInstanceOptions& mopts, std::vector&& from, + uint32_t to, + const std::vector& cfs = std::vector()) { + std::vector source_dbs; + for (auto db_id : from) { + source_dbs.push_back(get_db(db_id)); + } + bool newly_opened = false; + if (dbs_.count(to) == 0) { + assert(cfs.size() > 0); + Open(to, cfs); + newly_opened = true; + } + auto s = get_db(to)->MergeDisjointInstances(mopts, source_dbs); + if (newly_opened && !s.ok()) { + Destroy(to); + } + return s; + } + + void VerifyKeyValue(uint32_t db_id, uint32_t cf_id, std::string key, + std::string value, + const ReadOptions& ropts = ReadOptions()) { + std::string ret; + if (value == "NotFound") { + assert(get_db(db_id) + ->Get(ropts, get_cf(db_id, cf_id), key, &ret) + .IsNotFound()); + } else { + ASSERT_OK(get_db(db_id)->Get(ropts, get_cf(db_id, cf_id), key, &ret)); + ASSERT_EQ(value, ret); + } + } + + int Property(uint32_t db_id, const std::string& name) { + std::string property; + int result; + if (get_db(db_id)->GetProperty(name, &property) && + sscanf(property.c_str(), "%d", &result) == 1) { + return result; + } else { + return -1; + } + } + + bool has_db(uint32_t db_id) { return dbs_.count(db_id) > 0; } + + DBImpl* get_db(uint32_t db_id) { + assert(dbs_.count(db_id) == 1); + return dbs_[db_id].db; + } + + ColumnFamilyHandle* get_cf(uint32_t db_id, uint32_t cf_id) { + assert(dbs_.count(db_id) == 1); + return dbs_[db_id].cfs[cf_id]; + } + + Env* env_ = Env::Default(); + Options options_; + std::unordered_map dbs_; +}; + +TEST_F(DBMergeTest, MultiMerge) { + FlushOptions fopts; + fopts.allow_write_stall = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + WriteOptions wopts; + wopts.disableWAL = true; + Random rnd(301); + + std::unordered_map kvs[3]; + for (uint32_t i = 0; i < 10; ++i) { + Open(i, {default_cf, 1_cf, 2_cf}); + auto* db = get_db(i); + uint32_t keys_per_file = 1 + (i - 5) * (i - 5); // scatter seqno. + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t f = 0; f < 20; ++f) { + std::string prefix = + std::to_string(cf) + std::to_string(i) + std::to_string(f); + for (uint32_t k = 0; k < keys_per_file; ++k) { + auto keystr = prefix + "-" + std::to_string(k); + ASSERT_OK(db->Put(wopts, get_cf(i, cf), keystr, keystr)); + kvs[cf][keystr] = keystr; + } + ASSERT_OK(db->Flush(fopts, get_cf(i, cf))); + if (f % 5 == 0) { + ASSERT_OK(db->CompactRange(CompactRangeOptions(), get_cf(i, cf), + nullptr, nullptr)); + } + } + } + } + + ASSERT_OK(Merge(mopts, + {0_db, 1_db, 2_db, 3_db, 4_db, 5_db, 6_db, 7_db, 8_db, 9_db}, + 10_db, {default_cf, 1_cf, 2_cf})); + ASSERT_OK(Merge(mopts, {0_db, 1_db, 2_db, 3_db, 4_db, 5_db, 6_db, 7_db, 8_db}, + 9_db)); + + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } + + // overwrite random to 9 and 10. 
+ for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t i = 0; i < 10; ++i) { + auto iter = kvs[cf].begin(); + std::advance(iter, rnd.Next() % kvs[cf].size()); + + ASSERT_OK( + get_db(9_db)->Put(wopts, get_cf(9_db, cf), iter->first, "new_v")); + ASSERT_OK( + get_db(10_db)->Put(wopts, get_cf(10_db, cf), iter->first, "new_v")); + iter->second = "new_v"; + } + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + ASSERT_OK(get_db(9_db)->Flush(fopts, get_cf(9_db, cf))); + ASSERT_OK(get_db(10_db)->Flush(fopts, get_cf(10_db, cf))); + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } + + // delete old instance. + for (auto db : {0_db, 1_db, 2_db, 3_db, 4_db, 5_db, 6_db, 7_db, 8_db}) { + Destroy(db); + } + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t i = 0; i < 10; ++i) { + auto iter = kvs[cf].begin(); + std::advance(iter, rnd.Next() % kvs[cf].size()); + + ASSERT_OK( + get_db(9_db)->Put(wopts, get_cf(9_db, cf), iter->first, "new_v2")); + ASSERT_OK( + get_db(10_db)->Put(wopts, get_cf(10_db, cf), iter->first, "new_v2")); + iter->second = "new_v2"; + } + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + ASSERT_OK(get_db(9_db)->Flush(fopts, get_cf(9_db, cf))); + ASSERT_OK(get_db(10_db)->Flush(fopts, get_cf(10_db, cf))); + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } + + Open(9_db, {default_cf, 1_cf, 2_cf}, true /*reopen*/); + Open(10_db, {default_cf, 1_cf, 2_cf}, true /*reopen*/); + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(9_db, cf, kv.first, kv.second); + VerifyKeyValue(10_db, cf, kv.first, kv.second); + } + } +} + +TEST_F(DBMergeTest, BinaryMerge) { + FlushOptions fopts; + fopts.allow_write_stall = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + WriteOptions wopts; + wopts.disableWAL = true; + Random rnd(301); + + std::unordered_map kvs[3]; + std::vector dbs = {0_db, 1_db, 2_db, 3_db, 4_db, + 5_db, 6_db, 7_db, 8_db, 9_db}; + while (dbs.size() >= 2) { + for (uint32_t i = 0; i < dbs.size(); ++i) { + if (!has_db(dbs[i])) { + Open(dbs[i], {default_cf, 1_cf, 2_cf}); + } + auto* db = get_db(dbs[i]); + uint32_t keys_per_file = 1 + (i - 5) * (i - 5); // scatter seqno. + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (uint32_t f = 0; f < 3; ++f) { + std::string prefix = + std::to_string(cf) + std::to_string(dbs[i]) + std::to_string(f); + for (uint32_t k = 0; k < keys_per_file; ++k) { + auto keystr = prefix + "-" + std::to_string(k); + if (rnd.Next() % 4 == 0) { + ASSERT_OK(db->SingleDelete(wopts, get_cf(dbs[i], cf), keystr)); + kvs[cf][keystr] = "NotFound"; + } else { + auto value = rnd.RandomString(16); + ASSERT_OK(db->Put(wopts, get_cf(dbs[i], cf), keystr, value)); + kvs[cf][keystr] = value; + } + } + ASSERT_OK(db->Flush(fopts, get_cf(dbs[i], cf))); + } + } + } + // merge random neighbors. 
+ uint32_t src = rnd.Next() % dbs.size(); + uint32_t dst = (src + 1) % dbs.size(); + if ((rnd.Next() % 2 == 0 && src > 0) || dst == 0) { + dst = (src - 1) % dbs.size(); + } + ASSERT_OK(Merge(mopts, {dbs[src]}, dbs[dst])); + Destroy(dbs[src]); + dbs.erase(dbs.begin() + src); + } + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(dbs[0], cf, kv.first, kv.second); + } + ASSERT_OK(get_db(dbs[0])->Flush(fopts, get_cf(dbs[0], cf))); + } + Open(dbs[0], {default_cf, 1_cf, 2_cf}, true /*reopen*/); + for (auto cf : {default_cf, 1_cf, 2_cf}) { + for (auto& kv : kvs[cf]) { + VerifyKeyValue(dbs[0], cf, kv.first, kv.second); + } + } +} + +TEST_F(DBMergeTest, KeyOverlappedInstance) { + FlushOptions fopts; + fopts.allow_write_stall = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = false; + WriteOptions wopts; + wopts.disableWAL = true; + CompactRangeOptions copts; + copts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {1_cf, default_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "0", "v0")); + + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + Destroy(3_db); + + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "3", "v3")); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + // Skip overlapped cf. + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf})); + Destroy(3_db); + + // Only flush one. + ASSERT_OK(get_db(2_db)->Flush(fopts, get_cf(2_db, 1_cf))); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + // Both flushed. + ASSERT_OK(get_db(1_db)->Flush(fopts, get_cf(1_db, 1_cf))); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + // Delete in memory. 
+ ASSERT_OK(get_db(1_db)->SingleDelete(wopts, get_cf(1_db, 1_cf), "1")); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + ASSERT_OK(get_db(1_db)->Flush(fopts, get_cf(1_db, 1_cf))); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + IsOverlapError(Merge(mopts, {1_db}, 2_db, {default_cf, 1_cf})); + + ASSERT_OK( + get_db(1_db)->CompactRange(copts, get_cf(1_db, 1_cf), nullptr, nullptr)); + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + + VerifyKeyValue(3_db, 1_cf, "0", "v0"); + VerifyKeyValue(3_db, 1_cf, "3", "v3"); + VerifyKeyValue(3_db, 1_cf, "1", "NotFound"); +} + +TEST_F(DBMergeTest, TombstoneOverlappedInstance) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = false; + CompactRangeOptions copts; + copts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {default_cf, 1_cf}); + Open(3_db, {default_cf, 1_cf}); + Open(4_db, {default_cf, 1_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "2", "v2")); + ASSERT_OK(get_db(3_db)->Put(wopts, get_cf(3_db, 1_cf), "3", "v3")); + ASSERT_OK(get_db(4_db)->Put(wopts, get_cf(4_db, 1_cf), "4", "v4")); + + ASSERT_OK(Merge(mopts, {1_db, 2_db, 3_db, 4_db}, 0_db, {default_cf, 1_cf})); + Destroy(0_db); + + // Lower bound overlap. + ASSERT_OK(get_db(2_db)->DeleteRange(wopts, get_cf(2_db, 1_cf), "0", "9")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "2", "v2")); + IsOverlapError(Merge(mopts, {1_db, 2_db}, 0_db, {default_cf, 1_cf})); + + // Upper bound overlap. + ASSERT_OK(get_db(3_db)->DeleteRange(wopts, get_cf(3_db, 1_cf), "0", "9")); + ASSERT_OK(get_db(3_db)->Put(wopts, get_cf(3_db, 1_cf), "3", "v3")); + IsOverlapError(Merge(mopts, {3_db, 4_db}, 0_db, {default_cf, 1_cf})); + + // nullptr is an empty key. + ASSERT_OK(get_db(4_db)->SingleDelete(wopts, get_cf(4_db, 1_cf), nullptr)); + IsOverlapError(Merge(mopts, {1_db, 4_db}, 0_db, {default_cf, 1_cf})); + + Slice start = "0"; + Slice end = "2"; + ASSERT_OK( + get_db(2_db)->CompactRange(copts, get_cf(2_db, 1_cf), &start, &end)); + start = "22"; + end = "99"; + ASSERT_OK( + get_db(2_db)->CompactRange(copts, get_cf(2_db, 1_cf), &start, &end)); + + start = "0"; + end = "3"; + ASSERT_OK( + get_db(3_db)->CompactRange(copts, get_cf(3_db, 1_cf), &start, &end)); + start = "33"; + end = "99"; + ASSERT_OK( + get_db(3_db)->CompactRange(copts, get_cf(3_db, 1_cf), &start, &end)); + + end = "4"; + ASSERT_OK( + get_db(4_db)->CompactRange(copts, get_cf(4_db, 1_cf), nullptr, &end)); + + mopts.merge_memtable = true; + ASSERT_OK(Merge(mopts, {1_db, 2_db, 3_db, 4_db}, 0_db, {default_cf, 1_cf})); + + VerifyKeyValue(0_db, 1_cf, "1", "v1"); + VerifyKeyValue(0_db, 1_cf, "2", "v2"); + VerifyKeyValue(0_db, 1_cf, "3", "v3"); + VerifyKeyValue(0_db, 1_cf, "4", "v4"); +} + +TEST_F(DBMergeTest, WithWAL) { + WriteOptions wopts; + wopts.disableWAL = false; + MergeInstanceOptions mopts; + FlushOptions fopts; + fopts.allow_write_stall = true; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {default_cf, 1_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 1_cf), "2", "v2")); + + // Ignore WAL and memtable. 
+ mopts.merge_memtable = false; + ASSERT_OK(Merge(mopts, {1_db}, 2_db)); + VerifyKeyValue(2_db, 1_cf, "2", "v2"); + VerifyKeyValue(2_db, 1_cf, "1", "NotFound"); + + mopts.merge_memtable = true; + IsWALNotEmpty(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); + + for (auto db : {1_db, 2_db}) { + ASSERT_OK(get_db(db)->Flush(fopts, get_cf(db, 1_cf))); + } + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf, 1_cf})); +} + +TEST_F(DBMergeTest, MemtableIsolation) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + Open(1_db, {default_cf}); + Open(2_db, {default_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v1")); + ASSERT_OK(Merge(mopts, {1_db}, 2_db, {default_cf})); + VerifyKeyValue(2_db, default_cf, "1", "v1"); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v2")); + // Increase the seqno of 2_db so that snapshot might include new writes. + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v")); + // Check merged DB is not affected by source DB writes. + VerifyKeyValue(2_db, default_cf, "1", "v1"); +} + +TEST_F(DBMergeTest, CacheReuse) { + BlockBasedTableOptions table_options; + // Otherwise the reader will not attempt to read cache first. + table_options.cache_index_and_filter_blocks = true; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + WriteOptions wopts; + wopts.disableWAL = true; + ReadOptions ropts; + ropts.fill_cache = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + Open(1_db, {default_cf}); + Open(2_db, {default_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v1")); + for (auto db : {1_db, 2_db}) { + ASSERT_OK(get_db(db)->Flush(FlushOptions(), get_cf(db, default_cf))); + } + VerifyKeyValue(1_db, default_cf, "1", "v1", ropts); + VerifyKeyValue(2_db, default_cf, "2", "v1", ropts); + ropts.read_tier = ReadTier::kBlockCacheTier; + VerifyKeyValue(1_db, default_cf, "1", "v1", ropts); + VerifyKeyValue(2_db, default_cf, "2", "v1", ropts); + + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf})); + + ropts.read_tier = ReadTier::kBlockCacheTier; + VerifyKeyValue(3_db, default_cf, "1", "v1", ropts); + VerifyKeyValue(3_db, default_cf, "2", "v1", ropts); +} + +TEST_F(DBMergeTest, ConcurrentFlush) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MergeDisjointInstances:AfterMergeMemtable:1", + [&](void* /*arg*/) { + for (auto db : {1_db, 2_db}) { + ASSERT_OK(get_db(db)->Flush(FlushOptions(), get_cf(db, default_cf))); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Open(1_db, {default_cf}); + Open(2_db, {default_cf}); + Open(3_db, {default_cf}); + ASSERT_OK(get_db(3_db)->PauseBackgroundWork()); + + // Put some to memtable. 
+ ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, default_cf), "1", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, default_cf), "2", "v1")); + ASSERT_EQ(Property(1_db, "rocksdb.num-files-at-level0"), 0); + ASSERT_EQ(Property(2_db, "rocksdb.num-files-at-level0"), 0); + + ASSERT_OK(Merge(mopts, {1_db, 2_db}, 3_db, {default_cf})); + ASSERT_EQ(Property(1_db, "rocksdb.num-files-at-level0"), 1); + ASSERT_EQ(Property(2_db, "rocksdb.num-files-at-level0"), 1); + ASSERT_EQ(Property(3_db, "rocksdb.num-files-at-level0"), 0); + + VerifyKeyValue(3_db, default_cf, "1", "v1"); + VerifyKeyValue(3_db, default_cf, "2", "v1"); + + ASSERT_OK(get_db(3_db)->ContinueBackgroundWork()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBMergeTest, MissingCF) { + WriteOptions wopts; + wopts.disableWAL = true; + MergeInstanceOptions mopts; + mopts.merge_memtable = true; + + Open(1_db, {default_cf, 1_cf}); + Open(2_db, {default_cf, 2_cf}); + Open(3_db, {default_cf, 3_cf}); + ASSERT_OK(get_db(1_db)->Put(wopts, get_cf(1_db, 1_cf), "key", "v1")); + ASSERT_OK(get_db(2_db)->Put(wopts, get_cf(2_db, 2_cf), "key", "v2")); + ASSERT_OK(get_db(3_db)->Put(wopts, get_cf(3_db, 3_cf), "key", "v3")); + + ASSERT_OK( + Merge(mopts, {1_db, 2_db, 3_db}, 4_db, {default_cf, 1_cf, 2_cf, 3_cf})); + + VerifyKeyValue(4_db, 1_cf, "key", "v1"); + VerifyKeyValue(4_db, 2_cf, "key", "v2"); + VerifyKeyValue(4_db, 3_cf, "key", "v3"); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 337eadb7328..73100d74933 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -1081,7 +1081,6 @@ TEST_F(DBPropertiesTest, EstimateCompressionRatio) { ASSERT_GT(CompressionRatioAtLevel(1), 10.0); } - class CountingUserTblPropCollector : public TablePropertiesCollector { public: const char* Name() const override { return "CountingUserTblPropCollector"; } @@ -2171,7 +2170,7 @@ TEST_F(DBPropertiesTest, GetMapPropertyWriteStallStats) { WriteStallCause::kMemtableLimit}) { if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } else if (test_cause == WriteStallCause::kMemtableLimit) { options.max_write_buffer_number = 2; options.disable_auto_compactions = true; @@ -2207,13 +2206,13 @@ TEST_F(DBPropertiesTest, GetMapPropertyWriteStallStats) { if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { ASSERT_OK(dbfull()->Put( WriteOptions(), handles_[1], Key(1), - DummyString(options.write_buffer_manager->buffer_size()))); + DummyString(options.write_buffer_manager->flush_size()))); WriteOptions wo; wo.no_slowdown = true; Status s = dbfull()->Put( wo, handles_[1], Key(2), - DummyString(options.write_buffer_manager->buffer_size())); + DummyString(options.write_buffer_manager->flush_size())); ASSERT_TRUE(s.IsIncomplete()); ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); } else if (test_cause == WriteStallCause::kMemtableLimit) { @@ -2364,7 +2363,6 @@ TEST_F(DBPropertiesTest, TableMetaIndexKeys) { } while (ChangeOptions()); } - } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_test.cc b/db/db_test.cc 
index 646e3101f50..f0bdca59528 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3219,7 +3219,9 @@ class ModelDB : public DB { delete reinterpret_cast(snapshot); } - Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override { + using DB::Write; + Status Write(const WriteOptions& /*options*/, WriteBatch* batch, + PostWriteCallback* /*callback*/) override { class Handler : public WriteBatch::Handler { public: KVMap* map_; diff --git a/db/db_test2.cc b/db/db_test2.cc index 026334509d1..4f33841c334 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -25,6 +25,7 @@ #include "rocksdb/trace_record_result.h" #include "rocksdb/utilities/replayer.h" #include "rocksdb/wal_filter.h" +#include "test_util/mock_time_env.h" #include "test_util/testutil.h" #include "util/random.h" #include "utilities/fault_injection_env.h" @@ -331,11 +332,11 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_LT(cache->GetUsage(), 256 * 1024); if (use_old_interface_) { - options.db_write_buffer_size = 120000; // this is the real limit + options.db_write_buffer_size = 100000; } else if (!cost_cache_) { - options.write_buffer_manager.reset(new WriteBufferManager(114285)); + options.write_buffer_manager.reset(new WriteBufferManager(100000)); } else { - options.write_buffer_manager.reset(new WriteBufferManager(114285, cache)); + options.write_buffer_manager.reset(new WriteBufferManager(100000, cache)); } options.write_buffer_size = 500000; // this is never hit CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); @@ -366,7 +367,6 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); - flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); @@ -512,10 +512,8 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); options.write_buffer_size = 500000; // this is never hit - // Use a write buffer total size so that the soft limit is about - // 105000. - options.write_buffer_manager.reset(new WriteBufferManager(120000)); - CreateAndReopenWithCF({"cf1", "cf2"}, options); + options.write_buffer_manager.reset(new WriteBufferManager(100000)); + CreateAndReopenWithCF({"cf1"}, options); ASSERT_OK(DestroyDB(dbname2, options)); DB* db2 = nullptr; @@ -527,7 +525,6 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { std::function wait_flush = [&]() { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); // Ensure background work is fully finished including listener callbacks // before accessing listener state. 
@@ -536,49 +533,134 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { static_cast_with_check(db2)->TEST_WaitForBackgroundWork()); }; - // Trigger a flush on cf2 - flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; - ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); - wait_flush(); + // Trigger a flush on DB1.cf1 + flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); wait_flush(); + ASSERT_OK(Put(1, Key(1), DummyString(70000), wo)); + wait_flush(); // Insert to DB2 + // [20000, 70000, 20000] ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000))); wait_flush(); - ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); wait_flush(); - ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") + - GetNumberOfSstFilesForColumnFamily(db_, "cf1") + - GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(0)); } - // Triggering to flush another CF in DB1 + // Triggering to flush DB2 by writing to DB1 + // [20000, 0, 90000] ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000))); wait_flush(); - ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); - wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), - static_cast(1)); + static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + } + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); + wait_flush(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(1)); + } + + // Triggering flush in DB2 by writing to DB2 + // [20000, 0, 80000] + ASSERT_OK(db2->Put(wo, Key(3), DummyString(80000))); + ASSERT_OK(db2->Put(wo, Key(1), DummyString(10000))); + wait_flush(); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(0)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(2)); } - // Triggering flush in DB2. - ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000))); + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB_RankByAge) { + std::string dbname2 = test::PerThreadDBPath("db_shared_wb_age_db2"); + Options options = CurrentOptions(); + options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. 
+ options.avoid_flush_during_shutdown = true; + // Avoid undeterministic value by malloc_usable_size(); + // Force arena block size to 1 + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Arena::Arena:0", [&](void* arg) { + size_t* block_size = static_cast(arg); + *block_size = 1; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "Arena::AllocateNewBlock:0", [&](void* arg) { + std::pair* pair = + static_cast*>(arg); + *std::get<0>(*pair) = *std::get<1>(*pair); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + options.write_buffer_size = 500000; // this is never hit + options.write_buffer_manager.reset(new WriteBufferManager( + 100000, nullptr /*cache*/, 0.0 /*stall_ratio*/, true /*flush_oldest*/)); + + auto mock_clock = std::make_shared(SystemClock::Default()); + options.env = new CompositeEnvWrapper(options.env, mock_clock); + + CreateAndReopenWithCF({"cf1"}, options); + + ASSERT_OK(DestroyDB(dbname2, options)); + DB* db2 = nullptr; + ASSERT_OK(DB::Open(options, dbname2, &db2)); + + WriteOptions wo; + wo.disableWAL = true; + + std::function wait_flush = [&]() { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); + }; + + // Trigger a flush on DB1.cf2 + flush_listener->expected_flush_reason = FlushReason::kManualFlush; + mock_clock->SetCurrentTime(50); + ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); + wait_flush(); + mock_clock->SetCurrentTime(100); + ASSERT_OK(Put(1, Key(1), DummyString(70000), wo)); + wait_flush(); + mock_clock->SetCurrentTime(150); + + // Insert to DB2 + // [20000, 70000, 20000] + ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000))); wait_flush(); - ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); + + ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { @@ -586,10 +668,31 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(0)); - ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); + } + + // Triggering to flush DB1 by writing to DB2 + // [20000, 0, 90000] + ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000))); + wait_flush(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), + static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); + } + ASSERT_OK(db2->Put(wo, Key(3), DummyString(1))); + wait_flush(); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), + static_cast(0)); } delete db2; diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 28d67527fe9..6d46be7ea1a 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -617,6 +617,22 @@ void DBTestBase::ReopenWithColumnFamilies(const std::vector& cfs, ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } +void DBTestBase::OpenWithCFWriteBufferManager( + const std::vector& cfs, + const std::vector> wbms, + const Options& options) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), 
kDefaultColumnFamilyName); + std::vector cf_options; + for (size_t i = 0; i < wbms.size(); ++i) { + auto o = options; + o.cf_write_buffer_manager = wbms[i]; + cf_options.push_back(o); + } + ReopenWithColumnFamilies(cfs_plus_default, cf_options); +} + void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) { time_elapse_only_sleep_on_reopen_ = true; diff --git a/db/db_test_util.h b/db/db_test_util.h index dc34352dc2e..f1298dc6bf5 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -1118,6 +1118,11 @@ class DBTestBase : public testing::Test { Status TryReopenWithColumnFamilies(const std::vector& cfs, const Options& options); + void OpenWithCFWriteBufferManager( + const std::vector& cfs, + const std::vector> wbms, + const Options& options); + void Reopen(const Options& options); void Close(); diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc index eb33ec41e12..28dc9908c6a 100644 --- a/db/db_write_buffer_manager_test.cc +++ b/db/db_write_buffer_manager_test.cc @@ -31,10 +31,10 @@ TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } WriteOptions wo; @@ -74,10 +74,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } WriteOptions wo; wo.disableWAL = true; @@ -179,6 +179,374 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +// Compared with `SharedWriteBufferAcrossCFs2` this test uses CF based write +// buffer manager CF level write buffer manager will not block write even +// exceeds the stall threshold DB level write buffer manager will block all +// write including CFs not use it. 
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs3) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + std::shared_ptr cf_write_buffer_manager; + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 1.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, cache, 1.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 1.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, nullptr, 1.0)); + } + + WriteOptions wo; + wo.disableWAL = true; + + std::vector cfs = {"cf1", "cf2", "cf3", "cf4", "cf5"}; + std::vector> wbms = { + nullptr, + nullptr, + nullptr, + nullptr, + cf_write_buffer_manager, + cf_write_buffer_manager}; + OpenWithCFWriteBufferManager(cfs, wbms, options); + auto opts = db_->GetOptions(); + + ASSERT_OK(Put(4, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(5, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(4, Key(1), DummyString(40000), wo)); + // Now, cf_write_buffer_manager reaches the stall level, but it will not block + // the write + + int num_writers_total = 6; + for (int i = 0; i < num_writers_total; i++) { + ASSERT_OK(Put(i, Key(1), DummyString(1), wo)); + } + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers1 = 4; // default, cf1-cf3 + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. 
+ if (w_set.size() == (unsigned long)num_writers1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + threads.emplace_back(writer, 1); + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers_total; i++) { + threads.emplace_back(writer, i % 6); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_writers_total); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple WriteBufferManager are independent to flush +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs4) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + std::shared_ptr cf_write_buffer_manager; + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 0.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, cache, 0.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 0.0)); + cf_write_buffer_manager.reset(new WriteBufferManager(100000, nullptr, 0.0)); + } + + WriteOptions wo; + wo.disableWAL = true; + + std::vector cfs = {"cf1", "cf2", "cf3", "cf4", "cf5"}; + std::vector> wbms = { + nullptr, + nullptr, + nullptr, + nullptr, + cf_write_buffer_manager, + cf_write_buffer_manager}; + OpenWithCFWriteBufferManager(cfs, wbms, options); + + ASSERT_OK(Put(4, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(4, Key(1), DummyString(40000), wo)); + + ASSERT_OK(Put(1, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(30000), wo)); + + ASSERT_OK(Put(5, Key(1), DummyString(50000), wo)); + + // The second WriteBufferManager::buffer_size_ has exceeded after the previous + // write is completed. 
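For readers outside the test harness: OpenWithCFWriteBufferManager attaches the second manager by setting the new cf_write_buffer_manager option on the options used for cf4/cf5. A condensed sketch of the same wiring against the public API, assuming cf_write_buffer_manager is a column-family-level option of type std::shared_ptr<WriteBufferManager> (as its use in db_test_util.cc suggests) and the WriteBufferManager(flush_size, cache, stall_ratio) constructor seen throughout this patch; the path and CF name are illustrative:

#include <memory>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

namespace ROCKSDB_NAMESPACE {

Status OpenWithDedicatedCfManager(const std::string& path, DB** db,
                                  std::vector<ColumnFamilyHandle*>* handles) {
  Options base;
  base.create_if_missing = true;
  base.create_missing_column_families = true;
  // DB-wide manager; stall_ratio 1.0 enables stalling, 0.0 disables it
  // (matching how the updated tests use this argument).
  base.write_buffer_manager =
      std::make_shared<WriteBufferManager>(100000, nullptr, 1.0);

  // Dedicated manager for one column family, accounted separately from the
  // DB-wide manager.
  auto cf_wbm = std::make_shared<WriteBufferManager>(100000, nullptr, 0.0);
  ColumnFamilyOptions plain_cf(base);
  ColumnFamilyOptions dedicated_cf(base);
  dedicated_cf.cf_write_buffer_manager = cf_wbm;

  std::vector<ColumnFamilyDescriptor> descriptors{
      {kDefaultColumnFamilyName, plain_cf}, {"dedicated", dedicated_cf}};
  return DB::Open(DBOptions(base), path, descriptors, handles, db);
}

}  // namespace ROCKSDB_NAMESPACE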
+ + std::unordered_set flush_cfs; + std::vector threads; + int num_writers_total = 6; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::Flush:ScheduleFlushReq", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + ColumnFamilyHandle* cfd = reinterpret_cast(arg); + flush_cfs.insert(cfd->GetName()); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf, int val_size) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(val_size), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + for (int i = 0; i < num_writers_total; i++) { + threads.emplace_back(writer, i % 6, 1); + } + for (auto& t : threads) { + t.join(); + } + threads.clear(); + + ASSERT_TRUE(s); + ASSERT_EQ(flush_cfs.size(), 1); + ASSERT_NE(flush_cfs.find("cf4"), flush_cfs.end()); + flush_cfs.clear(); + + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + for (int i = 0; i < num_writers_total; i++) { + threads.emplace_back(writer, i % 6, 1); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_EQ(flush_cfs.size(), 1); + ASSERT_NE(flush_cfs.find("cf1"), flush_cfs.end()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBWriteBufferManagerTest, FreeMemoryOnDestroy) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + options.max_write_buffer_number = 5; // Avoid unexpected stalling. + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 1.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 1.0)); + } + + CreateAndReopenWithCF({"cf1", "cf2"}, options); + std::string db2_name = test::PerThreadDBPath("free_memory_on_destroy_db2"); + DB* db2 = nullptr; + ASSERT_OK(DestroyDB(db2_name, options)); + ASSERT_OK(DB::Open(options, db2_name, &db2)); + + ASSERT_OK(db_->PauseBackgroundWork()); + ASSERT_OK(db2->PauseBackgroundWork()); + + WriteOptions wo; + wo.disableWAL = true; + wo.no_slowdown = true; + + ASSERT_OK(db2->Put(wo, Key(1), DummyString(30000))); + ASSERT_OK(Put(1, Key(1), DummyString(20000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + + // Decrease flush size, at least two cfs must be freed to not stall write. + options.write_buffer_manager->SetFlushSize(50000); + ASSERT_TRUE(Put(0, Key(1), DummyString(30000), wo).IsIncomplete()); + + ASSERT_OK(db2->ContinueBackgroundWork()); // Close waits on pending jobs. + // Thanks to `UnregisterDB`, we don't have to delete it to free up space. + db2->Close(); + ASSERT_TRUE(Put(0, Key(1), DummyString(30000), wo).IsIncomplete()); + + dbfull()->TEST_ClearBackgroundJobs(); // Jobs hold ref of cfd. 
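FreeMemoryOnDestroy above and DynamicFlushSize below both tune the limit at runtime through the new setter. A minimal sketch of that knob in isolation, assuming only the SetFlushSize() and flush_size() methods used in this patch; the helper name is illustrative:

#include <cassert>
#include <memory>

#include "rocksdb/write_buffer_manager.h"

namespace ROCKSDB_NAMESPACE {

// Illustrative: resize the shared flush threshold at runtime. Per the
// surrounding tests, raising it can unblock stalled writers and lowering
// it can trigger flushes on subsequent writes.
void ResizeSharedWriteBuffer(const std::shared_ptr<WriteBufferManager>& wbm,
                             size_t new_flush_size) {
  wbm->SetFlushSize(new_flush_size);
  assert(wbm->flush_size() == new_flush_size);
}

}  // namespace ROCKSDB_NAMESPACE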
+ ASSERT_OK(db_->DropColumnFamily(handles_[1])); + ASSERT_TRUE(Put(0, Key(1), DummyString(30000), wo).IsIncomplete()); + ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[1])); + handles_.erase(handles_.begin() + 1); + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + delete db2; + DestroyDB(db2_name, options); + + ASSERT_OK(db_->ContinueBackgroundWork()); +} + +TEST_P(DBWriteBufferManagerTest, DynamicFlushSize) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 1.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 1.0)); + } + + CreateAndReopenWithCF({"cf1", "cf2"}, options); + std::string db2_name = test::PerThreadDBPath("dynamic_flush_db2"); + DB* db2 = nullptr; + ASSERT_OK(DestroyDB(db2_name, options)); + ASSERT_OK(DB::Open(options, db2_name, &db2)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Increase flush size can unblock writers. + { + WriteOptions wo; + wo.disableWAL = true; + ASSERT_OK(db2->Put(wo, Key(1), DummyString(60000))); + ASSERT_OK(Put(1, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + // Write to DB. + std::vector threads; + std::atomic ready{false}; + std::function write_db = [&](DB* db) { + WriteOptions wopts; + wopts.disableWAL = true; + wopts.no_slowdown = true; + ASSERT_TRUE(db->Put(wopts, Key(3), DummyString(1)).IsIncomplete()); + ready = true; + wopts.no_slowdown = false; + ASSERT_OK(db->Put(wopts, Key(3), DummyString(1))); + }; + // Triggers db2 flush, but the flush is blocked. + threads.emplace_back(write_db, db_); + while (!ready) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + // Increase. + options.write_buffer_manager->SetFlushSize(200000); + for (auto& t : threads) { + t.join(); + } + TEST_SYNC_POINT("DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + // Decrease flush size triggers flush. + { + WriteOptions wo; + wo.disableWAL = true; + wo.no_slowdown = true; + + ASSERT_OK(Put(0, Key(1), DummyString(60000), wo)); + // All memtables must be flushed to satisfy the new flush_size. + // Not too small because memtable has a minimum size. + options.write_buffer_manager->SetFlushSize(10240); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(db2->Put(wo, Key(1), DummyString(200000))); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + db2->Close(); + delete db2; + DestroyDB(db2_name, options); +} + // Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush // is waiting to be finished but DBs tries to write meanwhile. 
TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { @@ -201,10 +569,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } CreateAndReopenWithCF({"cf1", "cf2"}, options); @@ -216,10 +584,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { wo.disableWAL = true; for (int i = 0; i < num_dbs; i++) { - ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(25000))); } // Insert to db_. - ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(25000), wo)); // WriteBufferManager Limit exceeded. std::vector threads; @@ -318,10 +686,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } CreateAndReopenWithCF({"cf1", "cf2"}, options); @@ -333,10 +701,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { wo.disableWAL = true; for (int i = 0; i < num_dbs; i++) { - ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(25000))); } // Insert to db_. - ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(25000), wo)); // WriteBufferManager::buffer_size_ has exceeded after the previous write to // dbs[0] is completed. @@ -460,10 +828,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } WriteOptions wo; wo.disableWAL = true; @@ -622,10 +990,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { if (cost_cache_) { options.write_buffer_manager.reset( - new WriteBufferManager(100000, cache, true)); + new WriteBufferManager(100000, cache, 1.0)); } else { options.write_buffer_manager.reset( - new WriteBufferManager(100000, nullptr, true)); + new WriteBufferManager(100000, nullptr, 1.0)); } CreateAndReopenWithCF({"cf1", "cf2"}, options); @@ -637,10 +1005,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { wo.disableWAL = true; for (int i = 0; i < num_dbs; i++) { - ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(25000))); } // Insert to db_. - ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(25000), wo)); // WriteBufferManager::buffer_size_ has exceeded after the previous write to // dbs[0] is completed. @@ -780,7 +1148,6 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } - // Tests a `WriteBufferManager` constructed with `allow_stall == false` does not // thrash memtable switching when full and a CF receives multiple writes. 
// Instead, we expect to switch a CF's memtable for flush only when that CF does @@ -791,7 +1158,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { // by writing to that CF's DB. // // Not supported in LITE mode due to `GetProperty()` unavailable. -TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { +TEST_P(DBWriteBufferManagerTest, DISABLED_StopSwitchingMemTablesOnceFlushing) { Options options = CurrentOptions(); options.arena_block_size = 4 << 10; // 4KB options.write_buffer_size = 1 << 20; // 1MB @@ -846,72 +1213,84 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { delete shared_wbm_db; } -TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) { - constexpr int kBigValue = 10000; +// Test write can progress even if manual compaction and background work is +// paused. +TEST_P(DBWriteBufferManagerTest, BackgroundWorkPaused) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } Options options = CurrentOptions(); - options.write_buffer_manager.reset( - new WriteBufferManager(1, nullptr /* cache */, true /* allow_stall */)); - DestroyAndReopen(options); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + options.avoid_flush_during_shutdown = true; // avoid blocking destroy forever + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); - // Pause flush thread so that - // (a) the only way to exist write stall below is to change the `allow_stall` - // (b) the write stall is "stable" without being interfered by flushes so that - // we can check it without flakiness - std::unique_ptr sleeping_task( - new test::SleepingBackgroundTask()); - env_->SetBackgroundThreads(1, Env::HIGH); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - sleeping_task.get(), Env::Priority::HIGH); - sleeping_task->WaitUntilSleeping(); - - // Test 1: test setting `allow_stall` from true to false - // - // Assert existence of a write stall - WriteOptions wo_no_slowdown; - wo_no_slowdown.no_slowdown = true; - Status s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown); - ASSERT_TRUE(s.IsIncomplete()); - ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); + // Do not enable write stall. + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, 0.0)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, 0.0)); + } + DestroyAndReopen(options); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"WBMStallInterface::BlockDB", - "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::" - "ChangeParameter"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } - // Test `SetAllowStall()` - port::Thread thread1([&] { ASSERT_OK(Put(Key(0), DummyString(kBigValue))); }); - port::Thread thread2([&] { - TEST_SYNC_POINT( - "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::" - "ChangeParameter"); - options.write_buffer_manager->SetAllowStall(false); - }); - - // Verify `allow_stall` is successfully set to false in thread2. 
- // Othwerwise, thread1's write will be stalled and this test will hang - // forever. - thread1.join(); - thread2.join(); + dbfull()->DisableManualCompaction(); + ASSERT_OK(dbfull()->PauseBackgroundWork()); + for (int i = 0; i < num_dbs; i++) { + dbs[i]->DisableManualCompaction(); + ASSERT_OK(dbs[i]->PauseBackgroundWork()); + } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + WriteOptions wo; + wo.disableWAL = true; - // Test 2: test setting `allow_stall` from false to true - // - // Assert no write stall - ASSERT_OK(Put(Key(0), DummyString(kBigValue), wo_no_slowdown)); + // Arrange the score like this: (this)2000, (0-th)100000, (1-th)1, ... + ASSERT_OK(Put(Key(1), DummyString(2000), wo)); + for (int i = 1; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(1))); + } + // Exceed the limit. + ASSERT_OK(dbs[0]->Put(wo, Key(1), DummyString(100000))); + // Write another one to trigger the flush. + ASSERT_OK(Put(Key(3), DummyString(1), wo)); - // Test `SetAllowStall()` - options.write_buffer_manager->SetAllowStall(true); + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->ContinueBackgroundWork()); + ASSERT_OK( + static_cast_with_check(dbs[i])->TEST_WaitForFlushMemTable()); + std::string property; + EXPECT_TRUE(dbs[i]->GetProperty("rocksdb.num-files-at-level0", &property)); + int num = atoi(property.c_str()); + ASSERT_EQ(num, 0); + } + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + std::string property; + EXPECT_TRUE(dbfull()->GetProperty("rocksdb.num-files-at-level0", &property)); + int num = atoi(property.c_str()); + ASSERT_EQ(num, 1); - // Verify `allow_stall` is successfully set to true. - // Otherwise the following write will not be stalled and therefore succeed. - s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown); - ASSERT_TRUE(s.IsIncomplete()); - ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); - sleeping_task->WakeUp(); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } } INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 0c6fdf849c5..daaa7ba4067 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -859,6 +859,103 @@ TEST_P(DBWriteTest, MultiThreadWrite) { Close(); } +class SimpleCallback : public PostWriteCallback { + std::function f_; + + public: + SimpleCallback(std::function&& f) : f_(f) {} + + void Callback(SequenceNumber seq) override { f_(seq); } +}; + +TEST_P(DBWriteTest, PostWriteCallback) { + Options options = GetOptions(); + if (options.two_write_queues) { + // Not compatible. 
+ return; + } + Reopen(options); + + std::vector threads; + + port::Mutex the_first_can_exit_write_mutex; + the_first_can_exit_write_mutex.Lock(); + port::Mutex can_flush_mutex; + can_flush_mutex.Lock(); + port::Mutex the_second_can_exit_write_mutex; + the_second_can_exit_write_mutex.Lock(); + + std::atomic written(0); + std::atomic flushed(false); + + threads.push_back(port::Thread([&] { + WriteBatch batch; + WriteOptions opts; + opts.sync = false; + opts.disableWAL = true; + SimpleCallback callback([&](SequenceNumber seq) { + ASSERT_NE(seq, 0); + can_flush_mutex.Unlock(); + the_first_can_exit_write_mutex.Lock(); + the_second_can_exit_write_mutex.Unlock(); + }); + batch.Put("key", "value"); + ASSERT_OK(dbfull()->Write(opts, &batch, &callback)); + written.fetch_add(1, std::memory_order_relaxed); + })); + threads.push_back(port::Thread([&] { + WriteBatch batch; + WriteOptions opts; + opts.sync = false; + opts.disableWAL = true; + SimpleCallback callback([&](SequenceNumber seq) { + ASSERT_NE(seq, 0); + the_second_can_exit_write_mutex.Lock(); + }); + batch.Put("key", "value"); + ASSERT_OK(dbfull()->Write(opts, &batch, &callback)); + written.fetch_add(1, std::memory_order_relaxed); + })); + // Flush will enter write thread and wait for pending writes. + threads.push_back(port::Thread([&] { + FlushOptions opts; + opts.wait = false; + can_flush_mutex.Lock(); + ASSERT_OK(dbfull()->Flush(opts)); + flushed.store(true, std::memory_order_relaxed); + })); + + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + ASSERT_EQ(written.load(std::memory_order_relaxed), 0); + ASSERT_EQ(flushed.load(std::memory_order_relaxed), false); + + the_first_can_exit_write_mutex.Unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + ASSERT_EQ(written.load(std::memory_order_relaxed), 2); + ASSERT_EQ(flushed.load(std::memory_order_relaxed), true); + + for (auto& t : threads) { + t.join(); + } +} + +TEST_P(DBWriteTest, PostWriteCallbackEmptyBatch) { + Options options = GetOptions(); + if (options.two_write_queues) { + // Not compatible. 
+ return; + } + Reopen(options); + WriteBatch batch; + WriteOptions opts; + opts.sync = false; + opts.disableWAL = true; + SequenceNumber seq = 0; + SimpleCallback callback([&](SequenceNumber s) { seq = s; }); + ASSERT_OK(dbfull()->Write(opts, &batch, &callback)); + ASSERT_NE(seq, 0); +} + INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/flush_job.cc b/db/flush_job.cc index a3e168823a6..4052c8b7940 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -169,7 +169,8 @@ void FlushJob::RecordFlushIOStats() { ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } -void FlushJob::PickMemTable() { +void FlushJob::PickMemTable(SequenceNumber* earliest_seqno, + SequenceNumber* largest_seqno) { db_mutex_->AssertHeld(); assert(!pick_memtable_called); pick_memtable_called = true; @@ -214,6 +215,14 @@ void FlushJob::PickMemTable() { base_ = cfd_->current(); base_->Ref(); // it is likely that we do not need this reference + if (earliest_seqno != nullptr) { + *earliest_seqno = m->GetEarliestSequenceNumber(); + } + if (largest_seqno != nullptr) { + *largest_seqno = mems_.back()->GetLargestSequenceNumber(); + } + assert(earliest_seqno == nullptr || largest_seqno == nullptr || + *earliest_seqno <= *largest_seqno); } Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, diff --git a/db/flush_job.h b/db/flush_job.h index aef33ef423a..dfe51e8366c 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -82,10 +82,13 @@ class FlushJob { // Require db_mutex held. // Once PickMemTable() is called, either Run() or Cancel() has to be called. - void PickMemTable(); + void PickMemTable(SequenceNumber* earliest_seqno = nullptr, + SequenceNumber* largest_seqno = nullptr); // @param skip_since_bg_error If not nullptr and if atomic_flush=false, // then it is set to true if flush installation is skipped and memtable // is rolled back due to existing background error. + // The earliest seqno and largest seqno will be returned through the + // parameters. Status Run(LogsWithPrepTracker* prep_tracker = nullptr, FileMetaData* file_meta = nullptr, bool* switched_to_mempurge = nullptr, diff --git a/db/memtable.cc b/db/memtable.cc index 0b8786bc2ff..e4f0804695f 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -78,13 +78,12 @@ MemTable::MemTable(const InternalKeyComparator& cmp, refs_(0), kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)), mem_tracker_(write_buffer_manager), - arena_(moptions_.arena_block_size, - (write_buffer_manager != nullptr && - (write_buffer_manager->enabled() || - write_buffer_manager->cost_to_cache())) - ? &mem_tracker_ - : nullptr, - mutable_cf_options.memtable_huge_page_size), + arena_( + moptions_.arena_block_size, + (write_buffer_manager != nullptr && (write_buffer_manager->enabled())) + ? &mem_tracker_ + : nullptr, + mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), ioptions.logger, column_family_id)), @@ -102,6 +101,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, file_number_(0), first_seqno_(0), earliest_seqno_(latest_seq), + largest_seqno_(latest_seq), creation_seq_(latest_seq), mem_next_logfile_number_(0), min_prep_log_referenced_(0), @@ -109,6 +109,12 @@ MemTable::MemTable(const InternalKeyComparator& cmp, ? 
moptions_.inplace_update_num_locks : 0), prefix_extractor_(mutable_cf_options.prefix_extractor.get()), + needs_bloom_filter_( + (prefix_extractor_ || moptions_.memtable_whole_key_filtering) && + moptions_.memtable_prefix_bloom_bits > 0), + bloom_filter_ptr_(nullptr), + bloom_filter_(nullptr), + logger_(ioptions.logger), flush_state_(FLUSH_NOT_REQUESTED), clock_(ioptions.clock), insert_with_hint_prefix_extractor_( @@ -122,14 +128,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp, // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); - // use bloom_filter_ for both whole key and prefix bloom filter - if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) && - moptions_.memtable_prefix_bloom_bits > 0) { - bloom_filter_.reset( - new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, - 6 /* hard coded 6 probes */, - moptions_.memtable_huge_page_size, ioptions.logger)); - } // Initialize cached_range_tombstone_ here since it could // be read before it is constructed in MemTable::Add(), which could also lead // to a data race on the global mutex table backing atomic shared_ptr. @@ -361,8 +359,8 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator : public InternalIterator { public: - MemTableIterator(const MemTable& mem, const ReadOptions& read_options, - Arena* arena, bool use_range_del_table = false) + MemTableIterator(MemTable& mem, const ReadOptions& read_options, Arena* arena, + bool use_range_del_table = false) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), comparator_(mem.comparator_), @@ -379,7 +377,7 @@ class MemTableIterator : public InternalIterator { } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && !read_options.auto_prefix_mode) { // Auto prefix mode is not implemented in memtable yet. - bloom_ = mem.bloom_filter_.get(); + bloom_ = mem.GetBloomFilter(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { iter_ = mem.table_->GetIterator(arena); @@ -772,12 +770,13 @@ Status MemTable::Add(SequenceNumber s, ValueType type, num_range_deletes_.store(val, std::memory_order_relaxed); } - if (bloom_filter_ && prefix_extractor_ && + auto bloom_filter = GetBloomFilter(); + if (bloom_filter && prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { - bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); + bloom_filter->Add(prefix_extractor_->Transform(key_without_ts)); } - if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(key_without_ts); + if (bloom_filter && moptions_.memtable_whole_key_filtering) { + bloom_filter->Add(key_without_ts); } // The first sequence number inserted into the memtable @@ -791,6 +790,9 @@ Status MemTable::Add(SequenceNumber s, ValueType type, } assert(first_seqno_.load() >= earliest_seqno_.load()); } + if (s > largest_seqno_) { + largest_seqno_.store(s, std::memory_order_relaxed); + } assert(post_process_info == nullptr); // TODO(yuzhangyu): support updating newest UDT for when `allow_concurrent` // is true. 
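// Illustrative sketch (not part of the patch): the concurrent Add() path in
// the following hunk maintains the new largest_seqno_ with a lock-free
// compare-and-swap loop, while the non-concurrent path above uses a plain
// conditional store. A minimal, self-contained version of that "store max"
// pattern (atomic_store_max is a hypothetical helper name, not an API from
// this patch):
#include <atomic>
#include <cstdint>

inline void atomic_store_max(std::atomic<uint64_t>& target, uint64_t value) {
  uint64_t cur = target.load(std::memory_order_relaxed);
  // compare_exchange_weak reloads `cur` on failure, so the loop terminates
  // once the CAS succeeds or another thread has already stored a larger value.
  while (value > cur &&
         !target.compare_exchange_weak(cur, value, std::memory_order_relaxed)) {
  }
}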
@@ -811,16 +813,17 @@ Status MemTable::Add(SequenceNumber s, ValueType type, post_process_info->num_deletes++; } - if (bloom_filter_ && prefix_extractor_ && + auto bloom_filter = GetBloomFilter(); + if (bloom_filter && prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { - bloom_filter_->AddConcurrently( + bloom_filter->AddConcurrently( prefix_extractor_->Transform(key_without_ts)); } - if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(key_without_ts); + if (bloom_filter && moptions_.memtable_whole_key_filtering) { + bloom_filter->AddConcurrently(key_without_ts); } - // atomically update first_seqno_ and earliest_seqno_. + // atomically update first_seqno_, earliest_seqno_ and largest_seqno_. uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed); while ((cur_seq_num == 0 || s < cur_seq_num) && !first_seqno_.compare_exchange_weak(cur_seq_num, s)) { @@ -831,6 +834,10 @@ Status MemTable::Add(SequenceNumber s, ValueType type, (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { } + uint64_t cur_largest_seqno = largest_seqno_.load(std::memory_order_acquire); + while (s > cur_largest_seqno && + !largest_seqno_.compare_exchange_weak(cur_largest_seqno, s)) { + } } if (type == kTypeRangeDeletion) { auto new_cache = std::make_shared(); @@ -1258,23 +1265,24 @@ bool MemTable::Get(const LookupKey& key, std::string* value, bool may_contain = true; Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz_); bool bloom_checked = false; - if (bloom_filter_) { + auto bloom_filter = GetBloomFilter(); + if (bloom_filter) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = bloom_filter_->MayContain(user_key_without_ts); + may_contain = bloom_filter->MayContain(user_key_without_ts); bloom_checked = true; } else { assert(prefix_extractor_); if (prefix_extractor_->InDomain(user_key_without_ts)) { - may_contain = bloom_filter_->MayContain( + may_contain = bloom_filter->MayContain( prefix_extractor_->Transform(user_key_without_ts)); bloom_checked = true; } } } - if (bloom_filter_ && !may_contain) { + if (bloom_filter && !may_contain) { // iter is null if prefix bloom says the key does not exist PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); *seq = kMaxSequenceNumber; @@ -1345,7 +1353,8 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, bool no_range_del = read_options.ignore_range_deletions || is_range_del_table_empty_.load(std::memory_order_relaxed); MultiGetRange temp_range(*range, range->begin(), range->end()); - if (bloom_filter_ && no_range_del) { + auto bloom_filter = GetBloomFilter(); + if (bloom_filter && no_range_del) { bool whole_key = !prefix_extractor_ || moptions_.memtable_whole_key_filtering; std::array bloom_keys; @@ -1362,7 +1371,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, range_indexes[num_keys++] = iter.index(); } } - bloom_filter_->MayContain(num_keys, bloom_keys.data(), may_match.data()); + bloom_filter->MayContain(num_keys, bloom_keys.data(), may_match.data()); for (int i = 0; i < num_keys; ++i) { if (!may_match[i]) { temp_range.SkipIndex(range_indexes[i]); diff --git a/db/memtable.h b/db/memtable.h index c55b34761ef..b2df0df816d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -425,6 +425,15 @@ class MemTable { 
return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed); } + // Returns the sequence number that is guaranteed to be larger than the + // sequence number of any key that could be inserted into this memtable. + // + // If the largest sequence number could not be determined, + // 0 will be returned. + SequenceNumber GetLargestSequenceNumber() { + return largest_seqno_.load(std::memory_order_relaxed); + } + // DB's latest sequence ID when the memtable is created. This number // may be updated to a more recent one before any key is inserted. SequenceNumber GetCreationSeq() const { return creation_seq_; } @@ -598,6 +607,9 @@ class MemTable { // if not set. std::atomic earliest_seqno_; + // The largest sequence number of writes in this memtable. + std::atomic largest_seqno_; + SequenceNumber creation_seq_; // The log files earlier than this number can be deleted. @@ -611,7 +623,14 @@ class MemTable { std::vector locks_; const SliceTransform* const prefix_extractor_; + // Bloom filter initialization is delayed to the actual read/write. This is to + // reduce memory footprint of empty memtable. + const bool needs_bloom_filter_; + std::atomic bloom_filter_ptr_; + SpinMutex bloom_filter_mutex_; std::unique_ptr bloom_filter_; + // Only used to initialize bloom filter. + Logger* logger_; std::atomic flush_state_; @@ -697,6 +716,25 @@ class MemTable { SequenceNumber s, char* checksum_ptr); void MaybeUpdateNewestUDT(const Slice& user_key); + + inline DynamicBloom* GetBloomFilter() { + if (needs_bloom_filter_) { + auto ptr = bloom_filter_ptr_.load(std::memory_order_relaxed); + if (UNLIKELY(ptr == nullptr)) { + std::lock_guard guard(bloom_filter_mutex_); + if (bloom_filter_ == nullptr) { + bloom_filter_.reset( + new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, + 6 /* hard coded 6 probes */, + moptions_.memtable_huge_page_size, logger_)); + } + ptr = bloom_filter_.get(); + bloom_filter_ptr_.store(ptr, std::memory_order_relaxed); + } + return ptr; + } + return nullptr; + } }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index dfa93461bb1..414b179e61c 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -433,6 +433,23 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, } } +void MemTableList::ExportMemtables(autovector* ret) { + const auto& memlist = current_->memlist_; + autovector tmp; + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + tmp.push_back(m); + } + // For mempurge feature. + std::sort(tmp.begin(), tmp.end(), + [](const MemTable* m1, const MemTable* m2) -> bool { + return m1->GetID() < m2->GetID(); + }); + for (auto m : tmp) { + ret->push_back(m); + } +} + void MemTableList::RollbackMemtableFlush(const autovector& mems, bool rollback_succeeding_memtables) { TEST_SYNC_POINT("RollbackMemtableFlush"); diff --git a/db/memtable_list.h b/db/memtable_list.h index 81b60288d87..7abbbf1a3e3 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -269,6 +269,9 @@ class MemTableList { autovector* mems, uint64_t* max_next_log_number = nullptr); + // Returns all memtable ordered from the oldest to the newest. + void ExportMemtables(autovector* mems); + // Reset status of the given memtable list back to pending state so that // they can get picked up again on the next round of flush. 
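// Illustrative sketch (not part of the patch): the new MemTable::GetBloomFilter()
// above delays bloom filter construction until the first read/write, using an
// atomic pointer for the fast path and a mutex for one-time construction. The
// same idea in a generic, self-contained form (LazyValue is a hypothetical
// name; the patch itself uses a SpinMutex and relaxed ordering):
#include <atomic>
#include <memory>
#include <mutex>

template <typename T>
class LazyValue {
 public:
  template <typename Factory>
  T* GetOrCreate(Factory&& make) {
    T* ptr = ptr_.load(std::memory_order_acquire);  // fast path: no lock
    if (ptr == nullptr) {
      std::lock_guard<std::mutex> guard(mu_);       // slow path: build once
      if (value_ == nullptr) {
        value_.reset(make());
      }
      ptr = value_.get();
      ptr_.store(ptr, std::memory_order_release);
    }
    return ptr;
  }

 private:
  std::atomic<T*> ptr_{nullptr};
  std::unique_ptr<T> value_;
  std::mutex mu_;
};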
// diff --git a/db/write_batch.cc b/db/write_batch.cc index 0b55cb4aae5..338dafa32ee 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -2961,6 +2961,9 @@ Status WriteBatchInternal::InsertInto( if (!w->status.ok()) { return w->status; } + if (w->post_callback) { + w->post_callback->Callback(w->sequence); + } assert(!seq_per_batch || w->batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt); } @@ -2988,6 +2991,9 @@ Status WriteBatchInternal::InsertInto( inserter.set_log_number_ref(writer->log_ref); inserter.set_prot_info(writer->multi_batch.batches[0]->prot_info_.get()); Status s = writer->multi_batch.batches[0]->Iterate(&inserter); + if (writer->post_callback && s.ok()) { + writer->post_callback->Callback(sequence); + } assert(!seq_per_batch || batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); if (concurrent_memtable_writes) { diff --git a/db/write_thread.cc b/db/write_thread.cc index b24d3667af8..66f01a753c2 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -903,6 +903,8 @@ void WriteThread::Writer::ConsumeOne(size_t claimed) { if (!s.ok()) { std::lock_guard guard(this->status_lock); this->status = s; + } else if (post_callback) { + post_callback->Callback(sequence); } multi_batch.pending_wb_cnt.fetch_sub(1, std::memory_order_acq_rel); } diff --git a/db/write_thread.h b/db/write_thread.h index b0c8fb5c435..a7c9fc6ba28 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -21,6 +21,7 @@ #include "db/trim_history_scheduler.h" #include "db/write_callback.h" #include "monitoring/instrumented_mutex.h" +#include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/status.h" #include "rocksdb/types.h" @@ -186,6 +187,7 @@ class WriteThread { size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; PostMemTableCallback* post_memtable_callback; + PostWriteCallback* post_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference WriteCallback* callback; @@ -215,6 +217,7 @@ class WriteThread { protection_bytes_per_key(0), pre_release_callback(nullptr), post_memtable_callback(nullptr), + post_callback(nullptr), log_used(0), log_ref(0), callback(nullptr), @@ -230,7 +233,8 @@ class WriteThread { WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable, size_t _batch_cnt = 0, PreReleaseCallback* _pre_release_callback = nullptr, - PostMemTableCallback* _post_memtable_callback = nullptr) + PostMemTableCallback* _post_memtable_callback = nullptr, + PostWriteCallback* _post_callback = nullptr) : sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), @@ -240,6 +244,7 @@ class WriteThread { protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), post_memtable_callback(_post_memtable_callback), + post_callback(_post_callback), log_used(0), log_ref(_log_ref), callback(_callback), @@ -257,7 +262,8 @@ class WriteThread { Writer(const WriteOptions& write_options, std::vector&& _batch, WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable, PreReleaseCallback* _pre_release_callback = nullptr, - PostMemTableCallback* _post_memtable_callback = nullptr) + PostMemTableCallback* _post_memtable_callback = nullptr, + PostWriteCallback* _post_callback = nullptr) : sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), @@ -266,6 +272,7 @@ class 
WriteThread { batch_cnt(0), pre_release_callback(_pre_release_callback), post_memtable_callback(_post_memtable_callback), + post_callback(_post_callback), log_used(0), log_ref(_log_ref), callback(_callback), diff --git a/encryption/encryption.cc b/encryption/encryption.cc index 02f7f1bdc7b..dd9f3ca0d4f 100644 --- a/encryption/encryption.cc +++ b/encryption/encryption.cc @@ -529,15 +529,24 @@ Status KeyManagedEncryptedEnv::RenameFile(const std::string& src_fname, } s = target()->RenameFile(src_fname, dst_fname); if (s.ok()) { - s = key_manager_->DeleteFile(src_fname); + s = key_manager_->DeleteFileExt(src_fname, dst_fname); } else { Status delete_status __attribute__((__unused__)) = - key_manager_->DeleteFile(dst_fname); + key_manager_->DeleteFileExt(dst_fname, src_fname); assert(delete_status.ok()); } return s; } +Status KeyManagedEncryptedEnv::DeleteDir(const std::string& dname) { + // We don't guarantee atomicity. Delete keys first. + Status s = key_manager_->DeleteFile(dname); + if (!s.ok()) { + return s; + } + return target()->DeleteDir(dname); +} + Env* NewKeyManagedEncryptedEnv(Env* base_env, std::shared_ptr& key_manager) { std::shared_ptr provider( diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 8148a29673d..30a7dbec8eb 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -2099,8 +2099,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_destroy( rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API bool rocksdb_write_buffer_manager_enabled( rocksdb_write_buffer_manager_t* wbm); -extern ROCKSDB_LIBRARY_API bool rocksdb_write_buffer_manager_cost_to_cache( - rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t rocksdb_write_buffer_manager_memory_usage(rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t @@ -2110,11 +2108,7 @@ extern ROCKSDB_LIBRARY_API size_t rocksdb_write_buffer_manager_dummy_entries_in_cache_usage( rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t -rocksdb_write_buffer_manager_buffer_size(rocksdb_write_buffer_manager_t* wbm); -extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_buffer_size( - rocksdb_write_buffer_manager_t* wbm, size_t new_size); -extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( - rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall); +rocksdb_write_buffer_manager_flush_size(rocksdb_write_buffer_manager_t* wbm); /* HyperClockCache */ diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 410ee4d3ab0..9592b8e4a7b 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -67,6 +67,7 @@ class CompactionFilter : public Customizable { kBlobIndex, // Wide-column entity kWideColumnEntity, + kDeletion, // used only by TiKV's region range filter. }; // Potential decisions that can be returned by the compaction filter's @@ -254,9 +255,13 @@ class CompactionFilter : public Customizable { case ValueType::kBlobIndex: return Decision::kKeep; - default: + case ValueType::kDeletion: + // Should not appear in this API. assert(false); return Decision::kKeep; + + default: + return Decision::kKeep; } } @@ -298,8 +303,23 @@ class CompactionFilter : public Customizable { return Decision::kKeep; } - return FilterV2(level, key, value_type, *existing_value, new_value, - skip_until); + return UnsafeFilter(level, key, value_type, *existing_value, new_value, + skip_until); + } + + // This interface is reserved for TiKV's region range filter. 
Only this + // interface can accept `value_type=kTypeDeletion`. + virtual Decision UnsafeFilter(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const { + if (value_type != ValueType::kDeletion) { + return FilterV2(level, key, value_type, existing_value, new_value, + skip_until); + } else { + return Decision::kKeep; + } } // Internal (BlobDB) use only. Do not override in application code. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 662522976b2..494f5af1865 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -149,6 +149,14 @@ struct GetMergeOperandsOptions { using TablePropertiesCollection = std::unordered_map>; +class PostWriteCallback { + public: + virtual ~PostWriteCallback() {} + + // Will be called while on the write thread after the write executes. + virtual void Callback(SequenceNumber seq) = 0; +}; + // A DB is a persistent, versioned ordered map from keys to values. // A DB is safe for concurrent access from multiple threads without // any external synchronization. @@ -323,6 +331,50 @@ class DB { // auto-resume is in progress, without waiting for it to complete. // See DBOptions::max_bgerror_resume_count and // EventListener::OnErrorRecoveryBegin + // Merge multiple DBs into this one. All DBs must have disjoint internal + // keys. + // + // # Tips + // + // The provided DBs must be disjoint: their internal key ranges don't overlap + // each other. Calling `CompactRange` on the complementary ranges can make + // sure user-visible key range consistent with internal key range. Caveats are + // (1) sometimes `bottommost_level_compaction` needs to be configured to avoid + // trivial move; (2) range tombstones are very tricky, they might be retained + // even if there's no out-of-ranges key. + // + // To avoid triggering L0 (or Memtable) stall conditions, user can consider + // dynamically decreasing the corresponding limits before entering merge. + // + // WAL merge is not supported. User must write with disableWAL=true, or wait + // for all WALs to be retired before merging. + // + // To have the best performance, use the same `block_cache` and + // `prefix_extractor` in DB options. + // + // # Safety + // + // Performing merge on DBs that are still undergoing writes results in + // undefined behavior. + // + // Using different implementations of user comparator results in undefined + // behavior as well. + // + // Concurrently apply several merge operations on the same instance can cause + // deadlock. + // + virtual Status MergeDisjointInstances( + const MergeInstanceOptions& /*merge_options*/, + const std::vector& /*instances*/) { + return Status::NotSupported("`MergeDisjointInstances` not implemented"); + } + + // Check all data written before this call is in the range [begin, end). + // Return InvalidArgument if not. + virtual Status CheckInRange(const Slice* /*begin*/, const Slice* /*end*/) { + return Status::NotSupported("`AssertInRange` not implemented"); + } + virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. This should be @@ -540,18 +592,30 @@ class DB { // options.sync=true. // Returns OK on success, non-OK on failure. // Note: consider setting options.sync = true. 
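// Illustrative sketch (not part of the patch): how a caller might drive
// MergeDisjointInstances following the Tips above. The flush step and option
// values are assumptions; instances must hold disjoint key ranges and must not
// be receiving writes while the merge runs. Assumes the rocksdb headers and
// namespace are in scope.
Status MergeShards(DB* target, const std::vector<DB*>& sources) {
  // WAL merge is not supported: flush the sources first (or only ever write
  // them with disableWAL=true). For brevity only the default CF is flushed.
  for (DB* src : sources) {
    Status s = src->Flush(FlushOptions());
    if (!s.ok()) {
      return s;
    }
  }
  MergeInstanceOptions mopts;
  mopts.merge_memtable = true;       // sources hold no WAL data after flush
  mopts.allow_source_write = false;  // unlocks additional optimizations
  mopts.max_preload_files = 16;      // default; negative means no limit
  return target->MergeDisjointInstances(mopts, sources);
}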
- virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; + virtual Status Write(const WriteOptions& options, WriteBatch* updates, + PostWriteCallback* callback) = 0; + virtual Status Write(const WriteOptions& options, WriteBatch* updates) { + return Write(options, updates, nullptr); + } virtual Status MultiBatchWrite(const WriteOptions& /*options*/, - std::vector&& /*updates*/) { + std::vector&& /*updates*/, + PostWriteCallback* /*callback*/) { return Status::NotSupported(); } + virtual Status MultiBatchWrite(const WriteOptions& options, + std::vector&& updates) { + return MultiBatchWrite(options, std::move(updates), nullptr); + } + // If the column family specified by "column_family" contains an entry for // "key", return the corresponding value in "*value". If the entry is a plain // key-value, return the value as-is; if it is a wide-column entity, return // the value of its default anonymous column (see kDefaultWideColumnName) if // any, or an empty value otherwise. + // If the database contains an entry for "key" store the + // corresponding value in *value and return OK. // // If timestamp is enabled and a non-null timestamp pointer is passed in, // timestamp is returned. @@ -1407,6 +1471,10 @@ class DB { GetApproximateMemTableStats(DefaultColumnFamily(), range, count, size); } + virtual void GetApproximateActiveMemTableStats( + ColumnFamilyHandle* /*column_family*/, uint64_t* const /*memory_bytes*/, + uint64_t* const /*oldest_key_time*/) {} + // Compact the underlying storage for the key range [*begin,*end]. // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, diff --git a/include/rocksdb/encryption.h b/include/rocksdb/encryption.h index b8f5e91e985..f1257d697ed 100644 --- a/include/rocksdb/encryption.h +++ b/include/rocksdb/encryption.h @@ -54,9 +54,17 @@ class KeyManager { FileEncryptionInfo* file_info) = 0; virtual Status NewFile(const std::string& fname, FileEncryptionInfo* file_info) = 0; + // Used with both file and directory. virtual Status DeleteFile(const std::string& fname) = 0; virtual Status LinkFile(const std::string& src_fname, const std::string& dst_fname) = 0; + // Provide additional hint of physical file when the key name doesn't map to + // one. A typical use case of this is atomically deleting a directory by + // renaming it first. + virtual Status DeleteFileExt(const std::string& fname, + const std::string& /*physical_fname*/) { + return DeleteFile(fname); + } }; // An Env with underlying files being encrypted. It holds a reference to an @@ -96,6 +104,8 @@ class KeyManagedEncryptedEnv : public EnvWrapper { Status RenameFile(const std::string& src_fname, const std::string& dst_fname) override; + Status DeleteDir(const std::string& dname) override; + private: const std::shared_ptr key_manager_; const std::shared_ptr provider_; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 787ed206ae8..063c51071cc 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -487,6 +487,8 @@ struct MemTableInfo { // memtable. It can then be assumed that any write with a larger(or equal) // sequence number will be present in this memtable or a later memtable. SequenceNumber earliest_seqno; + // The largest sequence number of writes in this memtable. 
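// Illustrative sketch (not part of the patch): a caller-side PostWriteCallback
// for the new Write() overload above. It records the sequence number assigned
// to the batch; the callback runs on the write thread after the write
// executes, mirroring the SimpleCallback helper in db_write_test.cc earlier in
// this patch.
#include <atomic>

class RecordSeqCallback : public PostWriteCallback {
 public:
  void Callback(SequenceNumber seq) override {
    seq_.store(seq, std::memory_order_relaxed);
  }
  SequenceNumber seq() const { return seq_.load(std::memory_order_relaxed); }

 private:
  std::atomic<SequenceNumber> seq_{0};
};

// Assumed usage with an open DB `db` and a populated WriteBatch `batch`:
//   RecordSeqCallback cb;
//   WriteOptions wo;
//   wo.disableWAL = true;                   // WAL-less writes, as in the tests
//   Status s = db->Write(wo, &batch, &cb);
//   // On success, cb.seq() holds the sequence assigned to the batch.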
+ SequenceNumber largest_seqno; // Total number of entries in memtable uint64_t num_entries; // Total number of deletes in memtable diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index a33f8eea4bb..fffbe8bd1ba 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -349,6 +349,13 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint32_t memtable_max_range_deletions = 0; + // Column family based write buffer manager, if this is set, this column + // facmily will not report memtable memory usage to the write buffer manager + // in DBImpl. + // + // Default: null + std::shared_ptr cf_write_buffer_manager = nullptr; + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -1885,7 +1892,23 @@ struct FlushOptions { // is performed by someone else (foreground call or background thread). // Default: false bool allow_write_stall; - FlushOptions() : wait(true), allow_write_stall(false) {} + // Only flush memtable if it has the expected oldest key time. + // This option is ignored for atomic flush. Zero means disabling the check. + // Default: 0 + uint64_t expected_oldest_key_time; + // Abort flush if compaction is disabled via `DisableManualCompaction`. + // Default: false + bool check_if_compaction_disabled; + // Used by RocksDB internally. + // Default: false + bool _write_stopped; + + FlushOptions() + : wait(true), + allow_write_stall(false), + expected_oldest_key_time(0), + check_if_compaction_disabled(false), + _write_stopped(false) {} }; // Create a Logger from provided DBOptions @@ -2208,4 +2231,15 @@ struct WaitForCompactOptions { std::chrono::microseconds timeout = std::chrono::microseconds::zero(); }; +struct MergeInstanceOptions { + // Whether to merge memtable. WAL must be empty to perform a memtable merge. + // Either write with disableWAL=true, or flush memtables before merge. + bool merge_memtable = false; + // Whether or not writes to source DBs are still allowed after the merge. + // Some optimizations are possible only with this flag set to false. + bool allow_source_write = true; + // No limit if negative. + int max_preload_files = 16; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 0d7eb59499e..b105bf3e7d8 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -145,6 +145,7 @@ class PinnableSlice : public Slice, public Cleanable { // No copy constructor and copy assignment allowed. 
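// Illustrative sketch (not part of the patch): opening a DB where one column
// family opts out of the DB-wide accounting via the new
// cf_write_buffer_manager option. The path, CF name and 64 MB threshold are
// assumptions; assumes the rocksdb headers and namespace are in scope.
Status OpenWithPerCfManager(DB** db, std::vector<ColumnFamilyHandle*>* handles) {
  Options db_opts;
  db_opts.create_if_missing = true;
  db_opts.create_missing_column_families = true;

  ColumnFamilyOptions raft_cf;
  // Memtables of this CF are accounted by a dedicated manager instead of the
  // DB-wide one set in DBOptions::write_buffer_manager.
  raft_cf.cf_write_buffer_manager =
      std::make_shared<WriteBufferManager>(64 << 20);

  std::vector<ColumnFamilyDescriptor> cfs;
  cfs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
  cfs.emplace_back("raft", raft_cf);
  return DB::Open(db_opts, "/tmp/per_cf_wbm_example", cfs, handles, db);
}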
PinnableSlice(PinnableSlice&) = delete; + PinnableSlice(const PinnableSlice&) = delete; PinnableSlice& operator=(PinnableSlice&) = delete; inline void PinSlice(const Slice& s, CleanupFunction f, void* arg1, diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 86e1477a4f5..c5854e1ab78 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -258,8 +258,10 @@ class StackableDB : public DB { return db_->Merge(options, column_family, key, ts, value); } - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { - return db_->Write(opts, updates); + using DB::Write; + virtual Status Write(const WriteOptions& opts, WriteBatch* updates, + PostWriteCallback* callback) override { + return db_->Write(opts, updates, callback); } using DB::NewIterator; diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index 61e75c8888e..4d840961cc4 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -22,6 +22,8 @@ namespace ROCKSDB_NAMESPACE { class CacheReservationManager; +class DB; +class ColumnFamilyHandle; // Interface to block and signal DB instances, intended for RocksDB // internal use only. Each DB instance contains ptr to StallInterface. @@ -37,34 +39,42 @@ class StallInterface { class WriteBufferManager final { public: // Parameters: - // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. - // memory_usage() won't be valid and ShouldFlush() will always return true. + // - flush_size: When the total size of mutable memtables exceeds this limit, + // the largest one will be frozen and scheduled for flush. Disabled when 0. // - // cache_: if `cache` is provided, we'll put dummy entries in the cache and - // cost the memory allocated to the cache. It can be used even if _buffer_size - // = 0. + // Immutable memtables are excluded for this reason: RocksDB always schedule + // a flush for newly created immutable memtable. We can consider them evicted + // from memory if flush bandwidth is sufficient. // - // allow_stall: if set true, it will enable stalling of writes when - // memory_usage() exceeds buffer_size. It will wait for flush to complete and - // memory usage to drop down. - explicit WriteBufferManager(size_t _buffer_size, + // It's an undefined behavior to enable/disable flush limit after the manager + // has been used by a DB instance. + // + // - stall_ratio: When the total size of memtables exceeds ratio*flush_size, + // user writes will be delayed. Disabled when smaller than 1. + // + // - flush_oldest_first: By default we freeze the largest mutable memtable + // when `flush_size` is triggered. By enabling this flag, the oldest mutable + // memtable will be frozen instead. + // + // - cache: if `cache` is provided, memtable memory will be charged as a + // dummy entry This is useful to keep the memory sum of both memtable and + // block cache under control. + explicit WriteBufferManager(size_t flush_size, std::shared_ptr cache = {}, - bool allow_stall = false); + float stall_ratio = 0.0, + bool flush_oldest_first = false); // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; ~WriteBufferManager(); - // Returns true if buffer_limit is passed to limit the total memory usage and - // is greater than 0. 
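// Illustrative sketch (not part of the patch): the main configurations of the
// reworked constructor documented above. Sizes and the cache are arbitrary;
// assumes the rocksdb headers and namespace are in scope.
auto cache = NewLRUCache(1 << 30);

// Flush-only: freeze the largest mutable memtable once mutable memtables
// exceed 512 MB in total; never stall user writes (stall_ratio < 1).
auto flush_only = std::make_shared<WriteBufferManager>(512 << 20);

// Same flush threshold, but additionally delay user writes once total
// memtable memory reaches 1.5 * flush_size.
auto with_stall =
    std::make_shared<WriteBufferManager>(512 << 20, nullptr, 1.5f);

// Freeze the oldest mutable memtable instead of the largest, and charge
// memtable memory to the block cache as dummy entries.
auto oldest_first = std::make_shared<WriteBufferManager>(
    512 << 20, cache, 0.0f, /*flush_oldest_first=*/true);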
- bool enabled() const { return buffer_size() > 0; } - - // Returns true if pointer to cache is passed. - bool cost_to_cache() const { return cache_res_mgr_ != nullptr; } + // Returns true if a non-zero buffer_limit is passed to limit the total + // memory usage or cache is provided to charge write buffer memory. + bool enabled() const { return flush_size() > 0 || cache_res_mgr_ != nullptr; } // Returns the total memory used by memtables. - // Only valid if enabled() + // Only valid if enabled(). size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); } @@ -76,45 +86,29 @@ class WriteBufferManager final { size_t dummy_entries_in_cache_usage() const; - // Returns the buffer_size. - size_t buffer_size() const { - return buffer_size_.load(std::memory_order_relaxed); + // Returns the flush_size. + size_t flush_size() const { + return flush_size_.load(std::memory_order_relaxed); } - // REQUIRED: `new_size` > 0 - void SetBufferSize(size_t new_size) { - assert(new_size > 0); - buffer_size_.store(new_size, std::memory_order_relaxed); - mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); - // Check if stall is active and can be ended. - MaybeEndWriteStall(); + size_t stall_size() const { + return static_cast(flush_size() * stall_ratio_); } - void SetAllowStall(bool new_allow_stall) { - allow_stall_.store(new_allow_stall, std::memory_order_relaxed); - MaybeEndWriteStall(); + void SetFlushSize(size_t new_size); + + void SetFlushOldestFirst(bool v) { + flush_oldest_first_.store(v, std::memory_order_relaxed); } // Below functions should be called by RocksDB internally. - // Should only be called from write thread - bool ShouldFlush() const { - if (enabled()) { - if (mutable_memtable_memory_usage() > - mutable_limit_.load(std::memory_order_relaxed)) { - return true; - } - size_t local_size = buffer_size(); - if (memory_usage() >= local_size && - mutable_memtable_memory_usage() >= local_size / 2) { - // If the memory exceeds the buffer size, we trigger more aggressive - // flush. But if already more than half memory is being flushed, - // triggering more flush may not help. We will hold it instead. - return true; - } - } - return false; - } + // This handle is the same as the one created by `DB::Open` or + // `DB::CreateColumnFamily`. + // Must be called not holding db mutex and not inside write thread. + // `UnregisterColumnFamily()` must be called by DB before the handle is + // destroyed. + void RegisterColumnFamily(DB* db, ColumnFamilyHandle* cf); // Returns true if total memory usage exceeded buffer_size. // We stall the writes untill memory_usage drops below buffer_size. When the @@ -124,22 +118,18 @@ class WriteBufferManager final { // // Should only be called by RocksDB internally . bool ShouldStall() const { - if (!allow_stall_.load(std::memory_order_relaxed) || !enabled()) { + if (!allow_stall_ || flush_size() == 0) { return false; } - - return IsStallActive() || IsStallThresholdExceeded(); + return is_stall_active() || is_stall_threshold_exceeded(); } + // Called during `DB::Close`. + // Must be called not holding db mutex and not inside write thread. + void UnregisterDB(DB* db); - // Returns true if stall is active. - bool IsStallActive() const { - return stall_active_.load(std::memory_order_relaxed); - } - - // Returns true if stalling condition is met. - bool IsStallThresholdExceeded() const { - return memory_usage() >= buffer_size_; - } + // Called during `DestroyColumnFamilyHandle`. 
+ // Must be called not holding db mutex and not inside write thread. + void UnregisterColumnFamily(ColumnFamilyHandle* cf); void ReserveMem(size_t mem); @@ -149,6 +139,36 @@ class WriteBufferManager final { void FreeMem(size_t mem); + // Whether the DB writer should call `MaybeFlush` before write. + bool ShouldFlush() { + size_t local_size = flush_size(); + return local_size > 0 && mutable_memtable_memory_usage() >= local_size; + } + + // Must be called without holding db mutex. When called in write thread, + // must pass in the pointer to the db. + void MaybeFlush(DB* this_db) { + if (sentinels_mu_.try_lock()) { + MaybeFlushLocked(this_db); + sentinels_mu_.unlock(); + } + } + + // Must ensure that the mutex of all dbs except this_db are not held. If + // this_db is not nullptr, the mutex of it must be held. + void MaybeFlushLocked(DB* this_db = nullptr); + + // Returns true if stall is active. + bool is_stall_active() const { + return stall_active_.load(std::memory_order_relaxed); + } + + // Returns true if stalling condition is met. Only valid if buffer_size_ is + // non-zero. + bool is_stall_threshold_exceeded() const { + return memory_usage() >= stall_size(); + } + // Add the DB instance to the queue and block the DB. // Should only be called by RocksDB internally. void BeginWriteStall(StallInterface* wbm_stall); @@ -157,26 +177,41 @@ class WriteBufferManager final { // signal them to continue. void MaybeEndWriteStall(); - void RemoveDBFromQueue(StallInterface* wbm_stall); + // Called when DB instance is closed. + void RemoveFromStallQueue(StallInterface* wbm_stall); private: - std::atomic buffer_size_; - std::atomic mutable_limit_; + struct WriteBufferSentinel { + DB* db; + ColumnFamilyHandle* cf; + }; + // Protected by `sentinels_mu_`. + std::list> sentinels_; + std::mutex sentinels_mu_; + + // Shared by flush_size limit and cache charging. + // When cache charging is enabled, this is updated under cache_res_mgr_mu_. std::atomic memory_used_; - // Memory that hasn't been scheduled to free. + + std::atomic flush_size_; + // Only used when flush_size is non-zero. std::atomic memory_active_; - std::shared_ptr cache_res_mgr_; - // Protects cache_res_mgr_ - std::mutex cache_res_mgr_mu_; + std::atomic flush_oldest_first_; + const bool allow_stall_; + const float stall_ratio_; std::list queue_; // Protects the queue_ and stall_active_. - std::mutex mu_; - std::atomic allow_stall_; - // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() - // while holding mu_, but it can be read without a lock. + std::mutex stall_mu_; + // Value should only be changed by BeginWriteStall() and + // MaybeEndWriteStall() while holding mu_, but it can be read without a + // lock. 
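// Illustrative sketch (not part of the patch): how the registration and flush
// hooks above fit together, per their comments. RocksDB calls these
// internally; the sequence is spelled out here only to make the contract
// explicit. `wbm`, `db` and `cf` are assumptions.
void IllustrateWbmContract(WriteBufferManager* wbm, DB* db,
                           ColumnFamilyHandle* cf) {
  // After DB::Open / CreateColumnFamily: make the CF a flush candidate.
  wbm->RegisterColumnFamily(db, cf);

  // On the write path: cheap threshold check first, then pick and schedule a
  // non-blocking flush of the best candidate. Passing `db` tells the manager
  // which instance is currently inside its write thread.
  if (wbm->ShouldFlush()) {
    wbm->MaybeFlush(db);
  }

  // Before destroying the handle, and during DB::Close respectively.
  wbm->UnregisterColumnFamily(cf);
  wbm->UnregisterDB(db);
}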
std::atomic stall_active_; + std::shared_ptr cache_res_mgr_; + // Protects cache_res_mgr_ + std::mutex cache_res_mgr_mu_; + void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); }; diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index d780df0bf3d..d7fdd40958a 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -25,8 +25,7 @@ AllocTracker::~AllocTracker() { FreeMem(); } void AllocTracker::Allocate(size_t bytes) { assert(write_buffer_manager_ != nullptr); - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + if (write_buffer_manager_->enabled()) { bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); write_buffer_manager_->ReserveMem(bytes); } @@ -34,8 +33,7 @@ void AllocTracker::Allocate(size_t bytes) { void AllocTracker::DoneAllocating() { if (write_buffer_manager_ != nullptr && !done_allocating_) { - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + if (write_buffer_manager_->enabled()) { write_buffer_manager_->ScheduleFreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { @@ -50,8 +48,7 @@ void AllocTracker::FreeMem() { DoneAllocating(); } if (write_buffer_manager_ != nullptr && !freed_) { - if (write_buffer_manager_->enabled() || - write_buffer_manager_->cost_to_cache()) { + if (write_buffer_manager_->enabled()) { write_buffer_manager_->FreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index ce1789c20d6..2dae1b75531 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -14,20 +14,24 @@ #include "cache/cache_entry_roles.h" #include "cache/cache_reservation_manager.h" #include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "rocksdb/options.h" #include "rocksdb/status.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { -WriteBufferManager::WriteBufferManager(size_t _buffer_size, +WriteBufferManager::WriteBufferManager(size_t _flush_size, std::shared_ptr cache, - bool allow_stall) - : buffer_size_(_buffer_size), - mutable_limit_(buffer_size_ * 7 / 8), - memory_used_(0), + float stall_ratio, + bool flush_oldest_first) + : memory_used_(0), + flush_size_(_flush_size), memory_active_(0), - cache_res_mgr_(nullptr), - allow_stall_(allow_stall), - stall_active_(false) { + flush_oldest_first_(flush_oldest_first), + allow_stall_(stall_ratio >= 1.0), + stall_ratio_(stall_ratio), + stall_active_(false), + cache_res_mgr_(nullptr) { if (cache) { // Memtable's memory usage tends to fluctuate frequently // therefore we set delayed_decrease = true to save some dummy entry @@ -40,7 +44,7 @@ WriteBufferManager::WriteBufferManager(size_t _buffer_size, WriteBufferManager::~WriteBufferManager() { #ifndef NDEBUG - std::unique_lock lock(mu_); + std::unique_lock lock(stall_mu_); assert(queue_.empty()); #endif } @@ -53,13 +57,55 @@ std::size_t WriteBufferManager::dummy_entries_in_cache_usage() const { } } +void WriteBufferManager::SetFlushSize(size_t new_size) { + if (flush_size_.exchange(new_size, std::memory_order_relaxed) > new_size) { + // Threshold is decreased. We must make sure all outstanding memtables + // are flushed. + std::lock_guard lock(sentinels_mu_); + auto max_retry = sentinels_.size(); + while ((max_retry--) && ShouldFlush()) { + MaybeFlushLocked(); + } + } else { + // Check if stall is active and can be ended. 
+ MaybeEndWriteStall(); + } +} + +void WriteBufferManager::RegisterColumnFamily(DB* db, ColumnFamilyHandle* cf) { + assert(db != nullptr); + auto sentinel = std::make_shared(); + sentinel->db = db; + sentinel->cf = cf; + std::lock_guard lock(sentinels_mu_); + MaybeFlushLocked(); + sentinels_.push_back(sentinel); +} + +void WriteBufferManager::UnregisterDB(DB* db) { + std::lock_guard lock(sentinels_mu_); + sentinels_.remove_if([=](const std::shared_ptr& s) { + return s->db == db; + }); + MaybeFlushLocked(); +} + +void WriteBufferManager::UnregisterColumnFamily(ColumnFamilyHandle* cf) { + std::lock_guard lock(sentinels_mu_); + sentinels_.remove_if([=](const std::shared_ptr& s) { + return s->cf == cf; + }); + MaybeFlushLocked(); +} + void WriteBufferManager::ReserveMem(size_t mem) { + size_t local_size = flush_size(); if (cache_res_mgr_ != nullptr) { ReserveMemWithCache(mem); - } else if (enabled()) { + } else if (local_size > 0) { memory_used_.fetch_add(mem, std::memory_order_relaxed); } - if (enabled()) { + if (local_size > 0) { memory_active_.fetch_add(mem, std::memory_order_relaxed); } } @@ -84,7 +130,7 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { } void WriteBufferManager::ScheduleFreeMem(size_t mem) { - if (enabled()) { + if (flush_size() > 0) { memory_active_.fetch_sub(mem, std::memory_order_relaxed); } } @@ -92,7 +138,7 @@ void WriteBufferManager::ScheduleFreeMem(size_t mem) { void WriteBufferManager::FreeMem(size_t mem) { if (cache_res_mgr_ != nullptr) { FreeMemWithCache(mem); - } else if (enabled()) { + } else if (flush_size() > 0) { memory_used_.fetch_sub(mem, std::memory_order_relaxed); } // Check if stall is active and can be ended. @@ -115,6 +161,87 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { s.PermitUncheckedError(); } +void WriteBufferManager::MaybeFlushLocked(DB* this_db) { + if (!ShouldFlush()) { + return; + } + // Have at least one candidate to flush with + // check_if_compaction_disabled=false when all others failed. + constexpr size_t kCandidateSize = 2; + // (score, age). + using Candidate = std::tuple; + auto cmp = [](const Candidate& a, const Candidate& b) { + return std::get<1>(a) <= std::get<1>(b); + }; + std::set candidates(cmp); + + for (auto& s : sentinels_) { + // TODO: move this calculation to a callback. + uint64_t current_score = 0; + uint64_t current_memory_bytes = std::numeric_limits::max(); + uint64_t oldest_time = std::numeric_limits::max(); + s->db->GetApproximateActiveMemTableStats(s->cf, ¤t_memory_bytes, + &oldest_time); + if (flush_oldest_first_.load(std::memory_order_relaxed)) { + // Convert oldest to highest score. + current_score = std::numeric_limits::max() - oldest_time; + } else { + current_score = current_memory_bytes; + } + // A very mild penalty for too many L0 files. + uint64_t level0; + // 3 is to optimize the frequency of getting options, which uses mutex. 
+ if (s->db->GetIntProperty(DB::Properties::kNumFilesAtLevelPrefix + "0", + &level0) && + level0 >= 3) { + auto opts = s->db->GetOptions(s->cf); + if (opts.level0_file_num_compaction_trigger > 0 && + level0 >= + static_cast(opts.level0_file_num_compaction_trigger)) { + auto diff = level0 - static_cast( + opts.level0_file_num_compaction_trigger); + // 0->2, +1->4, +2->8, +3->12, +4->18 + uint64_t factor = (diff + 2) * (diff + 2) / 2; + if (factor > 100) { + factor = 100; + } + current_score = current_score * (100 - factor) / factor; + } + } + candidates.insert({s.get(), current_score, oldest_time}); + if (candidates.size() > kCandidateSize) { + candidates.erase(candidates.begin()); + } + } + + // We only flush at most one column family at a time. + // This is enough to keep size under control except when flush_size is + // dynamically decreased. That case is managed in `SetFlushSize`. + auto candidate = candidates.rbegin(); + while (candidate != candidates.rend()) { + auto sentinel = std::get<0>(*candidate); + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + flush_opts.wait = false; + flush_opts._write_stopped = (sentinel->db == this_db); + flush_opts.expected_oldest_key_time = std::get<2>(*candidate); + candidate++; + if (candidate != candidates.rend()) { + // Don't check it for the last candidate. Otherwise we could end up + // never progressing. + flush_opts.check_if_compaction_disabled = true; + } + auto s = sentinel->db->Flush(flush_opts, sentinel->cf); + if (s.ok()) { + return; + } + auto opts = sentinel->db->GetDBOptions(); + ROCKS_LOG_WARN(opts.info_log, "WriteBufferManager fails to flush: %s", + s.ToString().c_str()); + // Fallback to the next best candidate. + } +} + void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { assert(wbm_stall != nullptr); @@ -122,7 +249,7 @@ void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { std::list new_node = {wbm_stall}; { - std::unique_lock lock(mu_); + std::unique_lock lock(stall_mu_); // Verify if the stall conditions are stil active. if (ShouldStall()) { stall_active_.store(true, std::memory_order_relaxed); @@ -140,15 +267,14 @@ void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { // Called when memory is freed in FreeMem or the buffer size has changed. void WriteBufferManager::MaybeEndWriteStall() { // Stall conditions have not been resolved. - if (allow_stall_.load(std::memory_order_relaxed) && - IsStallThresholdExceeded()) { + if (allow_stall_ && is_stall_threshold_exceeded()) { return; } // Perform all deallocations outside of the lock. std::list cleanup; - std::unique_lock lock(mu_); + std::unique_lock lock(stall_mu_); if (!stall_active_.load(std::memory_order_relaxed)) { return; // Nothing to do. } @@ -163,14 +289,14 @@ void WriteBufferManager::MaybeEndWriteStall() { cleanup = std::move(queue_); } -void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { +void WriteBufferManager::RemoveFromStallQueue(StallInterface* wbm_stall) { assert(wbm_stall != nullptr); // Deallocate the removed nodes outside of the lock. 
std::list cleanup; - if (enabled() && allow_stall_.load(std::memory_order_relaxed)) { - std::unique_lock lock(mu_); + if (allow_stall_) { + std::unique_lock lock(stall_mu_); for (auto it = queue_.begin(); it != queue_.end();) { auto next = std::next(it); if (*it == wbm_stall) { diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index c992d2eabcb..9f4c5c0164b 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -24,57 +24,19 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { wbf->ReserveMem(8 * 1024 * 1024); ASSERT_FALSE(wbf->ShouldFlush()); - // 90% of the hard limit will hit the condition - wbf->ReserveMem(1 * 1024 * 1024); + wbf->ReserveMem(2 * 1024 * 1024); ASSERT_TRUE(wbf->ShouldFlush()); // Scheduling for freeing will release the condition wbf->ScheduleFreeMem(1 * 1024 * 1024); ASSERT_FALSE(wbf->ShouldFlush()); - wbf->ReserveMem(2 * 1024 * 1024); - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->ScheduleFreeMem(4 * 1024 * 1024); - // 11MB total, 6MB mutable. hard limit still hit + // change size: 8M limit. + wbf->SetFlushSize(8 * 1024 * 1024); + // 9MB mutable. ASSERT_TRUE(wbf->ShouldFlush()); wbf->ScheduleFreeMem(2 * 1024 * 1024); - // 11MB total, 4MB mutable. hard limit stills but won't flush because more - // than half data is already being flushed. - ASSERT_FALSE(wbf->ShouldFlush()); - - wbf->ReserveMem(4 * 1024 * 1024); - // 15 MB total, 8MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->FreeMem(7 * 1024 * 1024); - // 8MB total, 8MB mutable. - ASSERT_FALSE(wbf->ShouldFlush()); - - // change size: 8M limit, 7M mutable limit - wbf->SetBufferSize(8 * 1024 * 1024); - // 8MB total, 8MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->ScheduleFreeMem(2 * 1024 * 1024); - // 8MB total, 6MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->FreeMem(2 * 1024 * 1024); - // 6MB total, 6MB mutable. - ASSERT_FALSE(wbf->ShouldFlush()); - - wbf->ReserveMem(1 * 1024 * 1024); - // 7MB total, 7MB mutable. - ASSERT_FALSE(wbf->ShouldFlush()); - - wbf->ReserveMem(1 * 1024 * 1024); - // 8MB total, 8MB mutable. - ASSERT_TRUE(wbf->ShouldFlush()); - - wbf->ScheduleFreeMem(1 * 1024 * 1024); - wbf->FreeMem(1 * 1024 * 1024); - // 7MB total, 7MB mutable. + // 7MB mutable. 
ASSERT_FALSE(wbf->ShouldFlush()); } @@ -123,7 +85,6 @@ TEST_F(ChargeWriteBufferTest, Basic) { ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_GE(cache->GetPinnedUsage(), 44 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 44 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_FALSE(wbf->ShouldFlush()); // Allocate another 41MB, memory_used_ = 52045KB wbf->ReserveMem(41 * 1024 * 1024); @@ -131,19 +92,6 @@ TEST_F(ChargeWriteBufferTest, Basic) { ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 204 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_TRUE(wbf->ShouldFlush()); - - ASSERT_TRUE(wbf->ShouldFlush()); - - // Schedule free 20MB, memory_used_ = 52045KB - // It will not cause any change in memory_used and cache cost - wbf->ScheduleFreeMem(20 * 1024 * 1024); - ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); - ASSERT_GE(cache->GetPinnedUsage(), 204 * 256 * 1024); - ASSERT_LT(cache->GetPinnedUsage(), - 204 * 256 * 1024 + kMetaDataChargeOverhead); - // Still need flush as the hard limit hits - ASSERT_TRUE(wbf->ShouldFlush()); // Free 20MB, memory_used_ = 31565KB // It will releae 80 dummy entries from cache since @@ -156,8 +104,6 @@ TEST_F(ChargeWriteBufferTest, Basic) { ASSERT_LT(cache->GetPinnedUsage(), 124 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_FALSE(wbf->ShouldFlush()); - // Free 16KB, memory_used_ = 31549KB // It will not release any dummy entry since memory_used_ >= // dummy_entries_in_cache_usage * (3/4) @@ -214,8 +160,6 @@ TEST_F(ChargeWriteBufferTest, BasicWithNoBufferSizeLimit) { ASSERT_GE(cache->GetPinnedUsage(), 40 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 40 * 256 * 1024 + kMetaDataChargeOverhead); - ASSERT_FALSE(wbf->ShouldFlush()); - // Free 9MB, memory_used_ = 1024KB // It will free 36 dummy entries wbf->FreeMem(9 * 1024 * 1024); diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index ced8597a9d6..f8a45d1f4aa 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -438,6 +438,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(std::shared_ptr)}, {offsetof(struct ColumnFamilyOptions, sst_partitioner_factory), sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, cf_write_buffer_manager), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(ColumnFamilyOptions)]; diff --git a/src.mk b/src.mk index 0bc1c2e398c..4a8e6fdf6ee 100644 --- a/src.mk +++ b/src.mk @@ -55,6 +55,7 @@ LIB_SOURCES = \ db/db_impl/db_impl_readonly.cc \ db/db_impl/db_impl_secondary.cc \ db/db_impl/db_impl_write.cc \ + db/db_impl/db_impl_merge.cc \ db/db_info_dumper.cc \ db/db_iter.cc \ db/dbformat.cc \ @@ -488,6 +489,7 @@ TEST_MAIN_SOURCES = \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ db/db_merge_operand_test.cc \ + db/db_merge_test.cc \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ diff --git a/test_util/testutil.h b/test_util/testutil.h index 5a173ca40c0..6dfd649dfa1 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -85,6 +85,18 @@ class TestKeyManager : public encryption::KeyManager { Status DeleteFile(const std::string& fname) override { std::lock_guard l(mutex); file_set.erase(fname); + if (!fname.empty()) { + std::string copy = fname; + if (copy.back() != '/') { + copy.push_back('/'); + } + auto begin = file_set.lower_bound(copy); + auto end = begin; + while (end != file_set.end() && end->compare(0, copy.size(), copy) == 0) { 
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index e2f0b7bdbdd..c83260aff9e 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -196,8 +196,9 @@ class BlobDB : public StackableDB {
     return Status::NotSupported("Not supported operation in blob db.");
   }
 
-  virtual Status Write(const WriteOptions& opts,
-                       WriteBatch* updates) override = 0;
+  using rocksdb::StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates,
+                       PostWriteCallback* callback) override = 0;
 
   using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 2fa7ae898f5..1e73b42ddd9 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1003,7 +1003,8 @@ class BlobDBImpl::BlobInserter : public WriteBatch::Handler {
   void LogData(const Slice& blob) override { batch_.PutLogData(blob); }
 };
 
-Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates,
+                         PostWriteCallback* callback) {
   StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
   RecordTick(statistics_, BLOB_DB_NUM_WRITE);
   uint32_t default_cf_id =
@@ -1021,7 +1022,7 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
   if (!s.ok()) {
     return s;
   }
-  return db_->Write(options, blob_inserter.batch());
+  return db_->Write(options, blob_inserter.batch(), callback);
 }
 
 Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index d491108d3e6..205c07e9c92 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -128,7 +128,8 @@ class BlobDBImpl : public BlobDB {
                  std::vector<std::string>* values) override;
 
   using BlobDB::Write;
-  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates,
+                       PostWriteCallback* callback) override;
 
   virtual Status Close() override;
diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index e1f09451309..48f6e159e2f 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -311,7 +311,11 @@ Status CheckpointImpl::ExportColumnFamily(
   s = db_->GetEnv()->CreateDir(tmp_export_dir);
 
   if (s.ok()) {
-    s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
+    auto opts = ROCKSDB_NAMESPACE::FlushOptions();
+    // In TiKV context: If tablet is to be destroyed, its background work will
+    // be paused. Manual flush can never make progress.
+    opts.check_if_compaction_disabled = true;
+    s = db_->Flush(opts, handle);
   }
 
   ColumnFamilyMetaData db_metadata;
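The checkpoint hunk above uses the FlushOptions::check_if_compaction_disabled field introduced elsewhere in this patch. A hedged sketch of the calling pattern follows: the intent, per the comment in the hunk, is that a manual flush gives up rather than waits forever once a tablet's background work has been paused. The helper name and error handling are illustrative only, and the exact status returned in that case is not shown in this patch.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch, not part of the patch: flush one column family, but ask the DB to
// check first whether manual compaction/flush has been disabled for it
// (e.g. a TiKV tablet that is about to be destroyed) instead of stalling.
rocksdb::Status FlushIfPossible(rocksdb::DB* db,
                                rocksdb::ColumnFamilyHandle* handle) {
  rocksdb::FlushOptions opts;
  opts.wait = true;
  // Field added by this patch; see the checkpoint hunk above for its use.
  opts.check_if_compaction_disabled = true;
  return db->Flush(opts, handle);
}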
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h
index 7bc718e9bdc..3bf6ff9a422 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.h
+++ b/utilities/transactions/optimistic_transaction_db_impl.h
@@ -81,12 +81,13 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
   // Range deletions also must not be snuck into `WriteBatch`es as they are
   // incompatible with `OptimisticTransactionDB`.
-  virtual Status Write(const WriteOptions& write_opts,
-                       WriteBatch* batch) override {
+  using OptimisticTransactionDB::Write;
+  virtual Status Write(const WriteOptions& write_opts, WriteBatch* batch,
+                       PostWriteCallback* callback) override {
     if (batch->HasDeleteRange()) {
       return Status::NotSupported();
     }
-    return OptimisticTransactionDB::Write(write_opts, batch);
+    return OptimisticTransactionDB::Write(write_opts, batch, callback);
   }
 
   OccValidationPolicy GetValidatePolicy() const { return validate_policy_; }
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index e4bff782658..089849cc1b5 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -547,7 +547,8 @@ Status DBWithTTLImpl::Merge(const WriteOptions& options,
   return st;
 }
 
-Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates,
+                            PostWriteCallback* callback) {
   class Handler : public WriteBatch::Handler {
    public:
     explicit Handler(SystemClock* clock) : clock_(clock) {}
@@ -590,7 +591,7 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
   if (!st.ok()) {
     return st;
   } else {
-    return db_->Write(opts, &(handler.updates_ttl));
+    return db_->Write(opts, &(handler.updates_ttl), callback);
   }
 }
 
diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h
index b125d79b067..fbc93cc11f8 100644
--- a/utilities/ttl/db_ttl_impl.h
+++ b/utilities/ttl/db_ttl_impl.h
@@ -75,7 +75,9 @@ class DBWithTTLImpl : public DBWithTTL {
                        ColumnFamilyHandle* column_family, const Slice& key,
                        const Slice& value) override;
 
-  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+  using StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates,
+                       PostWriteCallback* callback) override;
 
   using StackableDB::NewIterator;
   virtual Iterator* NewIterator(const ReadOptions& _read_options,