From 58c0ecbc4e1ee5a3fee96f8c37f9312f782ce717 Mon Sep 17 00:00:00 2001 From: Xintao Date: Fri, 11 Jun 2021 09:16:35 +0800 Subject: [PATCH 01/16] Pick add ldb unsafe_remove_sst_file subcommand (#236) Signed-off-by: Xintao Co-authored-by: sdong Co-authored-by: Andrew Kryczka --- HISTORY.md | 3 + db/db_impl/db_impl.cc | 21 +- db/db_impl/db_impl.h | 7 +- db/db_impl/db_impl_debug.cc | 3 +- db/table_cache.cc | 22 ++ db/table_cache.h | 8 + db/version_set.cc | 57 ++++ db/version_set.h | 5 + include/rocksdb/convenience.h | 29 ++ include/rocksdb/utilities/ldb_cmd.h | 12 +- include/rocksdb/utilities/options_util.h | 10 + options/options_parser.cc | 12 +- options/options_parser.h | 5 + tools/ldb_cmd.cc | 418 +++++++++++++++-------- tools/ldb_cmd_impl.h | 53 ++- tools/ldb_cmd_test.cc | 93 ++++- tools/ldb_tool.cc | 19 +- utilities/options/options_util.cc | 39 ++- 18 files changed, 652 insertions(+), 164 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 30ecbf51aa6..3640c5f6288 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Additional Improvements ### Public API Change * DeleteRange now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. +* ldb now uses options.force_consistency_checks = true by default and "--disable_consistency_checks" is added to disable it. ### New Features * When user uses options.force_consistency_check in RocksDb, instead of crashing the process, we now pass the error back to the users without killing the process. @@ -54,6 +55,7 @@ * ldb sometimes uses a string-append merge operator if no merge operator is passed in. This is to allow users to print keys from a DB with a merge operator. * Replaces old Registra with ObjectRegistry to allow user to create custom object from string, also add LoadEnv() to Env. * Added new overload of GetApproximateSizes which gets SizeApproximationOptions object and returns a Status. 
The older overloads are redirecting their calls to this new method and no longer assert if the include_flags doesn't have either of INCLUDE_MEMTABLES or INCLUDE_FILES bits set. It's recommended to use the new method only, as it is more type safe and returns a meaningful status in case of errors. +* LDBCommandRunner::RunCommand() to return the status code as an integer, rather than call exit() using the code. ### New Features * Add argument `--secondary_path` to ldb to open the database as the secondary instance. This would keep the original DB intact. @@ -93,6 +95,7 @@ * Add an option `unordered_write` which trades snapshot guarantees with higher write throughput. When used with WRITE_PREPARED transactions with two_write_queues=true, it offers higher throughput with however no compromise on guarantees. * Allow DBImplSecondary to remove memtables with obsolete data after replaying MANIFEST and WAL. * Add an option `failed_move_fall_back_to_copy` (default is true) for external SST ingestion. When `move_files` is true and hard link fails, ingestion falls back to copy if `failed_move_fall_back_to_copy` is true. Otherwise, ingestion reports an error. +* Add command `list_file_range_deletes` in ldb, which prints out tombstones in SST files. ### Performance Improvements * Reduce binary search when iterator reseek into the same data block. 
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 7caa3a3e4a4..d022250df90 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -33,9 +33,9 @@ #include "db/error_handler.h" #include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" -#include "db/import_column_family_job.h" #include "db/flush_job.h" #include "db/forward_iterator.h" +#include "db/import_column_family_job.h" #include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -91,6 +91,7 @@ #include "tools/sst_dump_tool_imp.h" #include "util/autovector.h" #include "util/build_version.h" +#include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" @@ -864,6 +865,24 @@ void DBImpl::DumpStats() { PrintStatistics(); } +Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str) { + auto* cfh = + static_cast_with_check( + column_family); + ColumnFamilyData* cfd = cfh->cfd(); + + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + Version* version = super_version->current; + + Status s = + version->TablesRangeTombstoneSummary(max_entries_to_print, out_str); + + CleanupSuperVersion(super_version); + return s; +} + void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { if (!job_context->logs_to_free.empty()) { for (auto l : job_context->logs_to_free) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 67ba9d1e440..65661148950 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -798,6 +798,12 @@ class DBImpl : public DB { const autovector& cfds, const autovector& flush_memtable_ids, bool resuming_from_bg_err); + // Print information of all tombstones of all iterators to the std::string + // This is only used by ldb. The output might be capped. Tombstones + // printed out are not guaranteed to be in any order. 
+ Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family, + int max_entries_to_print, + std::string* out_str); #ifndef NDEBUG // Compact any files in the named level that overlap [*begin, *end] @@ -899,7 +905,6 @@ class DBImpl : public DB { void TEST_WaitForPersistStatsRun(std::function callback) const; bool TEST_IsPersistentStatsEnabled() const; size_t TEST_EstimateInMemoryStatsHistorySize() const; - #endif // NDEBUG protected: diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 082899e3a1a..566c175735a 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -9,12 +9,13 @@ #ifndef NDEBUG +#include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "monitoring/thread_status_updater.h" +#include "util/cast_util.h" namespace rocksdb { - uint64_t DBImpl::TEST_GetLevel0TotalSize() { InstrumentedMutexLock l(&mutex_); return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); diff --git a/db/table_cache.cc b/db/table_cache.cc index bc2cce0e43b..86eb661740d 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -265,6 +265,28 @@ InternalIterator* TableCache::NewIterator( return result; } +Status TableCache::GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::unique_ptr* out_iter) { + const FileDescriptor& fd = file_meta.fd; + Status s; + TableReader* t = fd.table_reader; + Cache::Handle* handle = nullptr; + if (t == nullptr) { + s = FindTable(env_options_, internal_comparator, fd, &handle); + if (s.ok()) { + t = GetTableReaderFromHandle(handle); + } + } + if (s.ok()) { + out_iter->reset(t->NewRangeTombstoneIterator(options)); + assert(out_iter); + } + return s; +} + Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, diff --git a/db/table_cache.h 
b/db/table_cache.h index 8a0277f25fb..4e4b5599c57 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -93,6 +93,14 @@ class TableCache { HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1); + // Return the range delete tombstone iterator of the file specified by + // `file_meta`. + Status GetRangeTombstoneIterator( + const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::unique_ptr* out_iter); + // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until // it returns false. As a side effect, it will insert the TableReader diff --git a/db/version_set.cc b/db/version_set.cc index 736240278d7..911dbae34ba 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1257,6 +1257,60 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { return Status::OK(); } +Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, + std::string* out_str) { + if (max_entries_to_print <= 0) { + return Status::OK(); + } + int num_entries_left = max_entries_to_print; + + std::stringstream ss; + + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.files_[level]) { + auto fname = + TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(), + file_meta->fd.GetPathId()); + + ss << "=== file : " << fname << " ===\n"; + + TableCache* table_cache = cfd_->table_cache(); + std::unique_ptr tombstone_iter; + + Status s = table_cache->GetRangeTombstoneIterator( + ReadOptions(), cfd_->internal_comparator(), *file_meta, + &tombstone_iter); + if (!s.ok()) { + return s; + } + if (tombstone_iter) { + tombstone_iter->SeekToFirst(); + + while (tombstone_iter->Valid() && num_entries_left > 0) { + ss << "start: " << tombstone_iter->start_key().ToString(true) + << " end: " << tombstone_iter->end_key().ToString(true) + << " seq: " << 
tombstone_iter->seq() << '\n'; + tombstone_iter->Next(); + num_entries_left--; + } + if (num_entries_left <= 0) { + break; + } + } + } + if (num_entries_left <= 0) { + break; + } + } + assert(num_entries_left >= 0); + if (num_entries_left <= 0) { + ss << "(results may not be complete)\n"; + } + + *out_str = ss.str(); + return Status::OK(); +} + Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { @@ -1993,6 +2047,9 @@ void VersionStorageInfo::GenerateLevelFilesBrief() { void Version::PrepareApply( const MutableCFOptions& mutable_cf_options, bool update_stats) { + TEST_SYNC_POINT_CALLBACK( + "Version::PrepareApply:forced_check", + reinterpret_cast(&storage_info_.force_consistency_checks_)); UpdateAccumulatedStats(update_stats); storage_info_.UpdateNumNonEmptyLevels(); storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options); diff --git a/db/version_set.h b/db/version_set.h index b2684254854..728afdb3bb1 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -627,6 +627,11 @@ class Version { Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, TablePropertiesCollection* props) const; + // Print summary of range delete tombstones in SST files into out_str, + // with maximum max_entries_to_print entries printed out. + Status TablesRangeTombstoneSummary(int max_entries_to_print, + std::string* out_str); + // REQUIRES: lock is held // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. 
diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index d3cbe6016ac..bc9dbe0c2ef 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -11,10 +11,39 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" namespace rocksdb { +class Env; +struct ColumnFamilyOptions; +struct DBOptions; +struct Options; + +// ConfigOptions containing the parameters/controls for +// comparing objects and converting to/from strings. +// These settings control how the methods +// treat errors (e.g. ignore_unknown_objects), the format +// of the serialization (e.g. delimiter), and how to compare +// options (sanity_level). +struct ConfigOptions { + // When true, any unused options will be ignored and OK will be returned + bool ignore_unknown_options = false; + + // If the strings are escaped (old-style?) + bool input_strings_escaped = true; + + // The separator between options when converting to a string + std::string delimiter = ";"; + + // `file_readahead_size` is used for readahead for the option file. + size_t file_readahead_size = 512 * 1024; + + // The environment to use for this option + Env* env = Env::Default(); +}; + #ifndef ROCKSDB_LITE // The following set of functions provide a way to construct RocksDB Options // from a string or a string-to-string map. 
Here're the general rule of diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index e7000742d1b..c43ef6fb212 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -16,6 +16,7 @@ #include #include +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/ldb_tool.h" @@ -56,6 +57,7 @@ class LDBCommand { static const std::string ARG_FILE_SIZE; static const std::string ARG_CREATE_IF_MISSING; static const std::string ARG_NO_VALUE; + static const std::string ARG_DISABLE_CONSISTENCY_CHECKS; struct ParsedParams { std::string cmd; @@ -80,7 +82,9 @@ class LDBCommand { bool ValidateCmdLineOptions(); - virtual Options PrepareOptionsForOpenDB(); + virtual void PrepareOptions(); + + virtual void OverrideBaseOptions(); virtual void SetDBOptions(Options options) { options_ = options; } @@ -160,6 +164,8 @@ class LDBCommand { bool try_load_options_; bool ignore_unknown_options_; + // The value passed to options.force_consistency_checks. + bool force_consistency_checks_; bool create_if_missing_; @@ -232,6 +238,7 @@ class LDBCommand { Options options_; std::vector column_families_; + ConfigOptions config_options_; LDBOptions ldb_options_; private: @@ -261,7 +268,8 @@ class LDBCommandRunner { public: static void PrintHelp(const LDBOptions& ldb_options, const char* exec_name); - static void RunCommand( + // Returns the status code to return. 0 is no error. 
+ static int RunCommand( int argc, char** argv, Options options, const LDBOptions& ldb_options, const std::vector* column_families); }; diff --git a/include/rocksdb/utilities/options_util.h b/include/rocksdb/utilities/options_util.h index a1d0fde2f5f..e6768905e3a 100644 --- a/include/rocksdb/utilities/options_util.h +++ b/include/rocksdb/utilities/options_util.h @@ -11,6 +11,7 @@ #include #include +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -67,6 +68,10 @@ Status LoadLatestOptions(const std::string& dbpath, Env* env, std::vector* cf_descs, bool ignore_unknown_options = false, std::shared_ptr* cache = {}); +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); // Similar to LoadLatestOptions, this function constructs the DBOptions // and ColumnFamilyDescriptors based on the specified RocksDB Options file. @@ -77,6 +82,11 @@ Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, std::vector* cf_descs, bool ignore_unknown_options = false, std::shared_ptr* cache = {}); +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& options_file_name, + DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache = {}); // Returns the latest options file name under the specified db path. 
Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, diff --git a/options/options_parser.cc b/options/options_parser.cc index 07bbf056582..0ae38602d5b 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -202,10 +202,20 @@ Status RocksDBOptionsParser::ParseStatement(std::string* name, Status RocksDBOptionsParser::Parse(const std::string& file_name, Env* env, bool ignore_unknown_options) { + ConfigOptions config_options; // Use default for escaped(true) and check (exact) + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.env = env; + return Parse(config_options, file_name); +} + +Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, + const std::string& file_name) { Reset(); + ConfigOptions config_options = config_options_in; + auto ignore_unknown_options = config_options.ignore_unknown_options; std::unique_ptr seq_file; - Status s = env->NewSequentialFile(file_name, &seq_file, EnvOptions()); + Status s = config_options.env->NewSequentialFile(file_name, &seq_file, EnvOptions()); if (!s.ok()) { return s; } diff --git a/options/options_parser.h b/options/options_parser.h index b2a806f179f..c1c28c51a07 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -17,6 +17,7 @@ namespace rocksdb { #ifndef ROCKSDB_LITE +struct ConfigOptions; #define ROCKSDB_OPTION_FILE_MAJOR 1 #define ROCKSDB_OPTION_FILE_MINOR 1 @@ -50,6 +51,10 @@ class RocksDBOptionsParser { Status Parse(const std::string& file_name, Env* env, bool ignore_unknown_options = false); + + Status Parse(const ConfigOptions& config_options, + const std::string& file_name); + static std::string TrimAndRemoveComment(const std::string& line, const bool trim_only = false); diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 86dfcc54e9e..91d916ba279 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -57,6 +57,8 @@ const std::string LDBCommand::ARG_TTL_START = "start_time"; const std::string 
LDBCommand::ARG_TTL_END = "end_time"; const std::string LDBCommand::ARG_TIMESTAMP = "timestamp"; const std::string LDBCommand::ARG_TRY_LOAD_OPTIONS = "try_load_options"; +const std::string LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS = + "disable_consistency_checks"; const std::string LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS = "ignore_unknown_options"; const std::string LDBCommand::ARG_FROM = "from"; @@ -261,6 +263,13 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { return new IngestExternalSstFilesCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == ListFileRangeDeletesCommand::Name()) { + return new ListFileRangeDeletesCommand(parsed_params.option_map, + parsed_params.flags); + } else if (parsed_params.cmd == UnsafeRemoveSstFileCommand::Name()) { + return new UnsafeRemoveSstFileCommand(parsed_params.cmd_params, + parsed_params.option_map, + parsed_params.flags); } return nullptr; } @@ -334,36 +343,14 @@ LDBCommand::LDBCommand(const std::map& options, timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP); try_load_options_ = IsFlagPresent(flags, ARG_TRY_LOAD_OPTIONS); ignore_unknown_options_ = IsFlagPresent(flags, ARG_IGNORE_UNKNOWN_OPTIONS); + force_consistency_checks_ = + !IsFlagPresent(flags, ARG_DISABLE_CONSISTENCY_CHECKS); + config_options_.ignore_unknown_options = + IsFlagPresent(flags, ARG_IGNORE_UNKNOWN_OPTIONS); } void LDBCommand::OpenDB() { - if (!create_if_missing_ && try_load_options_) { - Status s = LoadLatestOptions(db_path_, Env::Default(), &options_, - &column_families_, ignore_unknown_options_); - if (!s.ok() && !s.IsNotFound()) { - // Option file exists but load option file error. - std::string msg = s.ToString(); - exec_state_ = LDBCommandExecuteResult::Failed(msg); - db_ = nullptr; - return; - } - if (options_.env->FileExists(options_.wal_dir).IsNotFound()) { - options_.wal_dir = db_path_; - fprintf( - stderr, - "wal_dir loaded from the option file doesn't exist. 
Ignore it.\n"); - } - - // If merge operator is not set, set a string append operator. There is - // no harm doing it. - for (auto& cf_entry : column_families_) { - if (!cf_entry.options.merge_operator) { - cf_entry.options.merge_operator = - MergeOperators::CreateStringAppendOperator(':'); - } - } - } - options_ = PrepareOptionsForOpenDB(); + PrepareOptions(); if (!exec_state_.IsNotStarted()) { return; } @@ -391,21 +378,6 @@ void LDBCommand::OpenDB() { } db_ = db_ttl_; } else { - if (column_families_.empty()) { - // Try to figure out column family lists - std::vector cf_list; - st = DB::ListColumnFamilies(DBOptions(), db_path_, &cf_list); - // There is possible the DB doesn't exist yet, for "create if not - // "existing case". The failure is ignored here. We rely on DB::Open() - // to give us the correct error message for problem with opening - // existing DB. - if (st.ok() && cf_list.size() > 1) { - // Ignore single column family DB. - for (auto cf_name : cf_list) { - column_families_.emplace_back(cf_name, options_); - } - } - } if (is_read_only_ && secondary_path_.empty()) { if (column_families_.empty()) { st = DB::OpenForReadOnly(options_, db_path_, &db_); @@ -495,6 +467,7 @@ std::vector LDBCommand::BuildCmdLineOptions( ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN, ARG_TRY_LOAD_OPTIONS, + ARG_DISABLE_CONSISTENCY_CHECKS, ARG_IGNORE_UNKNOWN_OPTIONS, ARG_CF_NAME}; ret.insert(ret.end(), options.begin(), options.end()); @@ -549,22 +522,8 @@ bool LDBCommand::ParseStringOption( return false; } -Options LDBCommand::PrepareOptionsForOpenDB() { - ColumnFamilyOptions* cf_opts; - auto column_families_iter = - std::find_if(column_families_.begin(), column_families_.end(), - [this](const ColumnFamilyDescriptor& cf_desc) { - return cf_desc.name == column_family_name_; - }); - if (column_families_iter != column_families_.end()) { - cf_opts = &column_families_iter->options; - } else { - cf_opts = static_cast(&options_); - } - DBOptions* db_opts = static_cast(&options_); - 
db_opts->create_if_missing = false; - - std::map::const_iterator itr; +void LDBCommand::OverrideBaseOptions() { + options_.create_if_missing = false; BlockBasedTableOptions table_options; bool use_table_options = false; @@ -590,34 +549,35 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } } + options_.force_consistency_checks = force_consistency_checks_; if (use_table_options) { - cf_opts->table_factory.reset(NewBlockBasedTableFactory(table_options)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); } - itr = option_map_.find(ARG_AUTO_COMPACTION); + auto itr = option_map_.find(ARG_AUTO_COMPACTION); if (itr != option_map_.end()) { - cf_opts->disable_auto_compactions = !StringToBool(itr->second); + options_.disable_auto_compactions = !StringToBool(itr->second); } itr = option_map_.find(ARG_COMPRESSION_TYPE); if (itr != option_map_.end()) { std::string comp = itr->second; if (comp == "no") { - cf_opts->compression = kNoCompression; + options_.compression = kNoCompression; } else if (comp == "snappy") { - cf_opts->compression = kSnappyCompression; + options_.compression = kSnappyCompression; } else if (comp == "zlib") { - cf_opts->compression = kZlibCompression; + options_.compression = kZlibCompression; } else if (comp == "bzip2") { - cf_opts->compression = kBZip2Compression; + options_.compression = kBZip2Compression; } else if (comp == "lz4") { - cf_opts->compression = kLZ4Compression; + options_.compression = kLZ4Compression; } else if (comp == "lz4hc") { - cf_opts->compression = kLZ4HCCompression; + options_.compression = kLZ4HCCompression; } else if (comp == "xpress") { - cf_opts->compression = kXpressCompression; + options_.compression = kXpressCompression; } else if (comp == "zstd") { - cf_opts->compression = kZSTD; + options_.compression = kZSTD; } else { // Unknown compression. 
exec_state_ = @@ -629,7 +589,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { if (ParseIntOption(option_map_, ARG_COMPRESSION_MAX_DICT_BYTES, compression_max_dict_bytes, exec_state_)) { if (compression_max_dict_bytes >= 0) { - cf_opts->compression_opts.max_dict_bytes = compression_max_dict_bytes; + options_.compression_opts.max_dict_bytes = compression_max_dict_bytes; } else { exec_state_ = LDBCommandExecuteResult::Failed( ARG_COMPRESSION_MAX_DICT_BYTES + " must be >= 0."); @@ -640,7 +600,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE, db_write_buffer_size, exec_state_)) { if (db_write_buffer_size >= 0) { - db_opts->db_write_buffer_size = db_write_buffer_size; + options_.db_write_buffer_size = db_write_buffer_size; } else { exec_state_ = LDBCommandExecuteResult::Failed(ARG_DB_WRITE_BUFFER_SIZE + " must be >= 0."); @@ -651,7 +611,7 @@ Options LDBCommand::PrepareOptionsForOpenDB() { if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, exec_state_)) { if (write_buffer_size > 0) { - cf_opts->write_buffer_size = write_buffer_size; + options_.write_buffer_size = write_buffer_size; } else { exec_state_ = LDBCommandExecuteResult::Failed(ARG_WRITE_BUFFER_SIZE + " must be > 0."); @@ -661,15 +621,15 @@ Options LDBCommand::PrepareOptionsForOpenDB() { int file_size; if (ParseIntOption(option_map_, ARG_FILE_SIZE, file_size, exec_state_)) { if (file_size > 0) { - cf_opts->target_file_size_base = file_size; + options_.target_file_size_base = file_size; } else { exec_state_ = LDBCommandExecuteResult::Failed(ARG_FILE_SIZE + " must be > 0."); } } - if (db_opts->db_paths.size() == 0) { - db_opts->db_paths.emplace_back(db_path_, + if (options_.db_paths.size() == 0) { + options_.db_paths.emplace_back(db_path_, std::numeric_limits::max()); } @@ -677,17 +637,83 @@ Options LDBCommand::PrepareOptionsForOpenDB() { if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len, exec_state_)) { if 
(fix_prefix_len > 0) { - cf_opts->prefix_extractor.reset( + options_.prefix_extractor.reset( NewFixedPrefixTransform(static_cast(fix_prefix_len))); } else { exec_state_ = LDBCommandExecuteResult::Failed(ARG_FIX_PREFIX_LEN + " must be > 0."); } } - // TODO(ajkr): this return value doesn't reflect the CF options changed, so - // subcommands that rely on this won't see the effect of CF-related CLI args. - // Such subcommands need to be changed to properly support CFs. - return options_; +} + +// First, initializes the options state using the OPTIONS file when enabled. +// Second, overrides the options according to the CLI arguments and the +// specific subcommand being run. +void LDBCommand::PrepareOptions() { + if (!create_if_missing_ && try_load_options_) { + config_options_.env = options_.env; + Status s = LoadLatestOptions(config_options_, db_path_, &options_, + &column_families_); + if (!s.ok() && !s.IsNotFound()) { + // Option file exists but load option file error. + std::string msg = s.ToString(); + exec_state_ = LDBCommandExecuteResult::Failed(msg); + db_ = nullptr; + return; + } + if (options_.env->FileExists(options_.wal_dir).IsNotFound()) { + options_.wal_dir = db_path_; + fprintf( + stderr, + "wal_dir loaded from the option file doesn't exist. Ignore it.\n"); + } + + // If merge operator is not set, set a string append operator. + for (auto& cf_entry : column_families_) { + if (!cf_entry.options.merge_operator) { + cf_entry.options.merge_operator = + MergeOperators::CreateStringAppendOperator(':'); + } + } + } + + OverrideBaseOptions(); + if (exec_state_.IsFailed()) { + return; + } + + if (column_families_.empty()) { + // Reads the MANIFEST to figure out what column families exist. In this + // case, the option overrides from the CLI argument/specific subcommand + // apply to all column families. 
+ std::vector cf_list; + Status st = DB::ListColumnFamilies(options_, db_path_, &cf_list); + // It is possible the DB doesn't exist yet, for "create if not + // existing" case. The failure is ignored here. We rely on DB::Open() + // to give us the correct error message for problem with opening + // existing DB. + if (st.ok() && cf_list.size() > 1) { + // Ignore single column family DB. + for (auto cf_name : cf_list) { + column_families_.emplace_back(cf_name, options_); + } + } + } else { + // We got column families from the OPTIONS file. In this case, the option + // overrides from the CLI argument/specific subcommand only apply to the + // column family specified by `--column_family_name`. + auto column_families_iter = + std::find_if(column_families_.begin(), column_families_.end(), + [this](const ColumnFamilyDescriptor& cf_desc) { + return cf_desc.name == column_family_name_; + }); + if (column_families_iter == column_families_.end()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Non-existing column family " + column_family_name_); + return; + } + column_families_iter->options = options_; + } } bool LDBCommand::ParseKeyValue(const std::string& line, std::string* key, @@ -926,13 +952,12 @@ void DBLoaderCommand::Help(std::string& ret) { ret.append("\n"); } -Options DBLoaderCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); - opt.create_if_missing = create_if_missing_; +void DBLoaderCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; if (bulk_load_) { - opt.PrepareForBulkLoad(); + options_.PrepareForBulkLoad(); } - return opt; } void DBLoaderCommand::DoCommand() { @@ -1712,14 +1737,14 @@ void ReduceDBLevelsCommand::Help(std::string& ret) { ret.append("\n"); } -Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); - opt.num_levels = old_levels_; - 
opt.max_bytes_for_level_multiplier_additional.resize(opt.num_levels, 1); +void ReduceDBLevelsCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.num_levels = old_levels_; + options_.max_bytes_for_level_multiplier_additional.resize(options_.num_levels, + 1); // Disable size compaction - opt.max_bytes_for_level_base = 1ULL << 50; - opt.max_bytes_for_level_multiplier = 1; - return opt; + options_.max_bytes_for_level_base = 1ULL << 50; + options_.max_bytes_for_level_multiplier = 1; } Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, @@ -1764,9 +1789,9 @@ void ReduceDBLevelsCommand::DoCommand() { } Status st; - Options opt = PrepareOptionsForOpenDB(); + PrepareOptions(); int old_level_num = -1; - st = GetOldNumOfLevels(opt, &old_level_num); + st = GetOldNumOfLevels(options_, &old_level_num); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); return; @@ -1793,7 +1818,8 @@ void ReduceDBLevelsCommand::DoCommand() { CloseDB(); EnvOptions soptions; - st = VersionSet::ReduceNumberOfLevels(db_path_, &opt, soptions, new_levels_); + st = VersionSet::ReduceNumberOfLevels(db_path_, &options_, soptions, + new_levels_); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); return; @@ -1860,21 +1886,19 @@ void ChangeCompactionStyleCommand::Help(std::string& ret) { ret.append("\n"); } -Options ChangeCompactionStyleCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); +void ChangeCompactionStyleCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); if (old_compaction_style_ == kCompactionStyleLevel && new_compaction_style_ == kCompactionStyleUniversal) { // In order to convert from level compaction to universal compaction, we // need to compact all data into a single file and move it to level 0. 
- opt.disable_auto_compactions = true; - opt.target_file_size_base = INT_MAX; - opt.target_file_size_multiplier = 1; - opt.max_bytes_for_level_base = INT_MAX; - opt.max_bytes_for_level_multiplier = 1; + options_.disable_auto_compactions = true; + options_.target_file_size_base = INT_MAX; + options_.target_file_size_multiplier = 1; + options_.max_bytes_for_level_base = INT_MAX; + options_.max_bytes_for_level_multiplier = 1; } - - return opt; } void ChangeCompactionStyleCommand::DoCommand() { @@ -2316,10 +2340,9 @@ void BatchPutCommand::DoCommand() { } } -Options BatchPutCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); - opt.create_if_missing = create_if_missing_; - return opt; +void BatchPutCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; } // ---------------------------------------------------------------------------- @@ -2596,10 +2619,9 @@ void PutCommand::DoCommand() { } } -Options PutCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); - opt.create_if_missing = create_if_missing_; - return opt; +void PutCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; } // ---------------------------------------------------------------------------- @@ -2692,7 +2714,7 @@ CheckConsistencyCommand::CheckConsistencyCommand( const std::vector& /*params*/, const std::map& options, const std::vector& flags) - : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {} + : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {} void CheckConsistencyCommand::Help(std::string& ret) { ret.append(" "); @@ -2701,19 +2723,13 @@ void CheckConsistencyCommand::Help(std::string& ret) { } void CheckConsistencyCommand::DoCommand() { - Options opt = PrepareOptionsForOpenDB(); - opt.paranoid_checks = true; - if (!exec_state_.IsNotStarted()) { - return; - } - DB* db; - 
Status st = DB::OpenForReadOnly(opt, db_path_, &db, false); - delete db; - if (st.ok()) { + options_.paranoid_checks = true; + options_.num_levels = 64; + OpenDB(); + if (exec_state_.IsSucceed() || exec_state_.IsNotStarted()) { fprintf(stdout, "OK\n"); - } else { - exec_state_ = LDBCommandExecuteResult::Failed(st.ToString()); } + CloseDB(); } // ---------------------------------------------------------------------------- @@ -2767,10 +2783,14 @@ void RepairCommand::Help(std::string& ret) { ret.append("\n"); } +void RepairCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.info_log.reset(new StderrLogger(InfoLogLevel::WARN_LEVEL)); +} + void RepairCommand::DoCommand() { - Options options = PrepareOptionsForOpenDB(); - options.info_log.reset(new StderrLogger(InfoLogLevel::WARN_LEVEL)); - Status status = RepairDB(db_path_, options); + PrepareOptions(); + Status status = RepairDB(db_path_, options_); if (status.ok()) { printf("OK\n"); } else { @@ -3107,10 +3127,9 @@ void WriteExternalSstFilesCommand::DoCommand() { "external SST file written to " + output_sst_path_); } -Options WriteExternalSstFilesCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); - opt.create_if_missing = create_if_missing_; - return opt; +void WriteExternalSstFilesCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; } const std::string IngestExternalSstFilesCommand::ARG_MOVE_FILES = "move_files"; @@ -3222,10 +3241,139 @@ void IngestExternalSstFilesCommand::DoCommand() { } } -Options IngestExternalSstFilesCommand::PrepareOptionsForOpenDB() { - Options opt = LDBCommand::PrepareOptionsForOpenDB(); - opt.create_if_missing = create_if_missing_; - return opt; +void IngestExternalSstFilesCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; +} + +ListFileRangeDeletesCommand::ListFileRangeDeletesCommand( + 
const std::map& options, + const std::vector& flags) + : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_MAX_KEYS})) { + std::map::const_iterator itr = + options.find(ARG_MAX_KEYS); + if (itr != options.end()) { + try { +#if defined(CYGWIN) + max_keys_ = strtol(itr->second.c_str(), 0, 10); +#else + max_keys_ = std::stoi(itr->second); +#endif + } catch (const std::invalid_argument&) { + exec_state_ = LDBCommandExecuteResult::Failed(ARG_MAX_KEYS + + " has an invalid value"); + } catch (const std::out_of_range&) { + exec_state_ = LDBCommandExecuteResult::Failed( + ARG_MAX_KEYS + " has a value out-of-range"); + } + } +} + +void ListFileRangeDeletesCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(ListFileRangeDeletesCommand::Name()); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" : print tombstones in SST files.\n"); +} + +void ListFileRangeDeletesCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + + DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); + + std::string out_str; + + Status st = + db_impl->TablesRangeTombstoneSummary(GetCfHandle(), max_keys_, &out_str); + if (st.ok()) { + TEST_SYNC_POINT_CALLBACK( + "ListFileRangeDeletesCommand::DoCommand:BeforePrint", &out_str); + fprintf(stdout, "%s\n", out_str.c_str()); + } +} + +void UnsafeRemoveSstFileCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(UnsafeRemoveSstFileCommand::Name()); + ret.append(" "); + ret.append("\n"); + ret.append(" MUST NOT be used on a live DB."); + ret.append("\n"); +} + +UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) + : LDBCommand(options, flags, false /* is_read_only */, + BuildCmdLineOptions({})) { + if (params.size() != 1) { + exec_state_ = + LDBCommandExecuteResult::Failed("SST file number must be specified"); + } else { + char* endptr = nullptr; + sst_file_number_ = 
strtoull(params.at(0).c_str(), &endptr, 10 /* base */); + if (endptr == nullptr || *endptr != '\0') { + exec_state_ = LDBCommandExecuteResult::Failed( + "Failed to parse SST file number " + params.at(0)); + } + } +} + +void UnsafeRemoveSstFileCommand::DoCommand() { + // Instead of opening a `DB` and calling `DeleteFile()`, this implementation + // uses the underlying `VersionSet` API to read and modify the MANIFEST. This + // allows us to use the user's real options, while not having to worry about + // the DB persisting new SST files via flush/compaction or attempting to read/ + // compact files which may fail, particularly for the file we intend to remove + // (the user may want to remove an already deleted file from MANIFEST). + PrepareOptions(); + + if (options_.db_paths.empty()) { + // `VersionSet` expects options that have been through `SanitizeOptions()`, + // which would sanitize an empty `db_paths`. + options_.db_paths.emplace_back(db_path_, 0 /* target_size */); + } + + WriteController wc(options_.delayed_write_rate); + WriteBufferManager wb(options_.db_write_buffer_size); + ImmutableDBOptions immutable_db_options(options_); + std::shared_ptr tc( + NewLRUCache(1 << 20 /* capacity */, options_.table_cache_numshardbits)); + EnvOptions sopt; + VersionSet versions(db_path_, &immutable_db_options, sopt, tc.get(), &wb, &wc, + /*block_cache_tracer=*/nullptr); + Status s = versions.Recover(column_families_); + + ColumnFamilyData* cfd = nullptr; + int level = -1; + if (s.ok()) { + FileMetaData* metadata = nullptr; + s = versions.GetMetadataForFile(sst_file_number_, &level, &metadata, &cfd); + } + + if (s.ok()) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + edit.DeleteFile(level, sst_file_number_); + // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. 
+ InstrumentedMutex mutex; + mutex.Lock(); + s = versions.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex, nullptr /* db_directory */, + false /* new_descriptor_log */); + mutex.Unlock(); + } + + if (!s.ok()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "failed to unsafely remove SST file: " + s.ToString()); + } else { + exec_state_ = LDBCommandExecuteResult::Succeed("unsafely removed SST file"); + } } } // namespace rocksdb diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index 23bafe68254..b12e5396d48 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -136,7 +136,7 @@ class DBLoaderCommand : public LDBCommand { static void Help(std::string& ret); virtual void DoCommand() override; - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; private: bool disable_wal_; @@ -230,7 +230,7 @@ class ReduceDBLevelsCommand : public LDBCommand { const std::map& options, const std::vector& flags); - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; virtual void DoCommand() override; @@ -262,7 +262,7 @@ class ChangeCompactionStyleCommand : public LDBCommand { const std::map& options, const std::vector& flags); - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; virtual void DoCommand() override; @@ -346,7 +346,7 @@ class BatchPutCommand : public LDBCommand { static void Help(std::string& ret); - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; private: /** @@ -421,7 +421,7 @@ class PutCommand : public LDBCommand { static void Help(std::string& ret); - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; private: std::string key_; @@ -495,6 +495,8 @@ class RepairCommand : public LDBCommand { virtual bool NoDBOpen() override { return true; } + virtual void OverrideBaseOptions() override; + static 
void Help(std::string& ret); }; @@ -551,7 +553,7 @@ class WriteExternalSstFilesCommand : public LDBCommand { virtual bool NoDBOpen() override { return false; } - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; static void Help(std::string& ret); @@ -571,7 +573,7 @@ class IngestExternalSstFilesCommand : public LDBCommand { virtual bool NoDBOpen() override { return false; } - virtual Options PrepareOptionsForOpenDB() override; + virtual void OverrideBaseOptions() override; static void Help(std::string& ret); @@ -592,4 +594,39 @@ class IngestExternalSstFilesCommand : public LDBCommand { static const std::string ARG_WRITE_GLOBAL_SEQNO; }; -} // namespace rocksdb +// Command that prints out range delete tombstones in SST files. +class ListFileRangeDeletesCommand : public LDBCommand { + public: + static std::string Name() { return "list_file_range_deletes"; } + + ListFileRangeDeletesCommand(const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + int max_keys_ = 1000; +}; + +// Command that removes the SST file forcibly from the manifest. 
+class UnsafeRemoveSstFileCommand : public LDBCommand { + public: + static std::string Name() { return "unsafe_remove_sst_file"; } + + UnsafeRemoveSstFileCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + static void Help(std::string& ret); + + virtual void DoCommand() override; + + virtual bool NoDBOpen() override { return true; } + + private: + uint64_t sst_file_number_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index 995313a9a38..aea02105852 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" +#include "test_util/sync_point.h" #include "test_util/testharness.h" using std::string; @@ -77,8 +78,8 @@ TEST_F(LdbCmdTest, MemEnv) { char arg3[] = "dump_live_files"; char* argv[] = {arg1, arg2, arg3}; - rocksdb::LDBTool tool; - tool.Run(3, argv, opts); + ASSERT_EQ(0, + LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr)); } TEST_F(LdbCmdTest, OptionParsing) { @@ -118,6 +119,94 @@ TEST_F(LdbCmdTest, OptionParsing) { } } +TEST_F(LdbCmdTest, ListFileTombstone) { + std::unique_ptr env(NewMemEnv(Env::Default())); + Options opts; + opts.env = env.get(); + opts.create_if_missing = true; + + DB* db = nullptr; + std::string dbname = test::TmpDir(); + ASSERT_OK(DB::Open(opts, dbname, &db)); + + WriteOptions wopts; + ASSERT_OK(db->Put(wopts, "foo", "1")); + ASSERT_OK(db->Put(wopts, "bar", "2")); + + FlushOptions fopts; + fopts.wait = true; + ASSERT_OK(db->Flush(fopts)); + + ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "foo", "foo2")); + ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "bar", "foo2")); + ASSERT_OK(db->Flush(fopts)); + + delete db; + + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "list_file_range_deletes"; + char* argv[] = {arg1, arg2, arg3}; + + 
rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) { + std::string* out_str = reinterpret_cast(arg); + + // Count number of tombstones printed + int num_tb = 0; + const std::string kFingerprintStr = "start: "; + auto offset = out_str->find(kFingerprintStr); + while (offset != std::string::npos) { + num_tb++; + offset = + out_str->find(kFingerprintStr, offset + kFingerprintStr.size()); + } + EXPECT_EQ(2, num_tb); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(3, argv, opts, LDBOptions(), nullptr)); + + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + + // Test the case of limiting tombstones + { + char arg1[] = "./ldb"; + char arg2[1024]; + snprintf(arg2, sizeof(arg2), "--db=%s", dbname.c_str()); + char arg3[] = "list_file_range_deletes"; + char arg4[] = "--max_keys=1"; + char* argv[] = {arg1, arg2, arg3, arg4}; + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "ListFileRangeDeletesCommand::DoCommand:BeforePrint", [&](void* arg) { + std::string* out_str = reinterpret_cast(arg); + + // Count number of tombstones printed + int num_tb = 0; + const std::string kFingerprintStr = "start: "; + auto offset = out_str->find(kFingerprintStr); + while (offset != std::string::npos) { + num_tb++; + offset = + out_str->find(kFingerprintStr, offset + kFingerprintStr.size()); + } + EXPECT_EQ(1, num_tb); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ( + 0, LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr)); + + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index 2813f6c6edf..263677286a7 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -44,6 
+44,8 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, " : DB supports ttl and value is internally timestamp-suffixed\n"); ret.append(" --" + LDBCommand::ARG_TRY_LOAD_OPTIONS + " : Try to load option file from DB.\n"); + ret.append(" --" + LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS + + " : Set options.force_consistency_checks = false.\n"); ret.append(" --" + LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS + " : Ignore unknown options when loading option file.\n"); ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=\n"); @@ -71,6 +73,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, DBQuerierCommand::Help(ret); ApproxSizeCommand::Help(ret); CheckConsistencyCommand::Help(ret); + ListFileRangeDeletesCommand::Help(ret); ret.append("\n\n"); ret.append("Admin Commands:\n"); @@ -92,16 +95,17 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, CheckPointCommand::Help(ret); WriteExternalSstFilesCommand::Help(ret); IngestExternalSstFilesCommand::Help(ret); + UnsafeRemoveSstFileCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); } -void LDBCommandRunner::RunCommand( +int LDBCommandRunner::RunCommand( int argc, char** argv, Options options, const LDBOptions& ldb_options, const std::vector* column_families) { if (argc <= 2) { PrintHelp(ldb_options, argv[0]); - exit(1); + return 1; } LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs( @@ -109,11 +113,11 @@ void LDBCommandRunner::RunCommand( if (cmdObj == nullptr) { fprintf(stderr, "Unknown command\n"); PrintHelp(ldb_options, argv[0]); - exit(1); + return 1; } if (!cmdObj->ValidateCmdLineOptions()) { - exit(1); + return 1; } cmdObj->Run(); @@ -121,14 +125,15 @@ void LDBCommandRunner::RunCommand( fprintf(stderr, "%s\n", ret.ToString().c_str()); delete cmdObj; - exit(ret.IsFailed()); + return ret.IsFailed() ? 
1 : 0; } void LDBTool::Run(int argc, char** argv, Options options, const LDBOptions& ldb_options, const std::vector* column_families) { - LDBCommandRunner::RunCommand(argc, argv, options, ldb_options, - column_families); + int error_code = LDBCommandRunner::RunCommand(argc, argv, options, + ldb_options, column_families); + exit(error_code); } } // namespace rocksdb diff --git a/utilities/options/options_util.cc b/utilities/options/options_util.cc index 561e925ebbe..74b995ee0bc 100644 --- a/utilities/options/options_util.cc +++ b/utilities/options/options_util.cc @@ -17,8 +17,22 @@ Status LoadOptionsFromFile(const std::string& file_name, Env* env, std::vector* cf_descs, bool ignore_unknown_options, std::shared_ptr* cache) { + ConfigOptions config_options; + config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = true; + config_options.env = env; + + return LoadOptionsFromFile(config_options, file_name, db_options, cf_descs, + cache); +} + +Status LoadOptionsFromFile(const ConfigOptions& config_options, + const std::string& file_name, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache) { RocksDBOptionsParser parser; - Status s = parser.Parse(file_name, env, ignore_unknown_options); + Status s = parser.Parse(file_name, config_options.env, + config_options.ignore_unknown_options); if (!s.ok()) { return s; } @@ -41,8 +55,8 @@ Status LoadOptionsFromFile(const std::string& file_name, Env* env, return Status::OK(); } -Status GetLatestOptionsFileName(const std::string& dbpath, - Env* env, std::string* options_file_name) { +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name) { Status s; std::string latest_file_name; uint64_t latest_time_stamp = 0; @@ -73,13 +87,26 @@ Status LoadLatestOptions(const std::string& dbpath, Env* env, std::vector* cf_descs, bool ignore_unknown_options, std::shared_ptr* cache) { + ConfigOptions config_options; + 
config_options.ignore_unknown_options = ignore_unknown_options; + config_options.input_strings_escaped = true; + config_options.env = env; + + return LoadLatestOptions(config_options, dbpath, db_options, cf_descs, cache); +} + +Status LoadLatestOptions(const ConfigOptions& config_options, + const std::string& dbpath, DBOptions* db_options, + std::vector* cf_descs, + std::shared_ptr* cache) { std::string options_file_name; - Status s = GetLatestOptionsFileName(dbpath, env, &options_file_name); + Status s = + GetLatestOptionsFileName(dbpath, config_options.env, &options_file_name); if (!s.ok()) { return s; } - return LoadOptionsFromFile(dbpath + "/" + options_file_name, env, db_options, - cf_descs, ignore_unknown_options, cache); + return LoadOptionsFromFile(config_options, dbpath + "/" + options_file_name, + db_options, cf_descs, cache); } Status CheckOptionsCompatibility( From a2b20bddc303e5ac6fc75ebf657944307c4e2bc9 Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 1 Jul 2021 10:29:35 +0800 Subject: [PATCH 02/16] Add picked level for ingestion job info (#241) Signed-off-by: Connor1996 --- db/db_impl/db_impl.cc | 1 + include/rocksdb/listener.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index d022250df90..2682d4ee005 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4155,6 +4155,7 @@ void DBImpl::NotifyOnExternalFileIngested( info.internal_file_path = f.internal_file_path; info.global_seqno = f.assigned_seqno; info.table_properties = f.table_properties; + info.picked_level = f.picked_level; for (auto listener : immutable_db_options_.listeners) { listener->OnExternalFileIngested(this, info); } diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 7eb14e10e61..31af54e5f2d 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -278,6 +278,8 @@ struct ExternalFileIngestionInfo { SequenceNumber global_seqno; // Table properties of the table being flushed 
TableProperties table_properties; + // Level inside the DB we picked for the external file. + int picked_level; }; // EventListener class contains a set of callback functions that will From 8ff1ede595ebe15f8b13930d0a0d2604819a6ab1 Mon Sep 17 00:00:00 2001 From: Croxx Date: Mon, 5 Jul 2021 12:50:52 +0800 Subject: [PATCH 03/16] Allow applying `CompactionFilter` outside of compaction (facebook#8243) (#243) Cherry-picking facebook/rocksdb#8243 > From HISTORY.md release note: > > * Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`. > * Removed unused structure `CompactionFilterContext` > > Test Plan: added unit tests Signed-off-by: MrCroxx --- HISTORY.md | 2 + db/builder.cc | 38 ++++++-- db/c.cc | 1 - db/compaction/compaction.cc | 7 ++ db/compaction/compaction_iterator.cc | 9 +- db/compaction/compaction_iterator.h | 2 + db/db_compaction_filter_test.cc | 136 ++++++++++++++++++++++++++- include/rocksdb/compaction_filter.h | 101 +++++++++++--------- include/rocksdb/listener.h | 9 +- include/rocksdb/options.h | 28 ++++-- include/rocksdb/types.h | 7 ++ 11 files changed, 261 insertions(+), 79 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 3640c5f6288..5325ef3cb11 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,11 +3,13 @@ ### Public API Change * DeleteRange now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. * ldb now uses options.force_consistency_checks = true by default and "--disable_consistency_checks" is added to disable it. +* Removed unused structure `CompactionFilterContext`. 
### New Features * When user uses options.force_consistency_check in RocksDb, instead of crashing the process, we now pass the error back to the users without killing the process. * Added experimental ColumnFamilyOptions::sst_partitioner_factory to define determine the partitioning of sst files. This helps compaction to split the files on interesting boundaries (key prefixes) to make propagation of sst files less write amplifying (covering the whole key space). * Option `max_background_flushes` can be set dynamically using DB::SetDBOptions(). +* Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`. Picked from [facebook/rocksdb#pr8243](https://github.com/facebook/rocksdb/pull/8243). ### Bug Fixes * Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space. 
diff --git a/db/builder.cc b/db/builder.cc index eac1b5fe2e1..47efcbe1b78 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -106,6 +106,25 @@ Status BuildTable( TableProperties tp; if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = column_family_id; + context.reason = reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + TableBuilder* builder; std::unique_ptr file_writer; // Currently we only enable dictionary compression during compaction to the @@ -141,17 +160,21 @@ Status BuildTable( 0 /*target_file_size*/, file_creation_time); } - MergeHelper merge(env, internal_comparator.user_comparator(), - ioptions.merge_operator, nullptr, ioptions.info_log, - true /* internal key corruption is not ok */, - snapshots.empty() ? 0 : snapshots.back(), - snapshot_checker); + MergeHelper merge( + env, internal_comparator.user_comparator(), ioptions.merge_operator, + compaction_filter.get(), ioptions.info_log, + true /* internal key corruption is not ok */, + snapshots.empty() ? 
0 : snapshots.back(), snapshot_checker); CompactionIterator c_iter( iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.statistics), - true /* internal key corruption is not ok */, range_del_agg.get()); + true /* internal key corruption is not ok */, range_del_agg.get(), + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr); + c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); @@ -192,7 +215,8 @@ Status BuildTable( meta->fd.file_size = file_size; meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); - tp = builder->GetTableProperties(); // refresh now that builder is finished + tp = builder + ->GetTableProperties(); // refresh now that builder is finished if (table_properties) { *table_properties = tp; } diff --git a/db/c.cc b/db/c.cc index 4fe556c3063..144d8f90ded 100644 --- a/db/c.cc +++ b/db/c.cc @@ -54,7 +54,6 @@ using rocksdb::ColumnFamilyHandle; using rocksdb::ColumnFamilyOptions; using rocksdb::CompactionFilter; using rocksdb::CompactionFilterFactory; -using rocksdb::CompactionFilterContext; using rocksdb::CompactionOptionsFIFO; using rocksdb::Comparator; using rocksdb::CompressionType; diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 4bdafbdc43d..daae5cc1059 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -531,6 +531,12 @@ std::unique_ptr Compaction::CreateCompactionFilter( return nullptr; } + if (!cfd_->ioptions() + ->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kCompaction)) { + return nullptr; + } + CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; @@ -550,6 +556,7 @@ 
std::unique_ptr Compaction::CreateCompactionFilter( } } context.column_family_id = cfd_->GetID(); + context.reason = TableFileCreationReason::kCompaction; return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 1d3406b4082..361fd891a92 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "db/compaction/compaction_iterator.h" + #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" @@ -78,8 +79,8 @@ CompactionIterator::CompactionIterator( current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), current_key_committed_(false), - snap_list_callback_(snap_list_callback) { - assert(compaction_filter_ == nullptr || compaction_ != nullptr); + snap_list_callback_(snap_list_callback), + level_(compaction_ == nullptr ? 0 : compaction_->level()) { assert(snapshots_ != nullptr); bottommost_level_ = compaction_ == nullptr ? false : compaction_->bottommost_level(); @@ -121,7 +122,7 @@ void CompactionIterator::Next() { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); bool valid_key __attribute__((__unused__)); - valid_key = ParseInternalKey(key_, &ikey_); + valid_key = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. assert(valid_key); @@ -176,7 +177,7 @@ void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, { StopWatchNano timer(env_, report_detailed_time_); filter = compaction_filter_->FilterV3( - compaction_->level(), filter_key, seqno, value_type, value_, + level_, filter_key, seqno, value_type, value_, &compaction_filter_value_, compaction_filter_skip_until_.rep()); iter_stats_.total_filter_time += env_ != nullptr && report_detailed_time_ ? 
timer.ElapsedNanos() : 0; diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 2bf847e2e21..f447746cfd8 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -275,6 +275,8 @@ class CompactionIterator { // number of distinct keys processed size_t num_keys_ = 0; + const int level_; + bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 37e80048e6d..074e2b32991 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -82,6 +82,11 @@ class DeleteFilter : public CompactionFilter { return true; } + bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const override { + return true; + } + const char* Name() const override { return "DeleteFilter"; } }; @@ -206,18 +211,36 @@ class KeepFilterFactory : public CompactionFilterFactory { bool compaction_filter_created_; }; +// This filter factory is configured with a `TableFileCreationReason`. Only +// table files created for that reason will undergo filtering. This +// configurability makes it useful to tests for filtering non-compaction table +// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery". class DeleteFilterFactory : public CompactionFilterFactory { public: + explicit DeleteFilterFactory(TableFileCreationReason reason) + : reason_(reason) {} + std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { - if (context.is_manual_compaction) { - return std::unique_ptr(new DeleteFilter()); - } else { + EXPECT_EQ(reason_, context.reason); + if (context.reason == TableFileCreationReason::kCompaction && + !context.is_manual_compaction) { + // Table files created by automatic compaction do not undergo filtering. 
+ // Presumably some tests rely on this. return std::unique_ptr(nullptr); } + return std::unique_ptr(new DeleteFilter()); + } + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + return reason_ == reason; } const char* Name() const override { return "DeleteFilterFactory"; } + + private: + const TableFileCreationReason reason_; }; // Delete Filter Factory which ignores snapshots @@ -377,7 +400,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { // create a new database with the compaction // filter in such a way that it deletes all keys - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + TableFileCreationReason::kCompaction); options.create_if_missing = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -447,7 +471,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { // entries in VersionEdit, but none of the 'AddFile's. TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + TableFileCreationReason::kCompaction); options.disable_auto_compactions = true; options.create_if_missing = true; DestroyAndReopen(options); @@ -475,6 +500,64 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { } #endif // ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterFlush) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by flush. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kFlush); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in flush. 
+ ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + +TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by recovery. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kRecovery); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. 
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + TEST_P(DBTestCompactionFilterWithCompactParam, CompactionFilterWithValueChange) { Options options = CurrentOptions(); @@ -864,6 +947,49 @@ TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) { delete options.compaction_filter; } +class TestNotSupportedFilterFactory : public CompactionFilterFactory { + public: + explicit TestNotSupportedFilterFactory(TableFileCreationReason reason) + : reason_(reason) {} + + bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const override { + return reason_ == reason; + } + + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /* context */) override { + return std::unique_ptr(new TestNotSupportedFilter()); + } + + const char* Name() const override { return "TestNotSupportedFilterFactory"; } + + private: + const TableFileCreationReason reason_; +}; + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kFlush); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(Flush().IsNotSupported()); +} + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) { + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kRecovery); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index c6851cf6998..cd879b3f40f 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -15,25 +15,15 @@ #include "rocksdb/slice.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" namespace rocksdb { class Slice; class 
SliceTransform; -// Context information of a compaction run -struct CompactionFilterContext { - // Does this compaction run include all data files - bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process - bool is_manual_compaction; - // Whether output files are in bottommost level or not. - bool is_bottommost_level; -}; - -// CompactionFilter allows an application to modify/delete a key-value at -// the time of compaction. +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation. class CompactionFilter { public: @@ -50,12 +40,15 @@ class CompactionFilter { kRemoveAndSkipUntil, }; - // Context information of a compaction run + enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; + + // Context information for a table file creation. struct Context { - // Does this compaction run include all data files + // Whether this table file is created as part of a compaction including all + // table files. bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process + // Whether this table file is created as part of a compaction requested by + // the client. bool is_manual_compaction; // Whether output files are in bottommost level or not. bool is_bottommost_level; @@ -72,23 +65,25 @@ class CompactionFilter { std::vector> table_properties; // Which column family this compaction is for. + // The column family that will contain the created table file. uint32_t column_family_id; + // Reason this table file is being created. + TableFileCreationReason reason; }; virtual ~CompactionFilter() {} - // The compaction process invokes this - // method for kv that is being compacted. 
A return value - // of false indicates that the kv should be preserved in the - // output of this compaction run and a return value of true - // indicates that this key-value should be removed from the - // output of the compaction. The application can inspect - // the existing value of the key and make decision based on it. + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed from the new table file. The + // application can inspect the existing value of the key and make decision + // based on it. // - // Key-Values that are results of merge operation during compaction are not - // passed into this function. Currently, when you have a mix of Put()s and - // Merge()s on a same key, we only guarantee to process the merge operands - // through the compaction filters. Put()s might be processed, or might not. + // Key-Values that are results of merge operation during table file creation + // are not passed into this function. Currently, when you have a mix of Put()s + // and Merge()s on a same key, we only guarantee to process the merge operands + // through the `CompactionFilter`s. Put()s might be processed, or might not. // // When the value is to be preserved, the application has the option // to modify the existing_value and pass it back through new_value. @@ -96,9 +91,10 @@ class CompactionFilter { // // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a // DB* object) will not guarantee to preserve the state of the DB with - // CompactionFilter. Data seen from a snapshot might disppear after a - // compaction finishes. If you use snapshots, think twice about whether you - // want to use compaction filter and whether you are using it in a safe way. + // CompactionFilter. 
Data seen from a snapshot might disappear after a + // table file created with a `CompactionFilter` is installed. If you use + // snapshots, think twice about whether you want to use `CompactionFilter` and + // whether you are using it in a safe way. // // If multithreaded compaction is being used *and* a single CompactionFilter // instance was supplied via Options::compaction_filter, this method may be @@ -106,7 +102,7 @@ class CompactionFilter { // that the call is thread-safe. // // If the CompactionFilter was created by a factory, then it will only ever - // be used by a single thread that is doing the compaction run, and this + // be used by a single thread that is doing the table file creation, and this // call does not need to be thread-safe. However, multiple filters may be // in existence and operating concurrently. virtual bool Filter(int /*level*/, const Slice& /*key*/, @@ -116,9 +112,9 @@ class CompactionFilter { return false; } - // The compaction process invokes this method on every merge operand. If this - // method returns true, the merge operand will be ignored and not written out - // in the compaction output + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. // // Note: If you are using a TransactionDB, it is not recommended to implement // FilterMergeOperand(). If a Merge operation is filtered out, TransactionDB @@ -179,13 +175,14 @@ class CompactionFilter { // snapshot - beware if you're using TransactionDB or // DB::GetSnapshot(). // - If value for a key was overwritten or merged into (multiple Put()s - // or Merge()s), and compaction filter skips this key with + // or Merge()s), and `CompactionFilter` skips this key with // kRemoveAndSkipUntil, it's possible that it will remove only // the new value, exposing the old value that was supposed to be // overwritten. 
// - Doesn't work with PlainTableFactory in prefix mode. - // - If you use kRemoveAndSkipUntil, consider also reducing - // compaction_readahead_size option. + // - If you use kRemoveAndSkipUntil for table files created by + // compaction, consider also reducing compaction_readahead_size + // option. // // Note: If you are using a TransactionDB, it is not recommended to filter // out or modify merge operands (ValueType::kMergeOperand). @@ -204,27 +201,39 @@ class CompactionFilter { } // This function is deprecated. Snapshots will always be ignored for - // compaction filters, because we realized that not ignoring snapshots doesn't - // provide the gurantee we initially thought it would provide. Repeatable - // reads will not be guaranteed anyway. If you override the function and - // returns false, we will fail the compaction. + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and returns false, we will fail the table file creation. virtual bool IgnoreSnapshots() const { return true; } - // Returns a name that identifies this compaction filter. + // Returns a name that identifies this `CompactionFilter`. // The name will be printed to LOG file on start up for diagnosis. virtual const char* Name() const = 0; }; -// Each compaction will create a new CompactionFilter allowing the -// application to know about different compactions +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. 
class CompactionFilterFactory { public: virtual ~CompactionFilterFactory() {} + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. + virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } + virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) = 0; - // Returns a name that identifies this compaction filter factory. + // Returns a name that identifies this `CompactionFilter` factory. virtual const char* Name() const = 0; }; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 31af54e5f2d..4fe284dbd72 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -11,9 +11,11 @@ #include #include #include + #include "rocksdb/compaction_job_stats.h" #include "rocksdb/status.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" namespace rocksdb { @@ -26,13 +28,6 @@ class Status; struct CompactionJobStats; enum CompressionType : unsigned char; -enum class TableFileCreationReason { - kFlush, - kCompaction, - kRecovery, - kMisc, -}; - struct TableFileCreationBriefInfo { // the name of the database where the file was created std::string db_name; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index bd93bc15b03..730ae9622d6 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -148,9 +148,10 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Allows an application to modify/delete a key-value during background // compaction. // - // If the client requires a new compaction filter to be used for different - // compaction runs, it can specify compaction_filter_factory instead of this - // option. 
The client should specify only one of the two. + // If the client requires a new `CompactionFilter` to be used for different + // compaction runs and/or requires a `CompactionFilter` for table file + // creations outside of compaction, it can specify compaction_filter_factory + // instead of this option. The client should specify only one of the two. // compaction_filter takes precedence over compaction_filter_factory if // client specifies both. // @@ -161,12 +162,21 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Default: nullptr const CompactionFilter* compaction_filter = nullptr; - // This is a factory that provides compaction filter objects which allow - // an application to modify/delete a key-value during background compaction. - // - // A new filter will be created on each compaction run. If multithreaded - // compaction is being used, each created CompactionFilter will only be used - // from a single thread and so does not need to be thread-safe. + // This is a factory that provides `CompactionFilter` objects which allow + // an application to modify/delete a key-value during table file creation. + // + // Unlike the `compaction_filter` option, which is used when compaction + // creates a table file, this factory allows using a `CompactionFilter` when a + // table file is created for various reasons. The factory can decide what + // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by + // default the decision is to use a `CompactionFilter` for + // `TableFileCreationReason::kCompaction` only. + // + // Each thread of work involving creating table files will create a new + // `CompactionFilter` when it will be used according to the above + // `TableFileCreationReason`-based decision. This allows the application to + // know about the different ongoing threads of work and makes it unnecessary + // for `CompactionFilter` to provide thread-safety. 
// // Default: nullptr std::shared_ptr compaction_filter_factory = nullptr; diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index 2cd4039bd7c..2745bea664e 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -17,6 +17,13 @@ typedef uint64_t SequenceNumber; const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + // User-oriented representation of internal key types. enum EntryType { kEntryPut, From 176e4b1cfb8975a7c5ff38042ed6e168ed385ea6 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 6 Jul 2021 11:10:32 +0800 Subject: [PATCH 04/16] Fix sst_dump not able to open ingested file (#6673) (#244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: When investigating https://github.com/facebook/rocksdb/issues/6666, we encounter an error for sst_dump to dump an ingested SST file with global seqno. ``` Corruption: An external sst file with version 2 have global seqno property with value ��/, while largest seqno in the file is 0) ``` Same as https://github.com/facebook/rocksdb/pull/5097, it is due to SstFileReader don't know the largest seqno of a file, it will fail this check when it open a file with global seqno. 
https://github.com/facebook/rocksdb/blob/ca89ac2ba997dfa0e135bd75d4ccf6f5774a7eff/table/block_based_table_reader.cc#L730 Pull Request resolved: https://github.com/facebook/rocksdb/pull/6673 Test Plan: run it manually Reviewed By: cheng-chang Differential Revision: D20937546 Pulled By: ajkr fbshipit-source-id: c3fd04d60916a738533ee1885f3ea844669a9479 Signed-off-by: Connor1996 --- tools/sst_dump_tool.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 44a733b57c6..d75308f7d88 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -126,20 +126,22 @@ Status SstFileDumper::NewTableReader( const ImmutableCFOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, std::unique_ptr* /*table_reader*/) { + auto t_opt = TableReaderOptions(ioptions_, moptions_.prefix_extractor.get(), + soptions_, internal_comparator_); + // Allow open file with global sequence number for backward compatibility. 
+ t_opt.largest_seqno = kMaxSequenceNumber; + // We need to turn off pre-fetching of index and filter nodes for // BlockBasedTable if (BlockBasedTableFactory::kName == options_.table_factory->Name()) { - return options_.table_factory->NewTableReader( - TableReaderOptions(ioptions_, moptions_.prefix_extractor.get(), - soptions_, internal_comparator_), - std::move(file_), file_size, &table_reader_, /*enable_prefetch=*/false); + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_, + /*enable_prefetch=*/false); } // For all other factory implementation - return options_.table_factory->NewTableReader( - TableReaderOptions(ioptions_, moptions_.prefix_extractor.get(), soptions_, - internal_comparator_), - std::move(file_), file_size, &table_reader_); + return options_.table_factory->NewTableReader(t_opt, std::move(file_), + file_size, &table_reader_); } Status SstFileDumper::VerifyChecksum() { From b22a85a74dd7b1a9c00410fdaf0732d70fefdc46 Mon Sep 17 00:00:00 2001 From: Xintao Date: Mon, 19 Jul 2021 20:21:38 +0800 Subject: [PATCH 05/16] Reimplement print single sst meta when dump manifest (#247) Original PR: https://github.com/facebook/rocksdb/pull/6716 Signed-off-by: Xintao --- db/version_edit.cc | 8 +---- db/version_edit.h | 19 +++++++++++ db/version_set.cc | 77 ++++++++++++++++++++++++++---------------- db/version_set.h | 7 ++-- tools/ldb_cmd.cc | 23 +++++++++---- tools/ldb_cmd_impl.h | 2 ++ tools/sst_dump_tool.cc | 12 +++++++ 7 files changed, 101 insertions(+), 47 deletions(-) diff --git a/db/version_edit.cc b/db/version_edit.cc index ecadf6e3980..a1080b24be5 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -574,13 +574,7 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n AddFile: "); AppendNumberTo(&r, new_files_[i].first); r.append(" "); - AppendNumberTo(&r, f.fd.GetNumber()); - r.append(" "); - AppendNumberTo(&r, f.fd.GetFileSize()); - r.append(" "); - 
r.append(f.smallest.DebugString(hex_key)); - r.append(" .. "); - r.append(f.largest.DebugString(hex_key)); + r.append(f.DebugString(hex_key)); } r.append("\n ColumnFamily: "); AppendNumberTo(&r, column_family_); diff --git a/db/version_edit.h b/db/version_edit.h index 4a93db34e15..0d55a4827ba 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -17,6 +17,7 @@ #include "memory/arena.h" #include "rocksdb/cache.h" #include "util/autovector.h" +#include "util/string_util.h" namespace rocksdb { @@ -154,6 +155,24 @@ struct FileMetaData { fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); } + + std::string DebugString(bool hex) const { + std::string r; + AppendNumberTo(&r, this->fd.GetNumber()); + r.push_back(':'); + AppendNumberTo(&r, this->fd.GetFileSize()); + r.append("["); + AppendNumberTo(&r, this->fd.smallest_seqno); + r.append(" .. "); + AppendNumberTo(&r, this->fd.largest_seqno); + r.append("]"); + r.append("["); + r.append(this->smallest.DebugString(hex)); + r.append(" .. "); + r.append(this->largest.DebugString(hex)); + r.append("]"); + return r; + } }; // A compressed copy of file meta data that just contain minimum data needed diff --git a/db/version_set.cc b/db/version_set.cc index 911dbae34ba..68e347f853f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3387,19 +3387,7 @@ std::string Version::DebugString(bool hex, bool print_stats) const { const std::vector& files = storage_info_.files_[level]; for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); - AppendNumberTo(&r, files[i]->fd.GetNumber()); - r.push_back(':'); - AppendNumberTo(&r, files[i]->fd.GetFileSize()); - r.append("["); - AppendNumberTo(&r, files[i]->fd.smallest_seqno); - r.append(" .. "); - AppendNumberTo(&r, files[i]->fd.largest_seqno); - r.append("]"); - r.append("["); - r.append(files[i]->smallest.DebugString(hex)); - r.append(" .. 
"); - r.append(files[i]->largest.DebugString(hex)); - r.append("]"); + r.append(files[i]->DebugString(hex)); if (print_stats) { r.append("("); r.append(ToString( @@ -4684,7 +4672,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } Status VersionSet::DumpManifest(Options& options, std::string& dscname, - bool verbose, bool hex, bool json) { + bool verbose, bool hex, bool json, + uint64_t sst_file_number) { // Open the specified manifest file. std::unique_ptr file_reader; Status s; @@ -4844,10 +4833,12 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } if (s.ok()) { + bool found = false; for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { continue; } + if (found) break; auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); auto builder = builders_iter->second->version_builder(); @@ -4858,33 +4849,59 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, builder->SaveTo(v->storage_info()); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); - printf("--------------- Column family \"%s\" (ID %" PRIu32 - ") --------------\n", - cfd->GetName().c_str(), cfd->GetID()); - printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); - auto comparator = comparators.find(cfd->GetID()); - if (comparator != comparators.end()) { - printf("comparator: %s\n", comparator->second.c_str()); + if (sst_file_number != 0) { + for (int level = 0; level < v->storage_info_.num_levels_; level++) { + const std::vector& files = + v->storage_info_.files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (files[i]->fd.GetNumber() == sst_file_number) { + found = true; + printf("--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + printf("%s at level %d\n", files[i]->DebugString(hex).c_str(), + level); + break; + } + } + if (found) break; + } } else { - printf("comparator: \n"); + printf("--------------- 
Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto comparator = comparators.find(cfd->GetID()); + if (comparator != comparators.end()) { + printf("comparator: %s\n", comparator->second.c_str()); + } else { + printf("comparator: \n"); + } + printf("%s \n", v->DebugString(hex).c_str()); } - printf("%s \n", v->DebugString(hex).c_str()); delete v; } + if (sst_file_number != 0 && !found) { + s = Status::NotFound("sst " + ToString(sst_file_number) + + " is not in the live files set of the manifest"); + } + next_file_number_.store(next_file + 1); last_allocated_sequence_ = last_sequence; last_published_sequence_ = last_sequence; last_sequence_ = last_sequence; prev_log_number_ = previous_log_number; - printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 - " prev_log_number %" PRIu64 " max_column_family %" PRIu32 - " min_log_number_to_keep " - "%" PRIu64 "\n", - next_file_number_.load(), last_sequence, previous_log_number, - column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + if (sst_file_number == 0) { + printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep " + "%" PRIu64 "\n", + next_file_number_.load(), last_sequence, previous_log_number, + column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); + } } return s; diff --git a/db/version_set.h b/db/version_set.h index 728afdb3bb1..3e5e361926b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -870,9 +870,10 @@ class VersionSet { const EnvOptions& env_options, int new_levels); - // printf contents (for debugging) - Status DumpManifest(Options& options, std::string& manifestFileName, - bool verbose, bool hex = false, bool json = false); + // If sst_file_number is > 0, only prints manifest info for specified SST file number +Status DumpManifest(Options& options, 
std::string& dscname, + bool verbose, bool hex, bool json, + uint64_t sst_file_number); #endif // ROCKSDB_LITE diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 91d916ba279..1c91b8382d5 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -1003,7 +1003,7 @@ void DBLoaderCommand::DoCommand() { namespace { void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, - bool json) { + bool json, uint64_t sst_file_number = 0) { EnvOptions sopt; std::string dbname("dummy"); std::shared_ptr tc(NewLRUCache(options.max_open_files - 10, @@ -1018,7 +1018,7 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, ImmutableDBOptions immutable_db_options(options); VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, /*block_cache_tracer=*/nullptr); - Status s = versions.DumpManifest(options, file, verbose, hex, json); + Status s = versions.DumpManifest(options, file, verbose, hex, json, sst_file_number); if (!s.ok()) { printf("Error in processing file %s %s\n", file.c_str(), s.ToString().c_str()); @@ -1030,6 +1030,7 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose"; const std::string ManifestDumpCommand::ARG_JSON = "json"; const std::string ManifestDumpCommand::ARG_PATH = "path"; +const std::string ManifestDumpCommand::ARG_NUMBER = "sst_file_number"; void ManifestDumpCommand::Help(std::string& ret) { ret.append(" "); @@ -1037,6 +1038,7 @@ void ManifestDumpCommand::Help(std::string& ret) { ret.append(" [--" + ARG_VERBOSE + "]"); ret.append(" [--" + ARG_JSON + "]"); ret.append(" [--" + ARG_PATH + "=]"); + ret.append(" [--" + ARG_NUMBER + "=]"); ret.append("\n"); } @@ -1044,12 +1046,13 @@ ManifestDumpCommand::ManifestDumpCommand( const std::vector& /*params*/, const std::map& options, const std::vector& flags) - : LDBCommand( - options, flags, false, - BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, 
ARG_JSON})), + : LDBCommand(options, flags, false, + BuildCmdLineOptions( + {ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON, ARG_NUMBER})), verbose_(false), json_(false), - path_("") { + path_(""), + sst_file_number_(0) { verbose_ = IsFlagPresent(flags, ARG_VERBOSE); json_ = IsFlagPresent(flags, ARG_JSON); @@ -1061,6 +1064,11 @@ ManifestDumpCommand::ManifestDumpCommand( exec_state_ = LDBCommandExecuteResult::Failed("--path: missing pathname"); } } + + itr = options.find(ARG_NUMBER); + if (itr != options.end()) { + sst_file_number_ = ParseUint64(itr->second); + } } void ManifestDumpCommand::DoCommand() { @@ -1105,7 +1113,8 @@ void ManifestDumpCommand::DoCommand() { printf("Processing Manifest file %s\n", manifestfile.c_str()); } - DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_); + DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_, + sst_file_number_); if (verbose_) { printf("Processing Manifest file %s done\n", manifestfile.c_str()); diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index b12e5396d48..fe9fc47a277 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -165,10 +165,12 @@ class ManifestDumpCommand : public LDBCommand { bool verbose_; bool json_; std::string path_; + uint64_t sst_file_number_; static const std::string ARG_VERBOSE; static const std::string ARG_JSON; static const std::string ARG_PATH; + static const std::string ARG_NUMBER; }; class ListColumnFamiliesCommand : public LDBCommand { diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index d75308f7d88..aa22170c38a 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -312,6 +312,7 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, } else { iter->SeekToFirst(); } + std::string prev_key = ""; for (; iter->Valid(); iter->Next()) { Slice key = iter->key(); Slice value = iter->value(); @@ -342,6 +343,17 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, 
ikey.DebugString(output_hex_).c_str(), value.ToString(output_hex_).c_str()); } + + if (prev_key.size() != 0 && + internal_comparator_.Compare(key, Slice(prev_key)) <= 0) { + InternalKey current, last; + current.DecodeFrom(key); + last.DecodeFrom(Slice(prev_key)); + return Status::Corruption("current key " + current.DebugString(true) + + " is not greater than last key " + + last.DebugString(true)); + } + prev_key = key.ToString(false); } read_num_ += i; From 573e74218a2d54190b41d5c975dca750d3dd9621 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 28 May 2020 10:00:19 -0700 Subject: [PATCH 06/16] Make it possible to look up files by number in VersionStorageInfo (#6862) Summary: Does what it says on the can: the patch adds a hash map to `VersionStorageInfo` that maps file numbers to file locations, i.e. (level, position in level) pairs. This will enable stricter consistency checks in `VersionBuilder`. The patch also fixes all the unit tests that used duplicate file numbers in a version (which would trigger an assertion with the new code). 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6862 Test Plan: `make check` `make whitebox_crash_test` Reviewed By: riversand963 Differential Revision: D21670446 Pulled By: ltamasi fbshipit-source-id: 2eac249945cf33d8fb8597b26bfff5221e1a861a Signed-off-by: tabokie --- db/compaction/compaction_picker_test.cc | 12 +++---- db/version_set.cc | 20 ++++++++++- db/version_set.h | 46 +++++++++++++++++++++++++ db/version_set_test.cc | 32 ++++++++++++----- 4 files changed, 95 insertions(+), 15 deletions(-) diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index c04418bb968..89feae141a4 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -369,8 +369,8 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { mutable_cf_options_.max_bytes_for_level_multiplier = 10; NewVersionStorage(num_levels, kCompactionStyleLevel); Add(0, 1U, "150", "200"); - Add(num_levels - 1, 3U, "200", "250", 300U); - Add(num_levels - 1, 4U, "300", "350", 3000U); + Add(num_levels - 1, 2U, "200", "250", 300U); + Add(num_levels - 1, 3U, "300", "350", 3000U); Add(num_levels - 1, 4U, "400", "450", 3U); Add(num_levels - 2, 5U, "150", "180", 300U); Add(num_levels - 2, 6U, "181", "350", 500U); @@ -575,7 +575,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { Add(2, 8U, "201", "300", 60000000U); // Overlaps with file 28, 29, total size 521M - Add(3, 26U, "100", "110", 261000000U); + Add(3, 25U, "100", "110", 261000000U); Add(3, 26U, "150", "170", 261000000U); Add(3, 27U, "171", "179", 260000000U); Add(3, 28U, "191", "220", 260000000U); @@ -1091,7 +1091,7 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) { // Size ratio L4/L3 is 9.9 // After merge from L3, L4 size is 1000900 Add(4, 11U, "400", "500", 999900); - Add(5, 11U, "400", "500", 8007200); + Add(5, 12U, "400", "500", 8007200); UpdateVersionStorageInfo(); @@ -1404,8 +1404,8 @@ TEST_F(CompactionPickerTest, 
IsTrivialMoveOn) { Add(3, 5U, "120", "130", 7000U); Add(3, 6U, "170", "180", 7000U); - Add(3, 5U, "220", "230", 7000U); - Add(3, 5U, "270", "280", 7000U); + Add(3, 7U, "220", "230", 7000U); + Add(3, 8U, "270", "280", 7000U); UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( diff --git a/db/version_set.cc b/db/version_set.cc index 68e347f853f..c1df704625b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2569,6 +2569,12 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { #endif f->refs++; level_files->push_back(f); + + const uint64_t file_number = f->fd.GetNumber(); + + assert(file_locations_.find(file_number) == file_locations_.end()); + file_locations_.emplace(file_number, + FileLocation(level, level_files->size() - 1)); } // Version::PrepareApply() need to be called before calling the function, or @@ -4655,7 +4661,19 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } if (first_nonempty_level > 0) { - new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level); + auto& new_last_level = new_files_list[new_levels - 1]; + + new_last_level = vstorage->LevelFiles(first_nonempty_level); + + for (size_t i = 0; i < new_last_level.size(); ++i) { + const FileMetaData* const meta = new_last_level[i]; + assert(meta); + + const uint64_t file_number = meta->fd.GetNumber(); + + vstorage->file_locations_[file_number] = + VersionStorageInfo::FileLocation(new_levels - 1, i); + } } delete[] vstorage -> files_; diff --git a/db/version_set.h b/db/version_set.h index 3e5e361926b..88e2de6df7f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -268,6 +268,47 @@ class VersionStorageInfo { return files_[level]; } + class FileLocation { + public: + FileLocation() = default; + FileLocation(int level, size_t position) + : level_(level), position_(position) {} + + int GetLevel() const { return level_; } + size_t GetPosition() const { return position_; } + + bool 
IsValid() const { return level_ >= 0; } + + bool operator==(const FileLocation& rhs) const { + return level_ == rhs.level_ && position_ == rhs.position_; + } + + bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); } + + static FileLocation Invalid() { return FileLocation(); } + + private: + int level_ = -1; + size_t position_ = 0; + }; + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + FileLocation GetFileLocation(uint64_t file_number) const { + const auto it = file_locations_.find(file_number); + + if (it == file_locations_.end()) { + return FileLocation::Invalid(); + } + + assert(it->second.GetLevel() < num_levels_); + assert(it->second.GetPosition() < files_[it->second.GetLevel()].size()); + assert(files_[it->second.GetLevel()][it->second.GetPosition()]); + assert(files_[it->second.GetLevel()][it->second.GetPosition()] + ->fd.GetNumber() == file_number); + + return it->second; + } + const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; @@ -438,6 +479,11 @@ class VersionStorageInfo { // in increasing order of keys std::vector* files_; + // Map of all table files in version. Maps file number to (level, position on + // level). + using FileLocations = std::unordered_map; + FileLocations file_locations_; + // Level that L0 data should be compacted to. All levels < base_level_ should // be empty. -1 if it is not level-compaction so it's not applicable. 
int base_level_; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index a1278bfc7ad..446ae94c720 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -360,19 +360,19 @@ TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { Add(2, 3U, "6", "8", 1U); // Partial overlap with last level Add(3, 4U, "1", "9", 1U); // Contains range of last level Add(4, 5U, "4", "5", 1U); // Inside range of last level - Add(4, 5U, "6", "7", 1U); // Inside range of last level - Add(5, 6U, "4", "7", 10U); + Add(4, 6U, "6", "7", 1U); // Inside range of last level + Add(5, 7U, "4", "7", 10U); ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize()); } TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered - Add(0, 1U, "5", "6", 1U); // Ignored because of [5,6] in l1 - Add(1, 1U, "1", "2", 1U); // Ignored because of [2,3] in l2 - Add(1, 2U, "3", "4", 1U); // Ignored because of [2,3] in l2 - Add(1, 3U, "5", "6", 1U); - Add(2, 4U, "2", "3", 1U); - Add(3, 5U, "7", "8", 1U); + Add(0, 2U, "5", "6", 1U); // Ignored because of [5,6] in l1 + Add(1, 3U, "1", "2", 1U); // Ignored because of [2,3] in l2 + Add(1, 4U, "3", "4", 1U); // Ignored because of [2,3] in l2 + Add(1, 5U, "5", "6", 1U); + Add(2, 6U, "2", "3", 1U); + Add(3, 7U, "7", "8", 1U); ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); } @@ -409,6 +409,22 @@ TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); } +TEST_F(VersionStorageInfoTest, FileLocation) { + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + + Add(2, 7U, "1", "2", 8000U); + + ASSERT_EQ(vstorage_.GetFileLocation(11U), + VersionStorageInfo::FileLocation(0, 0)); + ASSERT_EQ(vstorage_.GetFileLocation(12U), + VersionStorageInfo::FileLocation(0, 1)); + + ASSERT_EQ(vstorage_.GetFileLocation(7U), + VersionStorageInfo::FileLocation(2, 0)); + + ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid()); +} class FindLevelFileTest : public testing::Test { 
public: From f46f5160e459c2db5e39811521643b87ff65a566 Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Wed, 3 Jun 2020 11:21:16 -0700 Subject: [PATCH 07/16] Improve consistency checks in VersionBuilder (#6901) Summary: The patch cleans up the code and improves the consistency checks around adding/deleting table files in `VersionBuilder`. Namely, it makes the checks stricter and improves them in the following ways: 1) A table file can now only be deleted from the LSM tree using the level it resides on. Earlier, there was some unnecessary wiggle room for trivially moved files (they could be deleted using a lower level number than the actual one). 2) A table file cannot be added to the tree if it is already present in the tree on any level (not just the target level). The earlier code only had an assertion (which is a no-op in release builds) that the newly added file is not already present on the target level. 3) The above consistency checks around state transitions are now mandatory, as opposed to the earlier `CheckConsistencyForDeletes`, which was a no-op in release mode unless `force_consistency_checks` was set to `true`. The rationale here is that assuming that the initial state is consistent, a valid transition leads to a next state that is also consistent; however, an *invalid* transition offers no such guarantee. Hence it makes sense to validate the transitions unconditionally, and save `force_consistency_checks` for the paranoid checks that re-validate the entire state. 4) The new checks build on the mechanism introduced in https://github.com/facebook/rocksdb/pull/6862, which enables us to efficiently look up the location (level and position within level) of files in a `Version` by file number. This makes the consistency checks much more efficient than the earlier `CheckConsistencyForDeletes`, which essentially performed a linear search. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6901 Test Plan: Extended the unit tests and ran: `make check` `make whitebox_crash_test` Reviewed By: ajkr Differential Revision: D21822714 Pulled By: ltamasi fbshipit-source-id: e2b29c8b6da1bf0f59004acc889e4870b2d18215 Signed-off-by: tabokie --- db/version_builder.cc | 239 ++++++++++++++++++++----------------- db/version_builder_test.cc | 233 +++++++++++++++++++++++++++++++++++- 2 files changed, 363 insertions(+), 109 deletions(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index ae43af5d2b9..b72f4251407 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -89,14 +89,16 @@ class VersionBuilder::Rep { VersionStorageInfo* base_vstorage_; int num_levels_; LevelState* levels_; - // Store states of levels larger than num_levels_. We do this instead of + // Store sizes of levels larger than num_levels_. We do this instead of // storing them in levels_ to avoid regression in case there are no files // on invalid levels. The version is not consistent if in the end the files // on invalid levels don't cancel out. - std::map> invalid_levels_; + std::unordered_map invalid_level_sizes_; // Whether there are invalid new files or invalid deletion on levels larger // than num_levels_. bool has_invalid_levels_; + // Current levels of table files affected by additions/deletions. + std::unordered_map table_file_levels_; FileComparator level_zero_cmp_; FileComparator level_nonzero_cmp_; @@ -226,137 +228,162 @@ class VersionBuilder::Rep { return Status::OK(); } - Status CheckConsistencyForDeletes(VersionEdit* edit) { -#ifdef NDEBUG - if (!base_vstorage_->force_consistency_checks()) { - // Dont run consistency checks in release mode except if - // explicitly asked to - return Status::OK(); + bool CheckConsistencyForNumLevels() const { + // Make sure there are no files on or beyond num_levels(). 
+ if (has_invalid_levels_) { + return false; } -#endif - std::unordered_map deletes; - for (const auto& del_file : edit->GetDeletedFiles()) { - const auto level = del_file.first; - const auto number = del_file.second; - if (level < num_levels_) { - deletes[number] = level; + + for (const auto& pair : invalid_level_sizes_) { + const size_t level_size = pair.second; + if (level_size != 0) { + return false; } } - // a file to be deleted better exist in the previous version - for (int l = 0; deletes.size() != 0 && l < num_levels_; l++) { - const std::vector& base_files = - base_vstorage_->LevelFiles(l); - for (size_t i = 0; i < base_files.size(); i++) { - FileMetaData* f = base_files[i]; - if (deletes.erase(f->fd.GetNumber()) != 0) { - if (deletes.size() == 0) { - break; - } - } - } + return true; + } + + int GetCurrentLevelForTableFile(uint64_t file_number) const { + auto it = table_file_levels_.find(file_number); + if (it != table_file_levels_.end()) { + return it->second; } - for (const auto& d : deletes) { - const auto number = d.first; - const auto level = d.second; - // if the file did not exist in the previous version, then it - // is possibly moved from lower level to higher level in current - // version - bool found = false; - for (int l = level + 1; l < num_levels_; l++) { - auto& level_added = levels_[l].added_files; - auto got = level_added.find(number); - if (got != level_added.end()) { - found = true; - break; - } - } - // maybe this file was added in a previous edit that was Applied - if (!found) { - auto& level_added = levels_[level].added_files; - auto got = level_added.find(number); - if (got != level_added.end()) { - found = true; - } + assert(base_vstorage_); + return base_vstorage_->GetFileLocation(file_number).GetLevel(); + } + + Status ApplyFileDeletion(int level, uint64_t file_number) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (level 
!= current_level) { + if (level >= num_levels_) { + has_invalid_levels_ = true; } - if (!found) { - fprintf(stderr, "not found %" PRIu64 "\n", number); - return Status::Corruption("not found " + NumberToString(number)); + + std::ostringstream oss; + oss << "Cannot delete table file #" << file_number << " from level " + << level << " since it is "; + if (current_level == + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + oss << "not in the LSM tree"; + } else { + oss << "on level " << current_level; } + + return Status::Corruption("VersionBuilder", oss.str()); + } + + if (level >= num_levels_) { + assert(invalid_level_sizes_[level] > 0); + --invalid_level_sizes_[level]; + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + + return Status::OK(); + } + + auto& level_state = levels_[level]; + + auto& add_files = level_state.added_files; + auto add_it = add_files.find(file_number); + if (add_it != add_files.end()) { + UnrefFile(add_it->second); + add_files.erase(add_it); + } else { + auto& del_files = level_state.deleted_files; + assert(del_files.find(file_number) == del_files.end()); + del_files.emplace(file_number); } + + table_file_levels_[file_number] = + VersionStorageInfo::FileLocation::Invalid().GetLevel(); + return Status::OK(); } - bool CheckConsistencyForNumLevels() { - // Make sure there are no files on or beyond num_levels(). 
- if (has_invalid_levels_) { - return false; - } - for (auto& level : invalid_levels_) { - if (level.second.size() > 0) { - return false; + Status ApplyFileAddition(int level, const FileMetaData& meta) { + assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); + + const uint64_t file_number = meta.fd.GetNumber(); + + const int current_level = GetCurrentLevelForTableFile(file_number); + + if (current_level != + VersionStorageInfo::FileLocation::Invalid().GetLevel()) { + if (level >= num_levels_) { + has_invalid_levels_ = true; } + + std::ostringstream oss; + oss << "Cannot add table file #" << file_number << " to level " << level + << " since it is already in the LSM tree on level " << current_level; + return Status::Corruption("VersionBuilder", oss.str()); } - return true; + + if (level >= num_levels_) { + ++invalid_level_sizes_[level]; + table_file_levels_[file_number] = level; + + return Status::OK(); + } + + auto& level_state = levels_[level]; + + auto& del_files = level_state.deleted_files; + auto del_it = del_files.find(file_number); + if (del_it != del_files.end()) { + del_files.erase(del_it); + } else { + FileMetaData* const f = new FileMetaData(meta); + f->refs = 1; + + auto& add_files = level_state.added_files; + assert(add_files.find(file_number) == add_files.end()); + add_files.emplace(file_number, f); + } + + table_file_levels_[file_number] = level; + + return Status::OK(); } // Apply all of the edits in *edit to the current state. 
Status Apply(VersionEdit* edit) { - Status s = CheckConsistency(base_vstorage_); - if (!s.ok()) { - return s; + { + const Status s = CheckConsistency(base_vstorage_); + if (!s.ok()) { + return s; + } } // Delete files - const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); - s = CheckConsistencyForDeletes(edit); - if (!s.ok()) { - return s; - } - for (const auto& del_file : del) { - const auto level = del_file.first; - const auto number = del_file.second; - if (level < num_levels_) { - levels_[level].deleted_files.insert(number); - auto existing = levels_[level].added_files.find(number); - if (existing != levels_[level].added_files.end()) { - UnrefFile(existing->second); - levels_[level].added_files.erase(existing); - } - } else { - auto existing = invalid_levels_[level].find(number); - if (existing != invalid_levels_[level].end()) { - invalid_levels_[level].erase(existing); - } else { - // Deleting an non-existing file on invalid level. - has_invalid_levels_ = true; - } + for (const auto& deleted_file : edit->GetDeletedFiles()) { + const int level = deleted_file.first; + const uint64_t file_number = deleted_file.second; + + const Status s = ApplyFileDeletion(level, file_number); + if (!s.ok()) { + return s; } } // Add new files for (const auto& new_file : edit->GetNewFiles()) { const int level = new_file.first; - if (level < num_levels_) { - FileMetaData* f = new FileMetaData(new_file.second); - f->refs = 1; - - assert(levels_[level].added_files.find(f->fd.GetNumber()) == - levels_[level].added_files.end()); - levels_[level].deleted_files.erase(f->fd.GetNumber()); - levels_[level].added_files[f->fd.GetNumber()] = f; - } else { - uint64_t number = new_file.second.fd.GetNumber(); - if (invalid_levels_[level].count(number) == 0) { - invalid_levels_[level].insert(number); - } else { - // Creating an already existing file on invalid level. 
- has_invalid_levels_ = true; - } + const FileMetaData& meta = new_file.second; + + const Status s = ApplyFileAddition(level, meta); + if (!s.ok()) { + return s; } } - return s; + + return Status::OK(); } // Save the current state in *v. @@ -536,10 +563,6 @@ Status VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { return rep_->CheckConsistency(vstorage); } -Status VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit) { - return rep_->CheckConsistencyForDeletes(edit); -} - bool VersionBuilder::CheckConsistencyForNumLevels() { return rep_->CheckConsistencyForNumLevels(); } diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 3a144190cf1..35c234fb918 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include #include #include "db/version_edit.h" #include "db/version_set.h" @@ -52,7 +53,7 @@ class VersionBuilderTest : public testing::Test { return InternalKey(ukey, smallest_seq, kTypeValue); } - void Add(int level, uint32_t file_number, const char* smallest, + void Add(int level, uint64_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, uint64_t num_entries = 0, uint64_t num_deletions = 0, @@ -275,6 +276,236 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { UnrefFilesInVersion(&new_vstorage); } +TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder builder(env_options, table_cache, &vstorage_); + + VersionEdit edit; + + constexpr int incorrect_level = 3; + 
+ edit.DeleteFile(incorrect_level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #2345 from level 3 since " + "it is on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder builder(env_options, table_cache, &vstorage_); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 1234; + + edit.DeleteFile(level, file_number); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot delete table file #1234 from level 3 since " + "it is not in the LSM tree")); +} + +TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder builder(env_options, table_cache, &vstorage_); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + VersionEdit addition; + + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction); + + ASSERT_OK(builder.Apply(&addition)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + 
ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + + Add(level, file_number, smallest, largest); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder builder(env_options, table_cache, &vstorage_); + + VersionEdit edit; + + constexpr int new_level = 2; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction); + + const Status s = builder.Apply(&edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 1")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder builder(env_options, table_cache, &vstorage_); + + VersionEdit edit; + + constexpr int level = 3; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction); + + ASSERT_OK(builder.Apply(&edit)); + + VersionEdit other_edit; + + 
constexpr int new_level = 2; + + other_edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction); + + const Status s = builder.Apply(&other_edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(std::strstr(s.getState(), + "Cannot add table file #2345 to level 2 since it is " + "already in the LSM tree on level 3")); +} + +TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { + constexpr int level = 1; + constexpr uint64_t file_number = 2345; + constexpr uint32_t path_id = 0; + constexpr uint64_t file_size = 10000; + constexpr char smallest[] = "bar"; + constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seqno = 100; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder builder(env_options, table_cache, &vstorage_); + + VersionEdit addition; + + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction); + + ASSERT_OK(builder.Apply(&addition)); + + VersionEdit deletion; + + deletion.DeleteFile(level, file_number); + + ASSERT_OK(builder.Apply(&deletion)); + + constexpr bool force_consistency_checks = false; + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid()); + + UnrefFilesInVersion(&new_vstorage); +} + +TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { + Add(0, 1U, "150", "200", 100U); + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.DeleteFile(0, 1U); + + EnvOptions env_options; + constexpr TableCache* table_cache = nullptr; + + VersionBuilder 
version_builder(env_options, table_cache, &vstorage_); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_OK(version_builder.Apply(&version_edit)); + ASSERT_OK(version_builder.SaveTo(&new_vstorage)); + + VersionBuilder version_builder2(env_options, table_cache, &new_vstorage); + VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); + ASSERT_NOK(version_builder2.Apply(&version_edit)); + + UnrefFilesInVersion(&new_vstorage); + UnrefFilesInVersion(&new_vstorage2); +} + TEST_F(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kTotalSamples = 20; const uint32_t kNumLevels = 5; From 2aba25166553d4410564f0f41b55e772e672434d Mon Sep 17 00:00:00 2001 From: Levi Tamasi Date: Thu, 11 Jun 2020 18:29:51 -0700 Subject: [PATCH 08/16] Revisit the handling of the case when a file is re-added to the same level (#6939) Summary: https://github.com/facebook/rocksdb/pull/6901 subtly changed the handling of the corner case when a table file is deleted from a level, then re-added to the same level. (Note: this should be extremely rare; one scenario that comes to mind is a trivial move followed by a call to `ReFitLevel` that moves the file back to the original level.) Before that change, a new `FileMetaData` object was created as a result of this sequence; after the change, the original `FileMetaData` was essentially resurrected (since the deletion and the addition simply cancel each other out with the change). This patch restores the original behavior, which is more intuitive considering the interface, and in sync with how trivial moves are handled. (Also note that `FileMetaData` contains some mutable data members, the values of which might be different in the resurrected object and the freshly created one.) 
The PR also fixes a bug in this area: with the original pre-6901 code, `VersionBuilder` would add the same file twice to the same level in the scenario described above. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6939 Test Plan: `make check` Reviewed By: ajkr Differential Revision: D21905580 Pulled By: ltamasi fbshipit-source-id: da07ae45384ecf3c6c53506d106432d88a7ec9df Signed-off-by: tabokie --- db/version_builder.cc | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index b72f4251407..2692b04f8f5 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -294,12 +294,12 @@ class VersionBuilder::Rep { if (add_it != add_files.end()) { UnrefFile(add_it->second); add_files.erase(add_it); - } else { - auto& del_files = level_state.deleted_files; - assert(del_files.find(file_number) == del_files.end()); - del_files.emplace(file_number); } + auto& del_files = level_state.deleted_files; + assert(del_files.find(file_number) == del_files.end()); + del_files.emplace(file_number); + table_file_levels_[file_number] = VersionStorageInfo::FileLocation::Invalid().GetLevel(); @@ -338,15 +338,15 @@ class VersionBuilder::Rep { auto del_it = del_files.find(file_number); if (del_it != del_files.end()) { del_files.erase(del_it); - } else { - FileMetaData* const f = new FileMetaData(meta); - f->refs = 1; - - auto& add_files = level_state.added_files; - assert(add_files.find(file_number) == add_files.end()); - add_files.emplace(file_number, f); } + FileMetaData* const f = new FileMetaData(meta); + f->refs = 1; + + auto& add_files = level_state.added_files; + assert(add_files.find(file_number) == add_files.end()); + add_files.emplace(file_number, f); + table_file_levels_[file_number] = level; return Status::OK(); @@ -542,11 +542,27 @@ class VersionBuilder::Rep { } void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { - if 
(levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { + const uint64_t file_number = f->fd.GetNumber(); + + const auto& level_state = levels_[level]; + + const auto& del_files = level_state.deleted_files; + const auto del_it = del_files.find(file_number); + + if (del_it != del_files.end()) { // f is to-be-deleted table file vstorage->RemoveCurrentStats(f); } else { - vstorage->AddFile(level, f, info_log_); + const auto& add_files = level_state.added_files; + const auto add_it = add_files.find(file_number); + + // Note: if the file appears both in the base version and in the added + // list, the added FileMetaData supersedes the one in the base version. + if (add_it != add_files.end() && add_it->second != f) { + vstorage->RemoveCurrentStats(f); + } else { + vstorage->AddFile(level, f, info_log_); + } } } }; From f95aaf8dac667334bb6314bd06d80749f63177bb Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 3 Aug 2021 15:10:40 +0800 Subject: [PATCH 09/16] Add an option to disable write stall (#251) * add option to disable write stall Signed-off-by: Connor1996 --- db/column_family.cc | 14 ++-- db/db_impl/db_impl.cc | 4 +- db/db_impl/db_impl.h | 1 + db/db_impl/db_impl_compaction_flush.cc | 4 +- db/db_impl/db_impl_open.cc | 2 +- db/db_options_test.cc | 97 +++++++++++++++++++++++++- include/rocksdb/options.h | 5 ++ options/cf_options.cc | 2 + options/cf_options.h | 3 + options/db_options.cc | 2 + options/options.cc | 2 + options/options_helper.cc | 6 ++ options/options_settable_test.cc | 1 + 13 files changed, 132 insertions(+), 11 deletions(-) diff --git a/db/column_family.cc b/db/column_family.cc index 54b3f7112ea..2a5493e43cc 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -738,7 +738,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( bool needed_delay = write_controller->NeedsDelay(); if (write_stall_condition == WriteStallCondition::kStopped && - write_stall_cause == WriteStallCause::kMemtableLimit) { + write_stall_cause == 
WriteStallCause::kMemtableLimit && !mutable_cf_options.disable_write_stall) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( @@ -748,7 +748,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number); } else if (write_stall_condition == WriteStallCondition::kStopped && - write_stall_cause == WriteStallCause::kL0FileCountLimit) { + write_stall_cause == WriteStallCause::kL0FileCountLimit && !mutable_cf_options.disable_write_stall) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { @@ -759,7 +759,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (write_stall_condition == WriteStallCondition::kStopped && - write_stall_cause == WriteStallCause::kPendingCompactionBytes) { + write_stall_cause == WriteStallCause::kPendingCompactionBytes && !mutable_cf_options.disable_write_stall) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); @@ -769,7 +769,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); } else if (write_stall_condition == WriteStallCondition::kDelayed && - write_stall_cause == WriteStallCause::kMemtableLimit) { + write_stall_cause == WriteStallCause::kMemtableLimit && !mutable_cf_options.disable_write_stall) { write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped, @@ -784,7 +784,7 @@ WriteStallCondition 
ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.max_write_buffer_number, write_controller->delayed_write_rate()); } else if (write_stall_condition == WriteStallCondition::kDelayed && - write_stall_cause == WriteStallCause::kL0FileCountLimit) { + write_stall_cause == WriteStallCause::kL0FileCountLimit && !mutable_cf_options.disable_write_stall) { // L0 is the last two files from stopping. bool near_stop = vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_stop_writes_trigger - 2; @@ -804,7 +804,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( name_.c_str(), vstorage->l0_delay_trigger_count(), write_controller->delayed_write_rate()); } else if (write_stall_condition == WriteStallCondition::kDelayed && - write_stall_cause == WriteStallCause::kPendingCompactionBytes) { + write_stall_cause == WriteStallCause::kPendingCompactionBytes && !mutable_cf_options.disable_write_stall) { // If the distance to hard limit is less than 1/4 of the gap between soft // and // hard bytes limit, we think it is near stop and speed up the slowdown. 
@@ -829,7 +829,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( name_.c_str(), vstorage->estimated_compaction_needed_bytes(), write_controller->delayed_write_rate()); } else { - assert(write_stall_condition == WriteStallCondition::kNormal); + assert(write_stall_condition == WriteStallCondition::kNormal || mutable_cf_options.disable_write_stall); if (vstorage->l0_delay_trigger_count() >= GetL0ThresholdSpeedupCompaction( mutable_cf_options.level0_file_num_compaction_trigger, diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 2682d4ee005..ea5364f29b1 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1012,11 +1012,13 @@ Status DBImpl::SetDBOptions( GetBGJobLimits(mutable_db_options_.max_background_flushes, mutable_db_options_.max_background_compactions, mutable_db_options_.max_background_jobs, + mutable_db_options_.base_background_compactions, /* parallelize_compactions */ true); const BGJobLimits new_bg_job_limits = GetBGJobLimits( new_options.max_background_flushes, new_options.max_background_compactions, - new_options.max_background_jobs, /* parallelize_compactions */ true); + new_options.max_background_jobs, + new_options.base_background_compactions, /* parallelize_compactions */ true); const bool max_flushes_increased = new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 65661148950..8b5381357d5 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -528,6 +528,7 @@ class DBImpl : public DB { static BGJobLimits GetBGJobLimits(int max_background_flushes, int max_background_compactions, int max_background_jobs, + int base_background_compactions, bool parallelize_compactions); // move logs pending closing from job_context to the DB queue and diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index ea5f9b15070..86985871438 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ 
b/db/db_impl/db_impl_compaction_flush.cc @@ -1965,12 +1965,14 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const { return GetBGJobLimits(mutable_db_options_.max_background_flushes, mutable_db_options_.max_background_compactions, mutable_db_options_.max_background_jobs, + mutable_db_options_.base_background_compactions, write_controller_.NeedSpeedupCompaction()); } DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes, int max_background_compactions, int max_background_jobs, + int base_background_compactions, bool parallelize_compactions) { BGJobLimits res; if (max_background_flushes == -1 && max_background_compactions == -1) { @@ -1986,7 +1988,7 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes, } if (!parallelize_compactions) { // throttle background compactions until we deem necessary - res.max_compactions = 1; + res.max_compactions = std::max(1, std::min(base_background_compactions, res.max_compactions)); } return res; } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 8e9cfe8f353..b1623f7eee2 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -57,7 +57,7 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } auto bg_job_limits = DBImpl::GetBGJobLimits( result.max_background_flushes, result.max_background_compactions, - result.max_background_jobs, true /* parallelize_compactions */); + result.max_background_jobs, result.base_background_compactions, true /* parallelize_compactions */); result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions, Env::Priority::LOW); result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes, diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 9a856587425..1a637f934b9 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -376,6 +376,88 @@ TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) { } } +TEST_F(DBOptionsTest, EnableAutoCompactionButDisableStall) { + 
const std::string kValue(1024, 'v'); + Options options; + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.disable_write_stall = true; + options.write_buffer_size = 1024 * 1024 * 10; + options.compression = CompressionType::kNoCompression; + options.level0_file_num_compaction_trigger = 1; + options.level0_stop_writes_trigger = std::numeric_limits::max(); + options.level0_slowdown_writes_trigger = std::numeric_limits::max(); + options.hard_pending_compaction_bytes_limit = + std::numeric_limits::max(); + options.soft_pending_compaction_bytes_limit = + std::numeric_limits::max(); + options.env = env_; + + DestroyAndReopen(options); + int i = 0; + for (; i < 1024; i++) { + Put(Key(i), kValue); + } + Flush(); + for (; i < 1024 * 2; i++) { + Put(Key(i), kValue); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + uint64_t l0_size = SizeAtLevel(0); + + options.hard_pending_compaction_bytes_limit = l0_size; + options.soft_pending_compaction_bytes_limit = l0_size; + + Reopen(options); + dbfull()->TEST_WaitForCompact(); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBOptionsTest::EnableAutoCompactionButDisableStall:1", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction():BeforePickCompaction", + "DBOptionsTest::EnableAutoCompactionButDisableStall:2"}, + {"DBOptionsTest::EnableAutoCompactionButDisableStall:3", + "DBImpl::BackgroundCompaction():AfterPickCompaction"}}); + // Block background compaction. + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK( + dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionButDisableStall:1"); + // Wait for stall condition recalculate. 
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionButDisableStall:2"); + + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(dbfull()->TEST_write_controler().NeedSpeedupCompaction()); + + TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionButDisableStall:3"); + + // Background compaction executed. + dbfull()->TEST_WaitForCompact(); + ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_FALSE(dbfull()->TEST_write_controler().NeedSpeedupCompaction()); +} + +TEST_F(DBOptionsTest, SetOptionsDisableWriteStall) { + Options options; + options.create_if_missing = true; + options.disable_write_stall = false; + options.env = env_; + Reopen(options); + ASSERT_EQ(false, dbfull()->GetOptions().disable_write_stall); + + ASSERT_OK(dbfull()->SetOptions({{"disable_write_stall", "true"}})); + ASSERT_EQ(true, dbfull()->GetOptions().disable_write_stall); + ASSERT_OK(dbfull()->SetOptions({{"disable_write_stall", "false"}})); + ASSERT_EQ(false, dbfull()->GetOptions().disable_write_stall); +} + TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) { Options options; options.create_if_missing = true; @@ -404,8 +486,21 @@ TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) { ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}})); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); - auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + { + auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); + } + + ASSERT_OK(dbfull()->SetDBOptions({{"base_background_compactions", "2"}})); + ASSERT_EQ(2, dbfull()->TEST_BGCompactionsAllowed()); + ASSERT_OK(dbfull()->SetDBOptions({{"base_background_compactions", "5"}})); ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); + 
ASSERT_OK(dbfull()->SetDBOptions({{"base_background_compactions", "1"}})); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + { + auto stop_token = dbfull()->TEST_write_controler().GetStopToken(); + ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed()); + } } TEST_F(DBOptionsTest, SetBackgroundFlushThreads) { diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 730ae9622d6..64e7859929b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -300,6 +300,11 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool disable_auto_compactions = false; + // Disable write stall mechanism. + // + // Dynamically changeable through SetOptions() API + bool disable_write_stall = false; + // This is a factory that provides TableFactory objects. // Default: a block-based table factory that provides a default // implementation of TableBuilder and TableReader with default diff --git a/options/cf_options.cc b/options/cf_options.cc index bf0f142f30e..d5a5d3f569e 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -148,6 +148,8 @@ void MutableCFOptions::Dump(Logger* log) const { prefix_extractor == nullptr ? 
"nullptr" : prefix_extractor->Name()); ROCKS_LOG_INFO(log, " disable_auto_compactions: %d", disable_auto_compactions); + ROCKS_LOG_INFO(log, " disable_write_stall: %d", + disable_write_stall); ROCKS_LOG_INFO(log, " soft_pending_compaction_bytes_limit: %" PRIu64, soft_pending_compaction_bytes_limit); ROCKS_LOG_INFO(log, " hard_pending_compaction_bytes_limit: %" PRIu64, diff --git a/options/cf_options.h b/options/cf_options.h index 0834497d655..8612de2de69 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -139,6 +139,7 @@ struct MutableCFOptions { inplace_update_num_locks(options.inplace_update_num_locks), prefix_extractor(options.prefix_extractor), disable_auto_compactions(options.disable_auto_compactions), + disable_write_stall(options.disable_write_stall), soft_pending_compaction_bytes_limit( options.soft_pending_compaction_bytes_limit), hard_pending_compaction_bytes_limit( @@ -179,6 +180,7 @@ struct MutableCFOptions { inplace_update_num_locks(0), prefix_extractor(nullptr), disable_auto_compactions(false), + disable_write_stall(false), soft_pending_compaction_bytes_limit(0), hard_pending_compaction_bytes_limit(0), level0_file_num_compaction_trigger(0), @@ -231,6 +233,7 @@ struct MutableCFOptions { // Compaction related options bool disable_auto_compactions; + bool disable_write_stall; uint64_t soft_pending_compaction_bytes_limit; uint64_t hard_pending_compaction_bytes_limit; int level0_file_num_compaction_trigger; diff --git a/options/db_options.cc b/options/db_options.cc index efed1ca3c9b..1fb754424a5 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -274,6 +274,8 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options) void MutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_background_jobs: %d", max_background_jobs); + ROCKS_LOG_HEADER(log, " Options.base_background_compactions: %d", + base_background_compactions); ROCKS_LOG_HEADER(log, " Options.max_background_compactions: %d", 
max_background_compactions); ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d", diff --git a/options/options.cc b/options/options.cc index a4f5a9bfa2e..c49ebf656e1 100644 --- a/options/options.cc +++ b/options/options.cc @@ -249,6 +249,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { rate_limit_delay_max_milliseconds); ROCKS_LOG_HEADER(log, " Options.disable_auto_compactions: %d", disable_auto_compactions); + ROCKS_LOG_HEADER(log, " Options.disable_write_stall: %d", + disable_write_stall); const auto& it_compaction_style = compaction_style_to_string.find(compaction_style); diff --git a/options/options_helper.cc b/options/options_helper.cc index 2605674787b..a1a998c38ad 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -165,6 +165,8 @@ ColumnFamilyOptions BuildColumnFamilyOptions( // Compaction related options cf_opts.disable_auto_compactions = mutable_cf_options.disable_auto_compactions; + cf_opts.disable_write_stall = + mutable_cf_options.disable_write_stall; cf_opts.soft_pending_compaction_bytes_limit = mutable_cf_options.soft_pending_compaction_bytes_limit; cf_opts.hard_pending_compaction_bytes_limit = @@ -1807,6 +1809,10 @@ std::unordered_map {offset_of(&ColumnFamilyOptions::disable_auto_compactions), OptionType::kBoolean, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, disable_auto_compactions)}}, + {"disable_write_stall", + {offset_of(&ColumnFamilyOptions::disable_write_stall), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, disable_write_stall)}}, {"filter_deletes", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}}, diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index d83bc22444b..b3cc8602a70 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -456,6 +456,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { 
"compaction_pri=kMinOverlappingRatio;" "hard_pending_compaction_bytes_limit=0;" "disable_auto_compactions=false;" + "disable_write_stall=false;" "report_bg_io_stats=true;" "ttl=60;" "periodic_compaction_seconds=3600;" From 3de4cc218406258204579ef1f3ef109d2b8e9390 Mon Sep 17 00:00:00 2001 From: LemonHX Date: Fri, 13 Aug 2021 16:02:26 +0800 Subject: [PATCH 10/16] Add bitmap control for perf context (#237) * `PerfFlag` implementation aside the `PerfLevel` design spec in on: https://docs.google.com/document/d/1JYmWMIZwYV0AZW6rNv_oFVZLBCAkxLLYslt39eEZstM/edit?usp=sharing Signed-off-by: lemonhx Co-authored-by: Xinye Tao --- CMakeLists.txt | 1 + db/c.cc | 12 ++++ db/perf_context_test.cc | 20 ++++++ include/rocksdb/c.h | 3 + include/rocksdb/perf_flag.h | 20 ++++++ include/rocksdb/perf_flag_defs.h | 108 +++++++++++++++++++++++++++++++ monitoring/iostats_context_imp.h | 13 ++-- monitoring/perf_context_imp.h | 68 +++++++++---------- monitoring/perf_flag.cc | 29 +++++++++ monitoring/perf_flag_imp.h | 10 +++ monitoring/perf_step_timer.h | 6 +- src.mk | 1 + 12 files changed, 250 insertions(+), 41 deletions(-) create mode 100644 include/rocksdb/perf_flag.h create mode 100644 include/rocksdb/perf_flag_defs.h create mode 100644 monitoring/perf_flag.cc create mode 100644 monitoring/perf_flag_imp.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 441173a29a1..b66f0dfaee9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -585,6 +585,7 @@ set(SOURCES monitoring/iostats_context.cc monitoring/perf_context.cc monitoring/perf_level.cc + monitoring/perf_flag.cc monitoring/persistent_stats_history.cc monitoring/statistics.cc monitoring/thread_status_impl.cc diff --git a/db/c.cc b/db/c.cc index 144d8f90ded..63fb575f855 100644 --- a/db/c.cc +++ b/db/c.cc @@ -25,6 +25,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" +#include "rocksdb/perf_flag.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" #include 
"rocksdb/statistics.h" @@ -113,6 +114,9 @@ using rocksdb::Checkpoint; using rocksdb::TransactionLogIterator; using rocksdb::BatchResult; using rocksdb::PerfLevel; +using rocksdb::EnablePerfFlag; +using rocksdb::DisablePerfFlag; +using rocksdb::CheckPerfFlag; using rocksdb::PerfContext; using rocksdb::MemoryUtil; @@ -2748,6 +2752,14 @@ void rocksdb_set_perf_level(int v) { SetPerfLevel(level); } +void rocksdb_enable_perf_flag(uint64_t flag) { EnablePerfFlag(flag); } + +void rocksdb_disable_perf_flag(uint64_t flag) { DisablePerfFlag(flag); } + +int rocksdb_check_perf_flag(uint64_t flag) { + return static_cast(CheckPerfFlag(flag)); +} + rocksdb_perfcontext_t* rocksdb_perfcontext_create() { rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t; context->rep = rocksdb::get_perf_context(); diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 94eabff7ff5..4f0d74e52f8 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -938,6 +938,26 @@ TEST_F(PerfContextTest, CPUTimer) { ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos); } } + +TEST_F(PerfContextTest, BitMapControl) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + SetPerfLevel(PerfLevel::kDisable); + EnablePerfFlag(flag_user_key_comparison_count); + EnablePerfFlag(flag_write_wal_time); + + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string i_str = ToString(i); + std::string key = "k" + i_str; + std::string value = "v" + i_str; + + db->Put(write_options, key, value); + } + ASSERT_GT(get_perf_context()->user_key_comparison_count, 0); + ASSERT_GT(get_perf_context()->write_wal_time, 0); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index f21dbcf7d4a..1db70697df1 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1120,6 +1120,9 @@ enum { }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API void 
rocksdb_enable_perf_flag(uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_disable_perf_flag(uint64_t); +extern ROCKSDB_LIBRARY_API int rocksdb_check_perf_flag(uint64_t); extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( rocksdb_perfcontext_t* context); diff --git a/include/rocksdb/perf_flag.h b/include/rocksdb/perf_flag.h new file mode 100644 index 00000000000..57536b9a7c5 --- /dev/null +++ b/include/rocksdb/perf_flag.h @@ -0,0 +1,20 @@ +// complements to perf_level +#pragma once + +#include + +#include "perf_flag_defs.h" + +#define GET_FLAG(flag) perf_flags[(uint64_t)(flag) >> 3] + +// FLAGS_LEN = ceiling(FLAG_END / bits(uint8_t)) +#define FLAGS_LEN \ + (((uint64_t)FLAG_END & (uint64_t)0b111) == 0 \ + ? ((uint64_t)FLAG_END >> 3) \ + : ((uint64_t)FLAG_END >> 3) + 1) + +namespace rocksdb { +void EnablePerfFlag(uint64_t flag); +void DisablePerfFlag(uint64_t flag); +bool CheckPerfFlag(uint64_t flag); +} // namespace rocksdb diff --git a/include/rocksdb/perf_flag_defs.h b/include/rocksdb/perf_flag_defs.h new file mode 100644 index 00000000000..d2cddf98e25 --- /dev/null +++ b/include/rocksdb/perf_flag_defs.h @@ -0,0 +1,108 @@ +#pragma once +#include + +enum { + flag_user_key_comparison_count = 0, + flag_block_cache_hit_count, + flag_block_read_count, + flag_block_read_byte, + flag_block_read_time, + flag_block_cache_index_hit_count, + flag_index_block_read_count, + flag_block_cache_filter_hit_count, + flag_filter_block_read_count, + flag_compression_dict_block_read_count, + flag_block_checksum_time, + flag_block_decompress_time, + flag_get_read_bytes, + flag_multiget_read_bytes, + flag_iter_read_bytes, + flag_internal_key_skipped_count, + flag_internal_delete_skipped_count, + flag_internal_recent_skipped_count, + flag_internal_merge_count, + flag_get_snapshot_time, + flag_get_from_memtable_time, + flag_get_from_memtable_count, + flag_get_post_process_time, + 
flag_get_from_output_files_time, + flag_seek_on_memtable_time, + flag_seek_on_memtable_count, + flag_next_on_memtable_count, + flag_prev_on_memtable_count, + flag_seek_child_seek_time, + flag_seek_child_seek_count, + flag_seek_min_heap_time, + flag_seek_max_heap_time, + flag_seek_internal_seek_time, + flag_find_next_user_entry_time, + flag_write_wal_time, + flag_write_memtable_time, + flag_write_delay_time, + flag_write_scheduling_flushes_compactions_time, + flag_write_pre_and_post_process_time, + flag_write_thread_wait_nanos, + flag_db_mutex_lock_nanos, + flag_db_condition_wait_nanos, + flag_merge_operator_time_nanos, + flag_read_index_block_nanos, + flag_read_filter_block_nanos, + flag_new_table_block_iter_nanos, + flag_new_table_iterator_nanos, + flag_block_seek_nanos, + flag_find_table_nanos, + flag_bloom_memtable_hit_count, + flag_bloom_memtable_miss_count, + flag_bloom_sst_hit_count, + flag_bloom_sst_miss_count, + flag_key_lock_wait_time, + flag_key_lock_wait_count, + flag_env_new_sequential_file_nanos, + flag_env_new_random_access_file_nanos, + flag_env_new_writable_file_nanos, + flag_env_reuse_writable_file_nanos, + flag_env_new_random_rw_file_nanos, + flag_env_new_directory_nanos, + flag_env_file_exists_nanos, + flag_env_get_children_nanos, + flag_env_get_children_file_attributes_nanos, + flag_env_delete_file_nanos, + flag_env_create_dir_nanos, + flag_env_create_dir_if_missing_nanos, + flag_env_delete_dir_nanos, + flag_env_get_file_size_nanos, + flag_env_get_file_modification_time_nanos, + flag_env_rename_file_nanos, + flag_env_link_file_nanos, + flag_env_lock_file_nanos, + flag_env_unlock_file_nanos, + flag_env_new_logger_nanos, + flag_get_cpu_nanos, + flag_iter_next_cpu_nanos, + flag_iter_prev_cpu_nanos, + flag_iter_seek_cpu_nanos, + flag_encrypt_data_nanos, + flag_decrypt_data_nanos, + + flag_get_from_table_nanos, + flag_user_key_return_count, + flag_block_cache_miss_count, + flag_bloom_filter_full_positive, + flag_bloom_filter_useful, + 
flag_bloom_filter_full_true_positive, + + flag_bytes_read, + flag_bytes_written, + flag_open_nanos, + flag_allocate_nanos, + flag_write_nanos, + flag_read_nanos, + flag_range_sync_nanos, + flag_prepare_write_nanos, + flag_fsync_nanos, + flag_logger_nanos, + flag_cpu_read_nanos, + flag_cpu_write_nanos, + // Should always be the last + FLAG_END +}; diff --git a/monitoring/iostats_context_imp.h b/monitoring/iostats_context_imp.h index 19e34209b1b..41b432543ba 100644 --- a/monitoring/iostats_context_imp.h +++ b/monitoring/iostats_context_imp.h @@ -33,15 +33,16 @@ extern __thread IOStatsContext iostats_context; #define IOSTATS(metric) (iostats_context.metric) // Declare and set start time of the timer -#define IOSTATS_TIMER_GUARD(metric) \ - PerfStepTimer iostats_step_timer_##metric(&(iostats_context.metric)); \ +#define IOSTATS_TIMER_GUARD(metric) \ + PerfStepTimer iostats_step_timer_##metric(&(iostats_context.metric), \ + CheckPerfFlag(flag_##metric)); \ iostats_step_timer_##metric.Start(); // Declare and set start time of the timer -#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ - PerfStepTimer iostats_step_timer_##metric( \ - &(iostats_context.metric), env, true, \ - PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ +#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ + PerfStepTimer iostats_step_timer_##metric( \ + &(iostats_context.metric), CheckPerfFlag(flag_##metric), env, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); #else // ROCKSDB_SUPPORT_THREAD_LOCAL diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index e0ff8afc58e..ed61e6ec26f 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -37,29 +37,31 @@ extern thread_local PerfContext perf_context; #define PERF_TIMER_START(metric) perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD(metric) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ 
+#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), \ + CheckPerfFlag(flag_##metric)); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ +#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), \ + CheckPerfFlag(flag_##metric), env); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_CPU_TIMER_GUARD(metric, env) \ - PerfStepTimer perf_step_timer_##metric( \ - &(perf_context.metric), env, true, \ - PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ +#define PERF_CPU_TIMER_GUARD(metric, env) \ + PerfStepTimer perf_step_timer_##metric( \ + &(perf_context.metric), CheckPerfFlag(flag_##metric), env, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ - ticker_type) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ - false, PerfLevel::kEnableTime, stats, \ - ticker_type); \ - if (condition) { \ - perf_step_timer_##metric.Start(); \ +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) \ + PerfStepTimer perf_step_timer_##metric( \ + &(perf_context.metric), CheckPerfFlag(flag_##metric), nullptr, false, \ + PerfLevel::kEnableTime, stats, ticker_type); \ + if (condition) { \ + perf_step_timer_##metric.Start(); \ } // Update metric with time elapsed since last START. 
start time is reset @@ -67,27 +69,27 @@ extern thread_local PerfContext perf_context; #define PERF_TIMER_MEASURE(metric) perf_step_timer_##metric.Measure(); // Increase metric value -#define PERF_COUNTER_ADD(metric, value) \ - if (perf_level >= PerfLevel::kEnableCount) { \ - perf_context.metric += value; \ +#define PERF_COUNTER_ADD(metric, value) \ + if (perf_level >= PerfLevel::kEnableCount || CheckPerfFlag(flag_##metric)) { \ + perf_context.metric += value; \ } // Increase metric value -#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ - if (perf_level >= PerfLevel::kEnableCount && \ - perf_context.per_level_perf_context_enabled && \ - perf_context.level_to_perf_context) { \ - if ((*(perf_context.level_to_perf_context)).find(level) != \ - (*(perf_context.level_to_perf_context)).end()) { \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - else { \ - PerfContextByLevel empty_context; \ - (*(perf_context.level_to_perf_context))[level] = empty_context; \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ - } \ - } \ +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if ((perf_level >= PerfLevel::kEnableCount || \ + CheckPerfFlag(flag_##metric)) && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } #endif -} +} // namespace rocksdb diff --git a/monitoring/perf_flag.cc b/monitoring/perf_flag.cc new file mode 100644 index 00000000000..a3bdbda353e --- /dev/null +++ b/monitoring/perf_flag.cc @@ -0,0 +1,29 @@ +#include "rocksdb/perf_flag.h" + +namespace rocksdb { +#ifdef 
ROCKSDB_SUPPORT_THREAD_LOCAL +__thread uint8_t perf_flags[FLAGS_LEN] = {0}; +#else +uint8_t perf_flags[FLAGS_LEN] = {0}; +#endif + +void EnablePerfFlag(uint64_t flag) { + if (!CheckPerfFlag(flag)) { + // & 0b111 means find the flag location is a alternative way to do mod + // operation + GET_FLAG(flag) ^= (uint64_t)0b1 << ((uint64_t)flag & (uint64_t)0b111); + } +} + +void DisablePerfFlag(uint64_t flag) { + if (CheckPerfFlag(flag)) { + GET_FLAG(flag) ^= (uint64_t)0b1 << ((uint64_t)flag & (uint64_t)0b111); + } +} + +bool CheckPerfFlag(uint64_t flag) { + return ((uint64_t)GET_FLAG(flag) & + (uint64_t)0b1 << (flag & (uint64_t)0b111)) != 0; +} + +} // namespace rocksdb diff --git a/monitoring/perf_flag_imp.h b/monitoring/perf_flag_imp.h new file mode 100644 index 00000000000..453c5e03db8 --- /dev/null +++ b/monitoring/perf_flag_imp.h @@ -0,0 +1,10 @@ +#include +#include "rocksdb/perf_flag.h" + +namespace rocksdb { +#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +extern __thread uint8_t perf_flags[FLAGS_LEN]; +#else +extern uint8_t perf_flags[FLAGS_LEN]; +#endif +} diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index 6501bd54aba..85fb2b13cbd 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
// #pragma once +#include "monitoring/perf_flag_imp.h" #include "monitoring/perf_level_imp.h" #include "rocksdb/env.h" #include "util/stop_watch.h" @@ -13,10 +14,11 @@ namespace rocksdb { class PerfStepTimer { public: explicit PerfStepTimer( - uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + uint64_t* metric, bool enable_flag = false, Env* env = nullptr, + bool use_cpu_time = false, PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, Statistics* statistics = nullptr, uint32_t ticker_type = 0) - : perf_counter_enabled_(perf_level >= enable_level), + : perf_counter_enabled_(perf_level >= enable_level || enable_flag), use_cpu_time_(use_cpu_time), env_((perf_counter_enabled_ || statistics != nullptr) ? ((env != nullptr) ? env : Env::Default()) diff --git a/src.mk b/src.mk index 12bb2e0601a..13923612f6d 100644 --- a/src.mk +++ b/src.mk @@ -94,6 +94,7 @@ LIB_SOURCES = \ monitoring/iostats_context.cc \ monitoring/perf_context.cc \ monitoring/perf_level.cc \ + monitoring/perf_flag.cc \ monitoring/persistent_stats_history.cc \ monitoring/statistics.cc \ monitoring/thread_status_impl.cc \ From 6b97c0567c4694ef1d06374d54bf582e541d215d Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 18 Aug 2021 12:28:24 +0800 Subject: [PATCH 11/16] reduce file lookups before purging and fix double referencing files (#250) This PR contains a optimization to reduce DB mutex held time when generating obsolete files purging jobs: Obsolete file candidates consist of `obsolete_files_` that are no longer referenced by any version in `VersionSet`, or all physical files in DB directory if `doing_full_scan` is true. Consider the former case, `obsolete_files_` is guaranteed to be obsolete by protocol and thus no need to be checked against live files list. This PR avoids the unnecessary live file lookups when not doing full scan. The optimization assumes a solid reference counting for sst files, i.e. 
each physical SST file is represented by an unique `FileMetaData` structure. But this assumption is actually broken in existing codebase: A trivially moved file will appear as a new file meta with the same file number. This patch also fix this issue by forwarding moved old files in `VersionBuilder`. Aside from that, this PR also assumes SST file won't be moved to a different path with the same file number, and will raise error in that case. Finally, this PR shows impressive performance improvement in our own testing (thanks @5kbpers). We examine the patch in an artificial workload, where iterators are frequently created and released (active iterator count around 250, RocksDB write OPS around 20K). The logging shows before patching, `AddLiveFiles()` call takes 100+ms and in that process copies 2500K file metas with 60K of them being unique. Test Plan: - Extend existing unit tests to simulate trivial file movement and incorrect file editing. Benchmark Results: [TPC-C 5K WH 1 KV 512 threads] Before: TPM: 23676.8, 50th(ms): 570.4, 90th(ms): 704.6, 95th(ms): 738.2, 99th(ms): 872.4, 99.9th(ms): 5637.1, Max(ms): 12884.9 After: TPM: 24395.1(3%+), 50th(ms): 570.4, 90th(ms): 704.6, 95th(ms): 738.2, 99th(ms): 838.9, 99.9th(ms): 1342.2(76%-), Max(ms): 2952.8(77%-) Signed-off-by: Xinye Tao --- HISTORY.md | 2 + db/builder.h | 2 +- db/compaction/compaction_picker_test.cc | 1 - db/db_filesnapshot.cc | 4 +- db/db_impl/db_impl_files.cc | 23 ++-- db/file_indexer.h | 2 +- db/forward_iterator.h | 2 +- db/job_context.h | 3 +- db/version_builder.cc | 149 +++++++++++++---------- db/version_builder.h | 3 +- db/version_builder_test.cc | 154 +++++++++++++++++++----- db/version_edit.h | 23 +++- db/version_set.cc | 77 +++++++----- db/version_set.h | 15 ++- db/version_set_test.cc | 11 +- 15 files changed, 314 insertions(+), 157 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5325ef3cb11..09492305577 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -22,6 +22,8 @@ * Fix a bug in which a 
snapshot read could be affected by a DeleteRange after the snapshot (#6062). * `WriteBatchWithIndex::DeleteRange` returns `Status::NotSupported`. Previously it returned success even though reads on the batch did not account for range tombstones. The corresponding language bindings now cannot be used. In C, that includes `rocksdb_writebatch_wi_delete_range`, `rocksdb_writebatch_wi_delete_range_cf`, `rocksdb_writebatch_wi_delete_rangev`, and `rocksdb_writebatch_wi_delete_rangev_cf`. In Java, that includes `WriteBatchWithIndex::deleteRange`. +### Performance Improvements +* When gathering unreferenced obsolete files for purging, file metas associated with active versions will no longer be copied for double-check. Updated VersionBuilder to make sure each physical file is reference counted by at most one FileMetaData. ## 6.4.6 (10/16/2019) * Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. ::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. diff --git a/db/builder.h b/db/builder.h index 4fa56f50e34..1844efd60a6 100644 --- a/db/builder.h +++ b/db/builder.h @@ -25,7 +25,7 @@ namespace rocksdb { struct Options; -struct FileMetaData; +class FileMetaData; class Env; struct EnvOptions; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 89feae141a4..c75b14a7946 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -96,7 +96,6 @@ class CompactionPickerTest : public testing::Test { f->fd.largest_seqno = largest_seq; f->compensated_file_size = (compensated_file_size != 0) ? 
compensated_file_size : file_size; - f->refs = 0; vstorage_->AddFile(level, f); files_.emplace_back(f); file_map_.insert({file_number, {f, level}}); diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 67d994f5568..a3bffe1dde0 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -119,7 +119,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, } // Make a set of all of the live *.sst files - std::vector live; + std::vector live; for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->IsDropped()) { continue; @@ -133,7 +133,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // create names of the live files. The names are not absolute // paths, instead they are relative to dbname_; for (const auto& live_file : live) { - ret.push_back(MakeTableFileName("", live_file.GetNumber())); + ret.push_back(MakeTableFileName("", live_file)); } ret.push_back(CurrentFileName("")); diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index dc6ffa437ed..a3cc0a92d96 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -35,11 +35,13 @@ uint64_t DBImpl::MinObsoleteSstNumberToKeep() { return std::numeric_limits::max(); } -// * Returns the list of live files in 'sst_live' // If it's doing full scan: // * Returns the list of all files in the filesystem in // 'full_scan_candidate_files'. -// Otherwise, gets obsolete files from VersionSet. +// * Returns the list of live files in 'sst_live'. +// Otherwise: +// * Gets obsolete files from VersionSet. 
+// // no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every // mutable_db_options_.delete_obsolete_files_period_micros @@ -103,7 +105,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, job_context->log_number = MinLogNumberToKeep(); job_context->prev_log_number = versions_->prev_log_number(); - versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), dbname_); @@ -235,6 +236,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, log_recycle_files_.end()); if (job_context->HaveSomethingToDelete()) { ++pending_purge_obsolete_files_; + if (doing_the_full_scan) { + versions_->AddLiveFiles(&job_context->sst_live); + } } logs_to_free_.clear(); } @@ -303,12 +307,9 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // FindObsoleteFiles() should've populated this so nonzero assert(state.manifest_file_number != 0); - // Now, convert live list to an unordered map, WITHOUT mutex held; - // set is slow. - std::unordered_map sst_live_map; - for (const FileDescriptor& fd : state.sst_live) { - sst_live_map[fd.GetNumber()] = &fd; - } + // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow. 
+ std::unordered_set sst_live_set(state.sst_live.begin(), + state.sst_live.end()); std::unordered_set log_recycle_files_set( state.log_recycle_files.begin(), state.log_recycle_files.end()); @@ -407,7 +408,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { case kTableFile: // If the second condition is not there, this makes // DontDeletePendingOutputs fail - keep = (sst_live_map.find(number) != sst_live_map.end()) || + keep = (sst_live_set.find(number) != sst_live_set.end()) || number >= state.min_pending_output; if (!keep) { files_to_del.insert(number); @@ -422,7 +423,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { // // TODO(yhchiang): carefully modify the third condition to safely // remove the temp options files. - keep = (sst_live_map.find(number) != sst_live_map.end()) || + keep = (sst_live_set.find(number) != sst_live_set.end()) || (number == state.pending_manifest_file_number) || (to_delete.find(kOptionsFileNamePrefix) != std::string::npos); break; diff --git a/db/file_indexer.h b/db/file_indexer.h index 2091f80292b..98180c1ce2e 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -19,7 +19,7 @@ namespace rocksdb { class Comparator; -struct FileMetaData; +class FileMetaData; struct FdWithKeyRange; struct FileLevel; diff --git a/db/forward_iterator.h b/db/forward_iterator.h index fb73f458edd..39121b92853 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -25,7 +25,7 @@ struct SuperVersion; class ColumnFamilyData; class ForwardLevelIterator; class VersionStorageInfo; -struct FileMetaData; +class FileMetaData; class MinIterComparator { public: diff --git a/db/job_context.h b/db/job_context.h index 3978fad33c9..64415a60cf2 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -140,7 +140,8 @@ struct JobContext { std::vector full_scan_candidate_files; // the list of all live sst files that cannot be deleted - std::vector sst_live; + // (filled only if we're doing full scan) + 
std::vector sst_live; // a list of sst files that we need to delete std::vector sst_delete_files; diff --git a/db/version_builder.cc b/db/version_builder.cc index 2692b04f8f5..467d16e12e7 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -78,8 +78,13 @@ class VersionBuilder::Rep { }; struct LevelState { - std::unordered_set deleted_files; - // Map from file number to file meta data. + // Files in base version that should be deleted. + std::unordered_set deleted_base_files; + // Files moved from base version. + // Those files will not be referenced by VersionBuilder. + std::unordered_map moved_files; + // Files added, must not intersect with moved_files. + // Those files will be referenced during the lifetime of VersionBuilder. std::unordered_map added_files; }; @@ -130,8 +135,7 @@ class VersionBuilder::Rep { } void UnrefFile(FileMetaData* f) { - f->refs--; - if (f->refs <= 0) { + if (f->Unref()) { if (f->table_reader_handle) { assert(table_cache_ != nullptr); table_cache_->ReleaseHandle(f->table_reader_handle); @@ -296,7 +300,13 @@ class VersionBuilder::Rep { add_files.erase(add_it); } - auto& del_files = level_state.deleted_files; + auto& moved_files = level_state.moved_files; + auto moved_it = moved_files.find(file_number); + if (moved_it != moved_files.end()) { + moved_files.erase(moved_it); + } + + auto& del_files = level_state.deleted_base_files; assert(del_files.find(file_number) == del_files.end()); del_files.emplace(file_number); @@ -333,19 +343,30 @@ class VersionBuilder::Rep { } auto& level_state = levels_[level]; - - auto& del_files = level_state.deleted_files; - auto del_it = del_files.find(file_number); - if (del_it != del_files.end()) { - del_files.erase(del_it); + // Try to reuse file meta from base version. + FileMetaData* f = base_vstorage_->GetFileMetaDataByNumber(file_number); + std::unordered_map* container = nullptr; + if (f != nullptr) { + // This should be a file trivially moved to a new position. 
Make sure the + // two are the same physical file. + if (f->fd.GetPathId() != meta.fd.GetPathId()) { + std::ostringstream oss; + oss << "Cannot add table file #" << file_number << " to level " << level + << " by trivial move since it isn't trivial to move to a " + << "different path"; + return Status::Corruption("VersionBuilder", oss.str()); + } + // No need to Ref() this file held by base_vstorage_. + container = &level_state.moved_files; + } else { + f = new FileMetaData(meta); + // Will drop reference in dtor. + f->Ref(); + container = &level_state.added_files; } - FileMetaData* const f = new FileMetaData(meta); - f->refs = 1; - - auto& add_files = level_state.added_files; - assert(add_files.find(file_number) == add_files.end()); - add_files.emplace(file_number, f); + assert(container && container->find(file_number) == container->end()); + container->emplace(file_number, f); table_file_levels_[file_number] = level; @@ -403,39 +424,70 @@ class VersionBuilder::Rep { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. const auto& base_files = base_vstorage_->LevelFiles(level); + const auto& del_files = levels_[level].deleted_base_files; const auto& unordered_added_files = levels_[level].added_files; + const auto& unordered_moved_files = levels_[level].moved_files; + vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); - // Sort added files for the level. - std::vector added_files; - added_files.reserve(unordered_added_files.size()); + // Sort delta files for the level. + std::vector delta_files; + delta_files.reserve(unordered_added_files.size() + + unordered_moved_files.size()); for (const auto& pair : unordered_added_files) { - added_files.push_back(pair.second); + delta_files.push_back(pair.second); + } + for (const auto& pair : unordered_moved_files) { + // SaveTo will always be called under db mutex. 
+ pair.second->being_moved_to = level; + delta_files.push_back(pair.second); } - std::sort(added_files.begin(), added_files.end(), cmp); + std::sort(delta_files.begin(), delta_files.end(), cmp); #ifndef NDEBUG - FileMetaData* prev_added_file = nullptr; - for (const auto& added : added_files) { - if (level > 0 && prev_added_file != nullptr) { + FileMetaData* prev_file = nullptr; + for (const auto& delta : delta_files) { + if (level > 0 && prev_file != nullptr) { assert(base_vstorage_->InternalComparator()->Compare( - prev_added_file->smallest, added->smallest) <= 0); + prev_file->smallest, delta->smallest) <= 0); } - prev_added_file = added; + prev_file = delta; } #endif auto base_iter = base_files.begin(); auto base_end = base_files.end(); - auto added_iter = added_files.begin(); - auto added_end = added_files.end(); - while (added_iter != added_end || base_iter != base_end) { - if (base_iter == base_end || - (added_iter != added_end && cmp(*added_iter, *base_iter))) { - MaybeAddFile(vstorage, level, *added_iter++); + auto delta_iter = delta_files.begin(); + auto delta_end = delta_files.end(); + + // Delta file supersedes base file because base is masked by + // deleted_base_files. + while (delta_iter != delta_end || base_iter != base_end) { + if (delta_iter == delta_end || + (base_iter != base_end && cmp(*base_iter, *delta_iter))) { + FileMetaData* f = *base_iter++; + const uint64_t file_number = f->fd.GetNumber(); + + if (del_files.find(file_number) != del_files.end()) { + // vstorage inherited base_vstorage_'s stats, need to account for + // deleted base files. 
+ vstorage->RemoveCurrentStats(f); + } else { +#ifndef NDEBUG + assert(unordered_added_files.find(file_number) == + unordered_added_files.end()); +#endif + vstorage->AddFile(level, f, info_log_); + } } else { - MaybeAddFile(vstorage, level, *base_iter++); + FileMetaData* f = *delta_iter++; + if (f->init_stats_from_file) { + // A moved file whose stats is inited by base_vstorage_ and then + // deleted from it. + vstorage->UpdateAccumulatedStats(f); + } + vstorage->AddFile(level, f, info_log_); } } } @@ -540,31 +592,6 @@ class VersionBuilder::Rep { } return Status::OK(); } - - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { - const uint64_t file_number = f->fd.GetNumber(); - - const auto& level_state = levels_[level]; - - const auto& del_files = level_state.deleted_files; - const auto del_it = del_files.find(file_number); - - if (del_it != del_files.end()) { - // f is to-be-deleted table file - vstorage->RemoveCurrentStats(f); - } else { - const auto& add_files = level_state.added_files; - const auto add_it = add_files.find(file_number); - - // Note: if the file appears both in the base version and in the added - // list, the added FileMetaData supersedes the one in the base version. 
- if (add_it != add_files.end() && add_it->second != f) { - vstorage->RemoveCurrentStats(f); - } else { - vstorage->AddFile(level, f, info_log_); - } - } - } }; VersionBuilder::VersionBuilder(const EnvOptions& env_options, @@ -597,10 +624,4 @@ Status VersionBuilder::LoadTableHandlers( prefetch_index_and_filter_in_cache, is_initial_load, prefix_extractor); } - -void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, - FileMetaData* f) { - rep_->MaybeAddFile(vstorage, level, f); -} - } // namespace rocksdb diff --git a/db/version_builder.h b/db/version_builder.h index bdd3b151ef3..504ee58c078 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -18,7 +18,7 @@ namespace rocksdb { class TableCache; class VersionStorageInfo; class VersionEdit; -struct FileMetaData; +class FileMetaData; class InternalStats; // A helper class so we can efficiently apply a whole sequence @@ -38,7 +38,6 @@ class VersionBuilder { bool prefetch_index_and_filter_in_cache, bool is_initial_load, const SliceTransform* prefix_extractor); - void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); private: class Rep; diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 35c234fb918..2f049ef2a64 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -41,7 +41,7 @@ class VersionBuilderTest : public testing::Test { ~VersionBuilderTest() override { for (int i = 0; i < vstorage_.num_levels(); i++) { for (auto* f : vstorage_.LevelFiles(i)) { - if (--f->refs == 0) { + if (f->Unref()) { delete f; } } @@ -67,7 +67,6 @@ class VersionBuilderTest : public testing::Test { f->fd.smallest_seqno = smallest_seqno; f->fd.largest_seqno = largest_seqno; f->compensated_file_size = file_size; - f->refs = 0; f->num_entries = num_entries; f->num_deletions = num_deletions; vstorage_.AddFile(level, f); @@ -88,10 +87,30 @@ class VersionBuilderTest : public testing::Test { } }; +// Check that one file number is mapped to one unique 
FileMetaData in a series +// of versions. +struct FileReferenceChecker { + std::unordered_map files; + + bool Check(const VersionStorageInfo* vstorage) { + for (int i = 0; i < vstorage->num_levels(); i++) { + for (auto* f : vstorage->LevelFiles(i)) { + auto it = files.find(f->fd.GetNumber()); + if (it == files.end()) { + files[f->fd.GetNumber()] = f; + } else if (it->second != f) { + return false; + } + } + } + return true; + } +}; + void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) { for (int i = 0; i < new_vstorage->num_levels(); i++) { for (auto* f : new_vstorage->LevelFiles(i)) { - if (--f->refs == 0) { + if (f->Unref()) { delete f; } } @@ -325,45 +344,121 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) { TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { constexpr int level = 1; constexpr uint64_t file_number = 2345; + constexpr uint64_t path_id = 0; + constexpr uint64_t file_size = 1024; constexpr char smallest[] = "bar"; constexpr char largest[] = "foo"; + constexpr SequenceNumber smallest_seq = 100; + constexpr SequenceNumber largest_seq = 100; + constexpr uint64_t num_entries = 1000; + constexpr uint64_t num_deletions = 0; + constexpr bool sampled = true; + constexpr SequenceNumber smallest_seqno = 1; + constexpr SequenceNumber largest_seqno = 1000; + constexpr bool marked_for_compaction = false; + constexpr bool force_consistency_checks = false; - Add(level, file_number, smallest, largest); + Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq, + largest_seq, num_entries, num_deletions, sampled, smallest_seqno, + largest_seqno); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; - VersionBuilder builder(env_options, table_cache, &vstorage_); - VersionEdit deletion; - deletion.DeleteFile(level, file_number); - ASSERT_OK(builder.Apply(&deletion)); - - VersionEdit addition; - - constexpr uint32_t path_id = 0; - constexpr uint64_t file_size = 10000; - constexpr SequenceNumber 
smallest_seqno = 100; - constexpr SequenceNumber largest_seqno = 1000; - constexpr bool marked_for_compaction = false; - - addition.AddFile(level, file_number, path_id, file_size, - GetInternalKey(smallest), GetInternalKey(largest), - smallest_seqno, largest_seqno, marked_for_compaction); - - ASSERT_OK(builder.Apply(&addition)); + { + VersionBuilder builder(env_options, table_cache, &vstorage_); + ASSERT_OK(builder.Apply(&deletion)); + VersionEdit addition; + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey("181", smallest_seq), + GetInternalKey("798", largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction); + ASSERT_OK(builder.Apply(&addition)); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level); + FileReferenceChecker checker; + ASSERT_TRUE(checker.Check(&vstorage_)); + ASSERT_TRUE(checker.Check(&new_vstorage)); + UnrefFilesInVersion(&new_vstorage); + } - constexpr bool force_consistency_checks = false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + { // Move to a higher level. 
+ VersionBuilder builder(env_options, table_cache, &vstorage_); + ASSERT_OK(builder.Apply(&deletion)); + VersionEdit addition; + addition.AddFile(level + 1, file_number, path_id, file_size, + GetInternalKey("181", smallest_seq), + GetInternalKey("798", largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction); + ASSERT_OK(builder.Apply(&addition)); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level + 1); + // File movement should not change key estimation. + ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(), + new_vstorage.GetEstimatedActiveKeys()); + FileReferenceChecker checker; + ASSERT_TRUE(checker.Check(&vstorage_)); + ASSERT_TRUE(checker.Check(&new_vstorage)); + UnrefFilesInVersion(&new_vstorage); + } - ASSERT_OK(builder.SaveTo(&new_vstorage)); - ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level); + { // Move to a different path. + VersionBuilder builder(env_options, table_cache, &vstorage_); + ASSERT_OK(builder.Apply(&deletion)); + VersionEdit addition; + addition.AddFile(level, file_number, path_id + 1, file_size, + GetInternalKey("181", smallest_seq), + GetInternalKey("798", largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction); + const Status s = builder.Apply(&addition); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + std::strstr(s.getState(), + "Cannot add table file #2345 to level 1 by trivial " + "move since it isn't trivial to move to a different " + "path")); + } - UnrefFilesInVersion(&new_vstorage); + { // Move twice. 
+ VersionBuilder builder(env_options, table_cache, &vstorage_); + ASSERT_OK(builder.Apply(&deletion)); + VersionEdit addition_1; + addition_1.AddFile(level + 1, file_number, path_id, file_size, + GetInternalKey("181", smallest_seq), + GetInternalKey("798", largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction); + VersionEdit deletion_1; + deletion_1.DeleteFile(level + 1, file_number); + VersionEdit addition_2; + addition_2.AddFile(level + 2, file_number, path_id, file_size, + GetInternalKey("181", smallest_seq), + GetInternalKey("798", largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction); + ASSERT_OK(builder.Apply(&addition_1)); + ASSERT_OK(builder.Apply(&deletion_1)); + ASSERT_OK(builder.Apply(&addition_2)); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, &vstorage_, + force_consistency_checks); + ASSERT_OK(builder.SaveTo(&new_vstorage)); + ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level + 2); + // File movement should not change key estimation. + ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(), + new_vstorage.GetEstimatedActiveKeys()); + FileReferenceChecker checker; + ASSERT_TRUE(checker.Check(&vstorage_)); + ASSERT_TRUE(checker.Check(&new_vstorage)); + UnrefFilesInVersion(&new_vstorage); + } } TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { @@ -388,6 +483,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; + // Add an existing file. 
edit.AddFile(new_level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction); diff --git a/db/version_edit.h b/db/version_edit.h index 0d55a4827ba..13f3e8b9e98 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -86,12 +86,13 @@ struct FileSampledStats { mutable std::atomic num_reads_sampled; }; -struct FileMetaData { +class FileMetaData { + public: FileDescriptor fd; InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table - // Needs to be disposed when refs becomes 0. + // Needs to be disposed when refs_ becomes 0. Cache::Handle* table_reader_handle; FileSampledStats stats; @@ -109,9 +110,8 @@ struct FileMetaData { uint64_t raw_key_size; // total uncompressed key size. uint64_t raw_value_size; // total uncompressed value size. - int refs; // Reference count - bool being_compacted; // Is this file undergoing compaction? + int being_moved_to; // Is this file undergoing trivial move? bool init_stats_from_file; // true if the data-entry stats of this file // has initialized from file. @@ -125,10 +125,18 @@ struct FileMetaData { num_deletions(0), raw_key_size(0), raw_value_size(0), - refs(0), being_compacted(false), + being_moved_to(-1), init_stats_from_file(false), - marked_for_compaction(false) {} + marked_for_compaction(false), + refs_(0) {} + + void Ref() { ++refs_; } + + bool Unref() { + assert(refs_ > 0); + return --refs_ <= 0; + } // REQUIRED: Keys must be given to the function in sorted order (it expects // the last key to be the largest). 
@@ -173,6 +181,9 @@ struct FileMetaData { r.append("]"); return r; } + + private: + int refs_; // Reference count }; // A compressed copy of file meta data that just contain minimum data needed diff --git a/db/version_set.cc b/db/version_set.cc index c1df704625b..84c7041ec92 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -742,9 +742,7 @@ Version::~Version() { for (int level = 0; level < storage_info_.num_levels_; level++) { for (size_t i = 0; i < storage_info_.files_[level].size(); i++) { FileMetaData* f = storage_info_.files_[level][i]; - assert(f->refs > 0); - f->refs--; - if (f->refs <= 0) { + if (f->Unref()) { assert(cfd_ != nullptr); uint32_t path_id = f->fd.GetPathId(); assert(path_id < cfd_->ioptions()->cf_paths.size()); @@ -2297,6 +2295,7 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( namespace { uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, + int level, const std::vector& files) { uint32_t ttl_expired_files_count = 0; @@ -2305,7 +2304,8 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, if (status.ok()) { const uint64_t current_time = static_cast(_current_time); for (auto f : files) { - if (!f->being_compacted && f->fd.table_reader != nullptr && + if ((!f->being_compacted || f->being_moved_to == level) && + f->fd.table_reader != nullptr && f->fd.table_reader->GetTableProperties() != nullptr) { auto creation_time = f->fd.table_reader->GetTableProperties()->creation_time; @@ -2340,7 +2340,7 @@ void VersionStorageInfo::ComputeCompactionScore( int num_sorted_runs = 0; uint64_t total_size = 0; for (auto* f : files_[level]) { - if (!f->being_compacted) { + if (!f->being_compacted || f->being_moved_to == level) { total_size += f->compensated_file_size; num_sorted_runs++; } @@ -2350,7 +2350,8 @@ void VersionStorageInfo::ComputeCompactionScore( // compaction score for the whole DB. Adding other levels as if // they are L0 files. 
for (int i = 1; i < num_levels(); i++) { - if (!files_[i].empty() && !files_[i][0]->being_compacted) { + if (!files_[i].empty() && (!files_[i][0]->being_compacted || + files_[i][0]->being_moved_to == i)) { num_sorted_runs++; } } @@ -2366,10 +2367,10 @@ void VersionStorageInfo::ComputeCompactionScore( score); } if (mutable_cf_options.ttl > 0) { - score = std::max( - static_cast(GetExpiredTtlFilesCount( - immutable_cf_options, mutable_cf_options, files_[level])), - score); + score = std::max(static_cast(GetExpiredTtlFilesCount( + immutable_cf_options, mutable_cf_options, level, + files_[level])), + score); } } else { @@ -2388,7 +2389,7 @@ void VersionStorageInfo::ComputeCompactionScore( // Compute the ratio of current size to size limit. uint64_t level_bytes_no_compacting = 0; for (auto f : files_[level]) { - if (!f->being_compacted) { + if (!f->being_compacted || f->being_moved_to == level) { level_bytes_no_compacting += f->compensated_file_size; } } @@ -2441,7 +2442,8 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() { for (int level = 0; level <= last_qualify_level; level++) { for (auto* f : files_[level]) { - if (!f->being_compacted && f->marked_for_compaction) { + if ((!f->being_compacted || f->being_moved_to == level) && + f->marked_for_compaction) { files_marked_for_compaction_.emplace_back(level, f); } } @@ -2463,7 +2465,8 @@ void VersionStorageInfo::ComputeExpiredTtlFiles( for (int level = 0; level < num_levels() - 1; level++) { for (auto f : files_[level]) { - if (!f->being_compacted && f->fd.table_reader != nullptr && + if ((!f->being_compacted || f->being_moved_to == level) && + f->fd.table_reader != nullptr && f->fd.table_reader->GetTableProperties() != nullptr) { auto creation_time = f->fd.table_reader->GetTableProperties()->creation_time; @@ -2493,7 +2496,8 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( for (int level = 0; level < num_levels(); level++) { for (auto f : files_[level]) { - if (!f->being_compacted && 
f->fd.table_reader != nullptr && + if ((!f->being_compacted || f->being_moved_to == level) && + f->fd.table_reader != nullptr && f->fd.table_reader->GetTableProperties() != nullptr) { // Compute a file's modification time in the following order: // 1. Use file_creation_time table property if it is > 0. @@ -2545,13 +2549,13 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } // anonymous namespace void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { - auto* level_files = &files_[level]; + auto& level_files = files_[level]; // Must not overlap #ifndef NDEBUG - if (level > 0 && !level_files->empty() && + if (level > 0 && !level_files.empty() && internal_comparator_->Compare( - (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) { - auto* f2 = (*level_files)[level_files->size() - 1]; + level_files[level_files.size() - 1]->largest, f->smallest) >= 0) { + auto* f2 = level_files[level_files.size() - 1]; if (info_log != nullptr) { Error(info_log, "Adding new file %" PRIu64 " range (%s, %s) to level %d but overlapping " @@ -2567,14 +2571,15 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { #else (void)info_log; #endif - f->refs++; - level_files->push_back(f); + level_files.push_back(f); + + f->Ref(); const uint64_t file_number = f->fd.GetNumber(); assert(file_locations_.find(file_number) == file_locations_.end()); file_locations_.emplace(file_number, - FileLocation(level, level_files->size() - 1)); + FileLocation(level, level_files.size() - 1)); } // Version::PrepareApply() need to be called before calling the function, or @@ -2617,6 +2622,11 @@ void VersionStorageInfo::SetFinalized() { if (LevelFiles(level).size() > 0) { assert(level < num_non_empty_levels()); } + for (auto* f : LevelFiles(level)) { + if (f->being_moved_to == level) { + f->being_moved_to = -1; + } + } } assert(compaction_level_.size() > 0); assert(compaction_level_.size() == 
compaction_score_.size()); @@ -3366,11 +3376,13 @@ bool VersionStorageInfo::RangeMightExistAfterSortedRun( return false; } -void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < storage_info_.num_levels(); level++) { - const std::vector& files = storage_info_.files_[level]; - for (const auto& file : files) { - live->push_back(file->fd); +void Version::AddLiveFiles(std::vector* live_table_files) const { + assert(live_table_files); + for (int level = 0; level < storage_info_.num_levels(); ++level) { + const auto& level_files = storage_info_.LevelFiles(level); + for (const auto& meta : level_files) { + assert(meta); + live_table_files->emplace_back(meta->fd.GetNumber()); } } } @@ -5119,9 +5131,10 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, return result; } -void VersionSet::AddLiveFiles(std::vector* live_list) { +void VersionSet::AddLiveFiles(std::vector* live_table_files) const { + assert(live_table_files); // pre-calculate space requirement - int64_t total_files = 0; + int64_t total_table_files = 0; for (auto cfd : *column_family_set_) { if (!cfd->initialized()) { continue; @@ -5131,13 +5144,13 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { v = v->next_) { const auto* vstorage = v->storage_info(); for (int level = 0; level < vstorage->num_levels(); level++) { - total_files += vstorage->LevelFiles(level).size(); + total_table_files += vstorage->LevelFiles(level).size(); } } } // just one time extension to the right size - live_list->reserve(live_list->size() + static_cast(total_files)); + live_table_files->reserve(live_table_files->size() + total_table_files); for (auto cfd : *column_family_set_) { if (!cfd->initialized()) { @@ -5148,7 +5161,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - v->AddLiveFiles(live_list); + v->AddLiveFiles(live_table_files); if (v 
== current) { found_current = true; } @@ -5156,7 +5169,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { if (!found_current && current != nullptr) { // Should never happen unless it is a bug. assert(false); - current->AddLiveFiles(live_list); + current->AddLiveFiles(live_table_files); } } } diff --git a/db/version_set.h b/db/version_set.h index 88e2de6df7f..583080291a7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -309,6 +309,17 @@ class VersionStorageInfo { return it->second; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const { + auto location = GetFileLocation(file_number); + + if (!location.IsValid()) { + return nullptr; + } + + return files_[location.GetLevel()][location.GetPosition()]; + } + const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; @@ -646,7 +657,7 @@ class Version { bool Unref(); // Add all files listed in the current version to *live. - void AddLiveFiles(std::vector* live); + void AddLiveFiles(std::vector* live) const; // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false, bool print_stats = false) const; @@ -1031,7 +1042,7 @@ Status DumpManifest(Options& options, std::string& dscname, const EnvOptions& env_options_compactions); // Add all files listed in any live version to *live. - void AddLiveFiles(std::vector* live_list); + void AddLiveFiles(std::vector* live_list) const; // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). 
If end_level == -1 it will search diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 446ae94c720..d8dffaf00b1 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -118,7 +118,7 @@ class VersionStorageInfoTest : public testing::Test { ~VersionStorageInfoTest() override { for (int i = 0; i < vstorage_.num_levels(); i++) { for (auto* f : vstorage_.LevelFiles(i)) { - if (--f->refs == 0) { + if (f->Unref()) { delete f; } } @@ -133,7 +133,6 @@ class VersionStorageInfoTest : public testing::Test { f->smallest = GetInternalKey(smallest, 0); f->largest = GetInternalKey(largest, 0); f->compensated_file_size = file_size; - f->refs = 0; f->num_entries = 0; f->num_deletions = 0; vstorage_.AddFile(level, f); @@ -147,7 +146,6 @@ class VersionStorageInfoTest : public testing::Test { f->smallest = smallest; f->largest = largest; f->compensated_file_size = file_size; - f->refs = 0; f->num_entries = 0; f->num_deletions = 0; vstorage_.AddFile(level, f); @@ -409,7 +407,7 @@ TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); } -TEST_F(VersionStorageInfoTest, FileLocation) { +TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { Add(0, 11U, "1", "2", 5000U); Add(0, 12U, "1", "2", 5000U); @@ -417,13 +415,18 @@ TEST_F(VersionStorageInfoTest, FileLocation) { ASSERT_EQ(vstorage_.GetFileLocation(11U), VersionStorageInfo::FileLocation(0, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr); + ASSERT_EQ(vstorage_.GetFileLocation(12U), VersionStorageInfo::FileLocation(0, 1)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr); ASSERT_EQ(vstorage_.GetFileLocation(7U), VersionStorageInfo::FileLocation(2, 0)); + ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr); ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid()); + ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr); } class FindLevelFileTest : public testing::Test { From 
cf6ca227d491a88762dcb0b8293e9d160ba2b2f1 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 25 Aug 2021 13:49:19 +0800 Subject: [PATCH 12/16] fix mac build (#253) Partially picking https://github.com/facebook/rocksdb/pull/7776 Signed-off-by: tabokie --- port/jemalloc_helper.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index 86df80196d8..79bbb6d98c9 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -38,6 +38,24 @@ static inline bool HasJemalloc() { return true; } #else +// definitions for compatibility with older versions of jemalloc +#if !defined(JEMALLOC_ALLOCATOR) +#define JEMALLOC_ALLOCATOR +#endif +#if !defined(JEMALLOC_RESTRICT_RETURN) +#define JEMALLOC_RESTRICT_RETURN +#endif +#if !defined(JEMALLOC_NOTHROW) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#endif +#if !defined(JEMALLOC_ALLOC_SIZE) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#endif +#endif + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * From b071c44789fd21f3bce10bcf5e8c4224c7c0703b Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 25 Aug 2021 16:19:39 +0800 Subject: [PATCH 13/16] Do not tune down write-amp based rate limit when flush flow decreases (#234) Because foreground writes can be proactively restrained when compactions lag behind, added a minor algorithm change that avoids tuning down rate limit when foreground writes (flush flow) decrease. This will not affect the overall performance of `WriteAmpBasedRateLimiter`. 
Signed-off-by: tabokie --- .../write_amp_based_rate_limiter.cc | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/utilities/rate_limiters/write_amp_based_rate_limiter.cc b/utilities/rate_limiters/write_amp_based_rate_limiter.cc index 18c556db6ed..487f6feb51b 100644 --- a/utilities/rate_limiters/write_amp_based_rate_limiter.cc +++ b/utilities/rate_limiters/write_amp_based_rate_limiter.cc @@ -31,10 +31,11 @@ constexpr int kSecondsPerTune = 1; constexpr int kMillisPerTune = 1000 * kSecondsPerTune; constexpr int kMicrosPerTune = 1000 * 1000 * kSecondsPerTune; -// Two reasons for adding padding to baseline limit: -// 1. compaction cannot fully utilize the IO quota we set. -// 2. make it faster to digest unexpected burst of pending compaction bytes, -// generally this will help flatten IO waves. +// Due to the execution model of compaction, large waves of pending compactions +// could possibly be hidden behind a constant rate of I/O requests. It's then +// wise to raise the threshold slightly above estimation to ensure those +// pending compactions can contribute to the convergence of a new alternative +// threshold. // Padding is calculated through hyperbola based on empirical percentage of 10% // and special care for low-pressure domain. E.g. coordinates (5M, 18M) and // (10M, 16M) are on this curve. @@ -314,6 +315,12 @@ int64_t WriteAmpBasedRateLimiter::CalculateRefillBytesPerPeriod( } } +// The core function used to dynamically adjust the compaction rate limit, +// called **at most** once every `kSecondsPerTune`. +// I/O throughput threshold is automatically tuned based on history samples of +// compaction and flush flow. This algorithm excels by taking into account the +// limiter's inability to estimate the pressure of pending compactions, and the +// possibility of foreground write fluctuation. 
Status WriteAmpBasedRateLimiter::Tune() { // computed rate limit will be larger than 10MB/s const int64_t kMinBytesPerSec = 10 << 20; @@ -331,21 +338,33 @@ Status WriteAmpBasedRateLimiter::Tune() { int64_t prev_bytes_per_sec = GetBytesPerSecond(); - // Loop through the actual time slice to make sure bytes flow from long period - // of time is properly estimated when the compaction rate is low. + // This function can be called less frequently than we anticipate when + // compaction rate is low. Loop through the actual time slice to correct + // the estimation. for (uint32_t i = 0; i < duration_ms / kMillisPerTune; i++) { bytes_sampler_.AddSample(duration_bytes_through_ * 1000 / duration_ms); highpri_bytes_sampler_.AddSample(duration_highpri_bytes_through_ * 1000 / duration_ms); limit_bytes_sampler_.AddSample(prev_bytes_per_sec); } + int64_t new_bytes_per_sec = bytes_sampler_.GetFullValue(); int32_t ratio = std::max( kRatioLower, static_cast( bytes_sampler_.GetFullValue() * 10 / std::max(highpri_bytes_sampler_.GetFullValue(), kHighBytesLower))); - - // in case there are compaction bursts even when online writes are stable + // Only adjust threshold when foreground write (flush) flow increases, + // because a decrease could also be caused by manual flow control at + // application level to alleviate background pressure. + new_bytes_per_sec = std::max( + new_bytes_per_sec, + ratio * + std::max(highpri_bytes_sampler_.GetRecentValue(), kHighBytesLower) / + 10); + // Set the threshold higher to avoid write stalls caused by pending + // compactions. + int64_t padding = CalculatePadding(new_bytes_per_sec); + // Adjustment based on utilization.
int64_t util = bytes_sampler_.GetRecentValue() * 1000 / limit_bytes_sampler_.GetRecentValue(); if (util >= 995) { @@ -355,11 +374,7 @@ Status WriteAmpBasedRateLimiter::Tune() { percent_delta_ += 20; } else if (util >= 950) { } else if (percent_delta_ > 0) { percent_delta_ -= 1; } - - int64_t new_bytes_per_sec = - ratio * - std::max(highpri_bytes_sampler_.GetRecentValue(), kHighBytesLower) / 10; - int64_t padding = CalculatePadding(new_bytes_per_sec); + // React to pace-up requests when LSM is out of shape. if (critical_pace_up_.load(std::memory_order_relaxed)) { percent_delta_ = 150; critical_pace_up_.store(false, std::memory_order_relaxed); From d822f13d89fb2df384d71fb86c5a9123cb0d7c33 Mon Sep 17 00:00:00 2001 From: Wallace Date: Fri, 3 Sep 2021 14:20:26 +0800 Subject: [PATCH 14/16] fix version-check for ingested sst (#257) * fix check log seqno Signed-off-by: Little-Wallace --- db/version_builder.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index 467d16e12e7..bf95866e94b 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -172,7 +172,12 @@ class VersionBuilder::Rep { if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { // This is an external file that we ingested SequenceNumber external_file_seqno = f2->fd.smallest_seqno; - if (!(external_file_seqno < f1->fd.largest_seqno || + // If f1 and f2 are both ingested files, their seqnos may be the same. + // Because RocksDB will assign only one seqno for all files ingested + // in one call to `IngestExternalFile`.
+ if (!((f1->fd.smallest_seqno == f1->fd.largest_seqno && + f1->fd.smallest_seqno == external_file_seqno) || + external_file_seqno < f1->fd.largest_seqno || external_file_seqno == 0)) { fprintf(stderr, "L0 file with seqno %" PRIu64 " %" PRIu64 From 24213c4086d4daaf26aee29663c47438890e1a2e Mon Sep 17 00:00:00 2001 From: Xintao Date: Wed, 15 Sep 2021 12:38:26 +0800 Subject: [PATCH 15/16] Add resume interface for C api (#255) Signed-off-by: Xintao --- db/c.cc | 4 ++++ include/rocksdb/c.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/db/c.cc b/db/c.cc index 63fb575f855..51da66b8ba1 100644 --- a/db/c.cc +++ b/db/c.cc @@ -537,6 +537,10 @@ rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options, return result; } +void rocksdb_resume(rocksdb_t* db, char** errptr) { + SaveError(errptr, db->rep->Resume()); +} + rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr) { BackupEngine* be; diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 1db70697df1..03b003540bd 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -142,6 +142,8 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( const rocksdb_options_t* options, const char* name, const char* secondary_path, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_resume(rocksdb_t* db, char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); From f236fe443d06337ca2538414c62f70ba17b8e409 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 17 Sep 2021 15:33:15 +0800 Subject: [PATCH 16/16] disable ignore_unknown_options only for lower version (#256) Only enable force option check when reading options file from lower version. 
Signed-off-by: tabokie --- options/options_parser.cc | 6 +++--- options/options_test.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/options/options_parser.cc b/options/options_parser.cc index 0ae38602d5b..4b7f95d1287 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -244,11 +244,11 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, return s; } - // If the option file is not generated by a higher minor version, + // If the option file is generated by a lower minor version, // there shouldn't be any unknown option. if (ignore_unknown_options && section == kOptionSectionVersion) { - if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR && - db_version[1] <= ROCKSDB_MINOR)) { + if (db_version[0] < ROCKSDB_MAJOR || + (db_version[0] == ROCKSDB_MAJOR && db_version[1] < ROCKSDB_MINOR)) { ignore_unknown_options = false; } } diff --git a/options/options_test.cc b/options/options_test.cc index 05ea766f6a6..cf0e8ae1d03 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -1307,7 +1307,7 @@ TEST_F(OptionsParserTest, IgnoreUnknownOptions) { bool should_ignore = true; if (case_id == 0) { // same version - should_ignore = false; + should_ignore = true; version_string = ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) + ".0"; } else if (case_id == 1) {