diff --git a/nano/core_test/toml.cpp b/nano/core_test/toml.cpp
index ce861da1d8..61c2b7fe02 100644
--- a/nano/core_test/toml.cpp
+++ b/nano/core_test/toml.cpp
@@ -239,7 +239,7 @@ TEST (toml, daemon_config_deserialize_defaults)
 	ASSERT_EQ (conf.node.lmdb_config.map_size, defaults.node.lmdb_config.map_size);
 
 	ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
-	ASSERT_EQ (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
+	ASSERT_EQ (conf.node.rocksdb_config.cache_size, defaults.node.rocksdb_config.cache_size);
 	ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
 
 	ASSERT_EQ (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
@@ -573,7 +573,7 @@ TEST (toml, daemon_config_deserialize_no_defaults)
 
 	[node.rocksdb]
 	enable = true
-	memory_multiplier = 3
+	cache_size = 3
 	io_threads = 99
 
 	[node.experimental]
@@ -743,7 +743,7 @@ TEST (toml, daemon_config_deserialize_no_defaults)
 
 	ASSERT_TRUE (conf.node.rocksdb_config.enable);
 	ASSERT_EQ (nano::rocksdb_config::using_rocksdb_in_tests (), defaults.node.rocksdb_config.enable);
-	ASSERT_NE (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
+	ASSERT_NE (conf.node.rocksdb_config.cache_size, defaults.node.rocksdb_config.cache_size);
 	ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
 
 	ASSERT_NE (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
diff --git a/nano/lib/rocksdbconfig.cpp b/nano/lib/rocksdbconfig.cpp
index a12605d1f7..4c9a232b1c 100644
--- a/nano/lib/rocksdbconfig.cpp
+++ b/nano/lib/rocksdbconfig.cpp
@@ -5,7 +5,7 @@ nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
 {
 	toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database.\ntype:bool");
-	toml.put ("memory_multiplier", memory_multiplier, "This will modify how much memory is used represented by 1 (low), 2 (medium), 3 (high). Default is 2.\ntype:uint8");
+	toml.put ("cache_size", cache_size, "Amount of memory in MB used for caching for each table. Valid values are from 1 to 1024. Default is 64.\ntype:uint16");
 	toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing.\ntype:uint32");
 	return toml.get_error ();
 }
@@ -13,7 +13,7 @@ nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
 nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
 {
 	toml.get_optional ("enable", enable);
-	toml.get_optional ("memory_multiplier", memory_multiplier);
+	toml.get_optional ("cache_size", cache_size);
 	toml.get_optional ("io_threads", io_threads);
 
 	// Validate ranges
@@ -21,9 +21,9 @@ nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
 	{
 		toml.get_error ().set ("io_threads must be non-zero");
 	}
-	if (memory_multiplier < 1 || memory_multiplier > 3)
+	if (cache_size < 1 || cache_size > 1024)
 	{
-		toml.get_error ().set ("memory_multiplier must be either 1, 2 or 3");
+		toml.get_error ().set ("cache_size must be between 1 and 1024 MB");
 	}
 
 	return toml.get_error ();
diff --git a/nano/lib/rocksdbconfig.hpp b/nano/lib/rocksdbconfig.hpp
index 232d320193..c2d178cfff 100644
--- a/nano/lib/rocksdbconfig.hpp
+++ b/nano/lib/rocksdbconfig.hpp
@@ -25,7 +25,7 @@ class rocksdb_config final
 	static bool using_rocksdb_in_tests ();
 
 	bool enable{ false };
-	uint8_t memory_multiplier{ 2 };
+	uint16_t cache_size{ 64 };
 	unsigned io_threads{ std::max (nano::hardware_concurrency () / 2, 1u) };
 };
 }
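For anyone wanting to poke at the new validation without spinning up a node, here is a minimal harness along the lines of the existing toml tests. This is a hypothetical standalone snippet, assuming the usual nano::tomlconfig stream read and nano::error::get_message helpers:

#include <nano/lib/rocksdbconfig.hpp>
#include <nano/lib/tomlconfig.hpp>

#include <iostream>
#include <sstream>

int main ()
{
	// 2048 MB exceeds the new 1024 MB cap, so deserialize_toml sets an error
	std::stringstream ss;
	ss << "enable = true\ncache_size = 2048\n";
	nano::tomlconfig toml;
	toml.read (ss);
	nano::rocksdb_config config;
	config.deserialize_toml (toml);
	// Expected output: cache_size must be between 1 and 1024 MB
	std::cout << toml.get_error ().get_message () << std::endl;
}

Note that cache_size is a uint16_t, so the out-of-range value above still parses; it is the explicit range check that rejects it.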
diff --git a/nano/store/rocksdb/rocksdb.cpp b/nano/store/rocksdb/rocksdb.cpp
index 60f7e95cee..39730dea84 100644
--- a/nano/store/rocksdb/rocksdb.cpp
+++ b/nano/store/rocksdb/rocksdb.cpp
@@ -64,7 +64,7 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
 	logger{ logger_a },
 	constants{ constants },
 	rocksdb_config{ rocksdb_config_a },
-	max_block_write_batch_num_m{ nano::narrow_cast<unsigned> (blocks_memtable_size_bytes () / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
+	max_block_write_batch_num_m{ calculate_max_block_write_batch_num () },
 	cf_name_table_map{ create_cf_name_table_map () }
 {
 	boost::system::error_code error_mkdir, error_chmod;
@@ -80,7 +80,6 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
 	debug_assert (path_a.filename () == "rocksdb");
 
 	generate_tombstone_map ();
-	small_table_factory.reset (::rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));
 
 	// TODO: get_db_options () registers a listener for resetting tombstones, needs to check if it is a problem calling it more than once.
 	auto options = get_db_options ();
@@ -400,118 +399,13 @@ void nano::store::rocksdb::component::generate_tombstone_map ()
 	tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
 }
 
-rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
-{
-	::rocksdb::ColumnFamilyOptions cf_options;
-	cf_options.table_factory = table_factory_a;
-
-	// (1 active, 1 inactive)
-	auto num_memtables = 2;
-
-	// Each level is a multiple of the above. If L1 is 512MB. L2 will be 512 * 8 = 2GB. L3 will be 2GB * 8 = 16GB, and so on...
-	cf_options.max_bytes_for_level_multiplier = 8;
-
-	// Although this should be the default provided by RocksDB, not setting this is causing sequence conflict checks if not using
-	cf_options.max_write_buffer_size_to_maintain = memtable_size_bytes_a * num_memtables;
-
-	// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
-	cf_options.ttl = 1 * 24 * 60 * 60;
-
-	// Multiplier for each level
-	cf_options.target_file_size_multiplier = 10;
-
-	// Size of level 1 sst files
-	cf_options.target_file_size_base = memtable_size_bytes_a;
-
-	// Size of each memtable
-	cf_options.write_buffer_size = memtable_size_bytes_a;
-
-	// Number of memtables to keep in memory
-	cf_options.max_write_buffer_number = num_memtables;
-
-	return cf_options;
-}
-
 rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_cf_options (std::string const & cf_name_a) const
 {
 	::rocksdb::ColumnFamilyOptions cf_options;
-	auto const memtable_size_bytes = base_memtable_size_bytes ();
-	auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
-	if (cf_name_a == "blocks")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
-		cf_options = get_active_cf_options (table_factory, blocks_memtable_size_bytes ());
-	}
-	else if (cf_name_a == "confirmation_height")
-	{
-		// Entries will not be deleted in the normal case, so can make memtables a lot bigger
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
-	}
-	else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
-	{
-		// Meta - It contains just version key
-		// Online weight - Periodically deleted
-		// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
-		cf_options = get_small_cf_options (small_table_factory);
-	}
-	else if (cf_name_a == "cached_counts")
-	{
-		// Really small (keys are blocks tables, value is uint64_t)
-		cf_options = get_small_cf_options (small_table_factory);
-	}
-	else if (cf_name_a == "pending")
+	if (cf_name_a != ::rocksdb::kDefaultColumnFamilyName)
 	{
-		// Pending can have a lot of deletions too
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-
-		// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-		cf_options.level0_file_num_compaction_trigger = 2;
-
-		// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
-		cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
-	}
-	else if (cf_name_a == "frontiers")
-	{
-		// Frontiers is only needed during bootstrap for legacy blocks
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "accounts")
-	{
-		// Can have deletions from rollbacks
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "vote")
-	{
-		// No deletes it seems, only overwrites.
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "pruned")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "final_votes")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "rep_weights")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == ::rocksdb::kDefaultColumnFamilyName)
-	{
-		// Do nothing.
-	}
-	else
-	{
-		debug_assert (false);
+		cf_options.table_factory = std::shared_ptr<::rocksdb::TableFactory> (::rocksdb::NewBlockBasedTableFactory (get_active_table_options ()));
+		cf_options.write_buffer_size = memtable_size_bytes;
 	}
 
 	return cf_options;
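One consequence worth spelling out: every non-default column family now gets its own table factory, and get_active_table_options () constructs a fresh LRU cache on each call, so each family carries its own cache_size MB block cache plus a 64 MB write buffer. A back-of-the-envelope budget at the defaults, counting the 13 families named in the removed branches (illustrative arithmetic only, not part of the patch):

// blocks, confirmation_height, meta, online_weight, peers, cached_counts,
// pending, frontiers, accounts, vote, pruned, final_votes, rep_weights
constexpr unsigned long long mb = 1024ULL * 1024;
constexpr unsigned long long num_column_families = 13;
constexpr unsigned long long cache_size = 64; // MB, per-family block cache (config default)
constexpr unsigned long long block_cache_total = num_column_families * cache_size * mb; // 832 MB
constexpr unsigned long long write_buffer_total = num_column_families * 64 * mb; // 832 MB, one active memtable each
static_assert (block_cache_total == 832 * mb);
static_assert (write_buffer_total == 832 * mb);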
@@ -836,6 +730,8 @@ int nano::store::rocksdb::component::clear (::rocksdb::ColumnFamilyHandle * colu
 	::rocksdb::ReadOptions read_options;
 	::rocksdb::WriteOptions write_options;
 	::rocksdb::WriteBatch write_batch;
+	read_options.readahead_size = 0; // Readahead only adds overhead on SSD drives
+
 	std::unique_ptr<::rocksdb::Iterator> it (db->NewIterator (read_options, column_family));
 
 	for (it->SeekToFirst (); it->Valid (); it->Next ())
@@ -862,32 +758,10 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
 	::rocksdb::Options db_options;
 	db_options.create_if_missing = true;
 	db_options.create_missing_column_families = true;
-
-	// TODO: review if this should be changed due to the unchecked table removal.
-	// Enable whole key bloom filter in memtables for ones with memtable_prefix_bloom_size_ratio set (unchecked table currently).
-	// It can potentially reduce CPU usage for point-look-ups.
-	db_options.memtable_whole_key_filtering = true;
-
-	// Sets the compaction priority
-	db_options.compaction_pri = ::rocksdb::CompactionPri::kMinOverlappingRatio;
-
-	// Start aggressively flushing WAL files when they reach over 1GB
-	db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;
-
-	// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
+	// Set number of threads to use
 	db_options.IncreaseParallelism (rocksdb_config.io_threads);
+	// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
 	db_options.OptimizeLevelStyleCompaction ();
-
-	// Adds a separate write queue for memtable/WAL
-	db_options.enable_pipelined_write = true;
-
-	// Default is 16, setting to -1 allows faster startup times for SSDs by allowings more files to be read in parallel.
-	db_options.max_file_opening_threads = -1;
-
-	// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
-	// Default is 1GB, lowering this to avoid replaying for too long (100MB)
-	db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;
-
 	// Not compressing any SST files for compatibility reasons.
 	db_options.compression = ::rocksdb::kNoCompression;
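The only tuning retained here is thread parallelism, driven by io_threads from rocksdbconfig.hpp. For reference, a sketch of how the default is derived (hypothetical standalone snippet; the node itself uses the nano::hardware_concurrency () wrapper with the same formula):

#include <algorithm>
#include <thread>

// Default io_threads, mirroring rocksdbconfig.hpp: half the hardware threads,
// never less than one. E.g. 8 hardware threads -> 4 io_threads, which
// get_db_options () then hands to db_options.IncreaseParallelism ().
unsigned default_io_threads ()
{
	return std::max (std::thread::hardware_concurrency () / 2, 1u);
}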
@@ -899,75 +773,24 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
 	return db_options;
 }
 
-rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_active_table_options (std::size_t lru_size) const
+rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_active_table_options () const
 {
 	::rocksdb::BlockBasedTableOptions table_options;
 
 	// Improve point lookup performance be using the data block hash index (uses about 5% more space).
 	table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
-	table_options.data_block_hash_table_util_ratio = 0.75;
 
 	// Using format_version=4 significantly reduces the index block size, in some cases around 4-5x.
 	// This frees more space in block cache, which would result in higher hit rate for data and filter blocks,
 	// or offer the same performance with a smaller block cache size.
 	table_options.format_version = 4;
-	table_options.index_block_restart_interval = 16;
 
 	// Block cache for reads
-	table_options.block_cache = ::rocksdb::NewLRUCache (lru_size);
-
-	// Bloom filter to help with point reads. 10bits gives 1% false positive rate.
-	table_options.filter_policy.reset (::rocksdb::NewBloomFilterPolicy (10, false));
-
-	// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
-	table_options.block_size = 16 * 1024ULL;
-
-	// Whether level 0 index and filter blocks are stored in block_cache
-	table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+	table_options.block_cache = ::rocksdb::NewLRUCache (1024ULL * 1024 * rocksdb_config.cache_size);
 
 	return table_options;
 }
 
-rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_small_table_options () const
-{
-	::rocksdb::BlockBasedTableOptions table_options;
-	// Improve point lookup performance be using the data block hash index (uses about 5% more space).
-	table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
-	table_options.data_block_hash_table_util_ratio = 0.75;
-	table_options.block_size = 1024ULL;
-	return table_options;
-}
-
-rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const
-{
-	auto const memtable_size_bytes = 10000;
-	auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes);
-
-	// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-	cf_options.level0_file_num_compaction_trigger = 1;
-
-	// L1 size, compaction is triggered for L0 at this size (1 SST file in L1)
-	cf_options.max_bytes_for_level_base = memtable_size_bytes;
-
-	return cf_options;
-}
-
-::rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
-{
-	auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes_a);
-
-	// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-	cf_options.level0_file_num_compaction_trigger = 4;
-
-	// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
-	cf_options.max_bytes_for_level_base = memtable_size_bytes_a * 4;
-
-	// Size target of levels are changed dynamically based on size of the last level
-	cf_options.level_compaction_dynamic_level_bytes = true;
-
-	return cf_options;
-}
-
 void nano::store::rocksdb::component::on_flush (::rocksdb::FlushJobInfo const & flush_job_info_a)
 {
 	// Reset appropriate tombstone counters
@@ -1109,22 +932,19 @@ void nano::store::rocksdb::component::serialize_memory_stats (boost::property_tr
 	json.put ("block-cache-usage", val);
 }
 
-unsigned long long nano::store::rocksdb::component::blocks_memtable_size_bytes () const
-{
-	return base_memtable_size_bytes ();
-}
-
-unsigned long long nano::store::rocksdb::component::base_memtable_size_bytes () const
-{
-	return 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
-}
-
 // This is a ratio of the blocks memtable size to keep total write transaction commit size down.
 unsigned nano::store::rocksdb::component::max_block_write_batch_num () const
 {
 	return max_block_write_batch_num_m;
 }
 
+unsigned nano::store::rocksdb::component::calculate_max_block_write_batch_num () const
+{
+	// Calculates the max write batch size from the memtable size (write buffer) and the size of a state block.
+	// With the 64 MB memtable this gives 251344 as the max block write batch.
+	return nano::narrow_cast<unsigned> (memtable_size_bytes / (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)));
+}
+
 std::string nano::store::rocksdb::component::error_string (int status) const
 {
 	return std::to_string (status);
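For reviewers checking the arithmetic above: one batched entry costs sizeof (nano::block_type) plus the serialized state block plus its sideband, 267 bytes in total (the per-field split below is inferred from that total; illustrative sketch, not PR code):

// One batched block entry: 1 (block type tag) + 216 (state block) + 50 (state sideband) = 267 bytes
constexpr unsigned long long memtable_size_bytes = 1024ULL * 1024 * 64;
constexpr unsigned long long bytes_per_entry = 1 + 216 + 50;

static_assert (memtable_size_bytes / bytes_per_entry == 251344);
// For comparison, a 32 MB memtable would give 33554432 / 267 = 125672.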
diff --git a/nano/store/rocksdb/rocksdb.hpp b/nano/store/rocksdb/rocksdb.hpp
index 5d8b22bba7..ea91e8e1b8 100644
--- a/nano/store/rocksdb/rocksdb.hpp
+++ b/nano/store/rocksdb/rocksdb.hpp
@@ -108,10 +108,10 @@ class component : public nano::store::component
 	::rocksdb::TransactionDB * transaction_db = nullptr;
 	std::unique_ptr<::rocksdb::DB> db;
 	std::vector<std::unique_ptr<::rocksdb::ColumnFamilyHandle>> handles;
-	std::shared_ptr<::rocksdb::TableFactory> small_table_factory;
 	std::unordered_map<nano::tables, nano::mutex> write_lock_mutexes;
 	nano::rocksdb_config rocksdb_config;
 	unsigned const max_block_write_batch_num_m;
+	unsigned calculate_max_block_write_batch_num () const;
 
 	class tombstone_info
 	{
@@ -155,25 +155,20 @@ class component : public nano::store::component
 	void construct_column_family_mutexes ();
 	::rocksdb::Options get_db_options ();
-	::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
-	::rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
-	::rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
-	::rocksdb::BlockBasedTableOptions get_active_table_options (std::size_t lru_size) const;
-	::rocksdb::BlockBasedTableOptions get_small_table_options () const;
+	::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
+	::rocksdb::BlockBasedTableOptions get_active_table_options () const;
 	::rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;
 	void on_flush (::rocksdb::FlushJobInfo const &);
 	void flush_table (nano::tables table_a);
 	void flush_tombstones_check (nano::tables table_a);
 	void generate_tombstone_map ();
+	std::unordered_map<char const *, nano::tables> create_cf_name_table_map () const;
 	std::vector<::rocksdb::ColumnFamilyDescriptor> create_column_families ();
-	unsigned long long base_memtable_size_bytes () const;
-	unsigned long long blocks_memtable_size_bytes () const;
 
-	constexpr static int base_memtable_size = 16;
-	constexpr static int base_block_cache_size = 8;
+	constexpr static int memtable_size_bytes = 1024ULL * 1024 * 64; // 64 MB write buffer
 
 	friend class nano::rocksdb_block_store_tombstone_count_Test;
 	friend class rocksdb_block_store_upgrade_v21_v22_Test;