Merge pull request #4730 from RickiNano/Rocksdb-overhaul

Rocksdb overhaul and clean up

clemahieu committed Sep 25, 2024
2 parents 5c306a0 + 3f089a2 commit 0d7b1de
Showing 5 changed files with 36 additions and 215 deletions.
9 changes: 6 additions & 3 deletions nano/core_test/toml.cpp
@@ -241,8 +241,9 @@ TEST (toml, daemon_config_deserialize_defaults)
ASSERT_EQ (conf.node.lmdb_config.map_size, defaults.node.lmdb_config.map_size);

ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_EQ (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
ASSERT_EQ (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);
ASSERT_EQ (conf.node.rocksdb_config.write_cache, defaults.node.rocksdb_config.write_cache);

ASSERT_EQ (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
ASSERT_EQ (conf.node.optimistic_scheduler.gap_threshold, defaults.node.optimistic_scheduler.gap_threshold);
@@ -578,8 +579,9 @@ TEST (toml, daemon_config_deserialize_no_defaults)
[node.rocksdb]
enable = true
memory_multiplier = 3
io_threads = 99
read_cache = 99
write_cache = 99
[node.experimental]
secondary_work_peers = ["dev.org:998"]
@@ -748,8 +750,9 @@ TEST (toml, daemon_config_deserialize_no_defaults)

ASSERT_TRUE (conf.node.rocksdb_config.enable);
ASSERT_EQ (nano::rocksdb_config::using_rocksdb_in_tests (), defaults.node.rocksdb_config.enable);
ASSERT_NE (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
ASSERT_NE (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);
ASSERT_NE (conf.node.rocksdb_config.write_cache, defaults.node.rocksdb_config.write_cache);

ASSERT_NE (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
ASSERT_NE (conf.node.optimistic_scheduler.gap_threshold, defaults.node.optimistic_scheduler.gap_threshold);
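These deserialization tests follow a round-trip pattern: feed in a TOML stanza with every value off its default, parse it, then assert each field changed. A compressed sketch of that pattern (setup abbreviated; names are taken from the surrounding tests, not verified against this revision):

#include <sstream>

// Sketch only: conf and defaults are the daemon configs the test fixture builds.
std::stringstream ss;
ss << "[node.rocksdb]\n"
	  "enable = true\n"
	  "io_threads = 99\n"
	  "read_cache = 99\n"
	  "write_cache = 99\n";
nano::tomlconfig toml;
toml.read (ss); // parse the stanza
conf.deserialize_toml (toml);
// Every parsed value should now differ from its default, e.g.:
// ASSERT_NE (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);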
17 changes: 13 additions & 4 deletions nano/lib/rocksdbconfig.cpp
@@ -5,25 +5,34 @@
nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
{
toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database.\ntype:bool");
toml.put ("memory_multiplier", memory_multiplier, "This will modify how much memory is used represented by 1 (low), 2 (medium), 3 (high). Default is 2.\ntype:uint8");
toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing.\ntype:uint32");
toml.put ("read_cache", read_cache, "Amount of megabytes per table allocated to read cache. Valid range is 1 - 1024. Default is 32.\nCarefully monitor memory usage if non-default values are used\ntype:long");
toml.put ("write_cache", write_cache, "Total amount of megabytes allocated to write cache. Valid range is 1 - 256. Default is 64.\nCarefully monitor memory usage if non-default values are used\ntype:long");

return toml.get_error ();
}

nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
{
toml.get_optional<bool> ("enable", enable);
toml.get_optional<uint8_t> ("memory_multiplier", memory_multiplier);
toml.get_optional<unsigned> ("io_threads", io_threads);
toml.get_optional<long> ("read_cache", read_cache);
toml.get_optional<long> ("write_cache", write_cache);

// Validate ranges
if (io_threads == 0)
{
toml.get_error ().set ("io_threads must be non-zero");
}
if (memory_multiplier < 1 || memory_multiplier > 3)
{
toml.get_error ().set ("memory_multiplier must be either 1, 2 or 3");
}

if (read_cache < 1 || read_cache > 1024)
{
toml.get_error ().set ("read_cache must be between 1 and 1024 MB");
}

if (write_cache < 1 || write_cache > 256)
{
toml.get_error ().set ("write_cache must be between 1 and 256 MB");
}

return toml.get_error ();
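The range checks above are the heart of the new config surface. A standalone sketch of the same validation, decoupled from nano::tomlconfig (the function name and signature here are illustrative, not nano's API):

#include <optional>
#include <string>

// Mirrors the checks in deserialize_toml: io_threads non-zero,
// read_cache within 1-1024 MB, write_cache within 1-256 MB.
std::optional<std::string> validate_rocksdb_limits (unsigned io_threads, long read_cache, long write_cache)
{
	if (io_threads == 0)
		return "io_threads must be non-zero";
	if (read_cache < 1 || read_cache > 1024)
		return "read_cache must be between 1 and 1024 MB";
	if (write_cache < 1 || write_cache > 256)
		return "write_cache must be between 1 and 256 MB";
	return std::nullopt; // all values in range
}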
3 changes: 2 additions & 1 deletion nano/lib/rocksdbconfig.hpp
@@ -25,7 +25,8 @@ class rocksdb_config final
static bool using_rocksdb_in_tests ();

bool enable{ false };
uint8_t memory_multiplier{ 2 };
unsigned io_threads{ std::max (nano::hardware_concurrency () / 2, 1u) };
long read_cache{ 32 };
long write_cache{ 64 };
};
}
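The new defaults replace the old memory_multiplier scaling with fixed figures: a 32 MB block cache for reads, a 64 MB write buffer, and half the hardware threads for background I/O. A quick sketch of the resulting values, assuming nano::hardware_concurrency () behaves like std::thread::hardware_concurrency ():

#include <algorithm>
#include <cstdio>
#include <thread>

int main ()
{
	// Defaults as declared above, converted to runtime values.
	unsigned io_threads = std::max (std::thread::hardware_concurrency () / 2, 1u);
	unsigned long long read_cache_bytes = 32ULL * 1024 * 1024; // block cache
	unsigned long long write_cache_bytes = 64ULL * 1024 * 1024; // memtable write buffer
	std::printf ("io_threads=%u read_cache=%llu write_cache=%llu\n", io_threads, read_cache_bytes, write_cache_bytes);
}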
210 changes: 14 additions & 196 deletions nano/store/rocksdb/rocksdb.cpp
@@ -64,7 +64,7 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
logger{ logger_a },
constants{ constants },
rocksdb_config{ rocksdb_config_a },
max_block_write_batch_num_m{ nano::narrow_cast<unsigned> (blocks_memtable_size_bytes () / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
max_block_write_batch_num_m{ nano::narrow_cast<unsigned> ((rocksdb_config_a.write_cache * 1024 * 1024) / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
cf_name_table_map{ create_cf_name_table_map () }
{
boost::system::error_code error_mkdir, error_chmod;
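The initializer now derives the write batch cap from the configured write cache instead of the removed memtable helpers. Back-of-envelope, with placeholder entry sizes (the real figures come from sizeof (nano::block_type), nano::state_block::size and nano::block_sideband::size):

// Assumed sizes, for illustration only.
constexpr unsigned long long write_cache_bytes = 64ULL * 1024 * 1024; // default write_cache of 64 MB
constexpr unsigned long long entry_bytes = 1 /* block type tag */ + 216 /* state block, assumed */ + 250 /* sideband, assumed */;
constexpr unsigned long long max_batch = write_cache_bytes / (2 * entry_bytes); // roughly 72k blocks per batch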
@@ -80,7 +80,6 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
debug_assert (path_a.filename () == "rocksdb");

generate_tombstone_map ();
small_table_factory.reset (::rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));

// TODO: get_db_options () registers a listener for resetting tombstones, needs to check if it is a problem calling it more than once.
auto options = get_db_options ();
@@ -400,120 +399,16 @@ void nano::store::rocksdb::component::generate_tombstone_map ()
tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
}

rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
{
::rocksdb::ColumnFamilyOptions cf_options;
cf_options.table_factory = table_factory_a;

// (1 active, 1 inactive)
auto num_memtables = 2;

// Each level is a multiple of the above. If L1 is 512MB, L2 will be 512 * 8 = 4GB. L3 will be 4GB * 8 = 32GB, and so on...
cf_options.max_bytes_for_level_multiplier = 8;

// Although this should be the default provided by RocksDB, not setting this is causing sequence conflict checks if not using
cf_options.max_write_buffer_size_to_maintain = memtable_size_bytes_a * num_memtables;

// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
cf_options.ttl = 1 * 24 * 60 * 60;

// Multiplier for each level
cf_options.target_file_size_multiplier = 10;

// Size of level 1 sst files
cf_options.target_file_size_base = memtable_size_bytes_a;

// Size of each memtable
cf_options.write_buffer_size = memtable_size_bytes_a;

// Number of memtables to keep in memory
cf_options.max_write_buffer_number = num_memtables;

return cf_options;
}

rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_cf_options (std::string const & cf_name_a) const
{
::rocksdb::ColumnFamilyOptions cf_options;
auto const memtable_size_bytes = base_memtable_size_bytes ();
auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
if (cf_name_a == "blocks")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
cf_options = get_active_cf_options (table_factory, blocks_memtable_size_bytes ());
}
else if (cf_name_a == "confirmation_height")
{
// Entries will not be deleted in the normal case, so can make memtables a lot bigger
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
}
else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
{
// Meta - It contains just version key
// Online weight - Periodically deleted
// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
cf_options = get_small_cf_options (small_table_factory);
}
else if (cf_name_a == "cached_counts")
{
// Really small (keys are blocks tables, value is uint64_t)
cf_options = get_small_cf_options (small_table_factory);
}
else if (cf_name_a == "pending")
{
// Pending can have a lot of deletions too
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 2;

// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
}
else if (cf_name_a == "frontiers")
{
// Frontiers is only needed during bootstrap for legacy blocks
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "accounts")
{
// Can have deletions from rollbacks
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "vote")
{
// No deletes it seems, only overwrites.
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "pruned")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "final_votes")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "rep_weights")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == ::rocksdb::kDefaultColumnFamilyName)
{
// Do nothing.
}
else
{
debug_assert (false);
}
if (cf_name_a != ::rocksdb::kDefaultColumnFamilyName)
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_table_options ()));
cf_options.table_factory = table_factory;
// Size of each memtable (write buffer for this column family)
cf_options.write_buffer_size = rocksdb_config.write_cache * 1024 * 1024;
}

return cf_options;
}
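Where the deleted branches tuned caches and memtables per column family, every non-default family now shares one table factory and one write-buffer size. A minimal standalone sketch of that shape against RocksDB's public API (helper name illustrative):

#include <rocksdb/options.h>
#include <rocksdb/table.h>

rocksdb::ColumnFamilyOptions make_cf_options (rocksdb::BlockBasedTableOptions const & table_options, long write_cache_mb)
{
	rocksdb::ColumnFamilyOptions cf_options;
	// Same factory for every column family; write buffer sized from config.
	cf_options.table_factory.reset (rocksdb::NewBlockBasedTableFactory (table_options));
	cf_options.write_buffer_size = static_cast<size_t> (write_cache_mb) * 1024 * 1024;
	return cf_options;
}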

@@ -863,30 +758,11 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
db_options.create_if_missing = true;
db_options.create_missing_column_families = true;

// TODO: review if this should be changed due to the unchecked table removal.
// Enable whole key bloom filter in memtables for ones with memtable_prefix_bloom_size_ratio set (unchecked table currently).
// It can potentially reduce CPU usage for point-look-ups.
db_options.memtable_whole_key_filtering = true;

// Sets the compaction priority
db_options.compaction_pri = ::rocksdb::CompactionPri::kMinOverlappingRatio;

// Start aggressively flushing WAL files when they reach over 1GB
db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;

// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
db_options.IncreaseParallelism (rocksdb_config.io_threads);
db_options.OptimizeLevelStyleCompaction ();

// Adds a separate write queue for memtable/WAL
db_options.enable_pipelined_write = true;

// Default is 16, setting to -1 allows faster startup times for SSDs by allowing more files to be read in parallel.
db_options.max_file_opening_threads = -1;

// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
// Default is 1GB, lowering this to avoid replaying for too long (100MB)
db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;
// Set max number of threads
db_options.IncreaseParallelism (rocksdb_config.io_threads);

// Not compressing any SST files for compatibility reasons.
db_options.compression = ::rocksdb::kNoCompression;
Expand All @@ -899,75 +775,27 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
return db_options;
}
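With the hand tuning removed, IncreaseParallelism is the main remaining knob. Per RocksDB's documented behaviour it is roughly equivalent to the following manual settings (a sketch, not code from this PR):

#include <rocksdb/env.h>
#include <rocksdb/options.h>

void increase_parallelism_equivalent (rocksdb::Options & options, int io_threads)
{
	options.max_background_jobs = io_threads; // shared budget for flushes and compactions
	options.env->SetBackgroundThreads (io_threads, rocksdb::Env::Priority::LOW); // compaction pool
	options.env->SetBackgroundThreads (1, rocksdb::Env::Priority::HIGH); // flush pool
}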

rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_active_table_options (std::size_t lru_size) const
rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_table_options () const
{
::rocksdb::BlockBasedTableOptions table_options;

// Improve point lookup performance by using the data block hash index (uses about 5% more space).
table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
table_options.data_block_hash_table_util_ratio = 0.75;

// Using format_version=4 significantly reduces the index block size, in some cases around 4-5x.
// This frees more space in block cache, which would result in higher hit rate for data and filter blocks,
// or offer the same performance with a smaller block cache size.
table_options.format_version = 4;
table_options.index_block_restart_interval = 16;
// Using storage format_version 5.
// Version 5 offers improved read speed, caching and better compression (if enabled)
// Any existing ledger data in version 4 will not be migrated. New data will be written in version 5.
table_options.format_version = 5;

// Block cache for reads
table_options.block_cache = ::rocksdb::NewLRUCache (lru_size);
table_options.block_cache = ::rocksdb::NewLRUCache (rocksdb_config.read_cache * 1024 * 1024);

// Bloom filter to help with point reads. 10bits gives 1% false positive rate.
table_options.filter_policy.reset (::rocksdb::NewBloomFilterPolicy (10, false));

// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
table_options.block_size = 16 * 1024ULL;

// Whether level 0 index and filter blocks are stored in block_cache
table_options.pin_l0_filter_and_index_blocks_in_cache = true;

return table_options;
}
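All read-side tuning now funnels through this one function, with read_cache feeding the LRU block cache directly. A self-contained sketch of the equivalent setup against RocksDB's public API (function name illustrative):

#include <rocksdb/cache.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/table.h>

rocksdb::BlockBasedTableOptions make_table_options (long read_cache_mb)
{
	rocksdb::BlockBasedTableOptions table_options;
	table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
	table_options.data_block_hash_table_util_ratio = 0.75;
	table_options.format_version = 5; // newer SST format; existing version-4 files stay readable
	table_options.block_cache = rocksdb::NewLRUCache (static_cast<size_t> (read_cache_mb) * 1024 * 1024);
	table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false)); // ~1% false positives on point reads
	table_options.block_size = 16 * 1024;
	table_options.pin_l0_filter_and_index_blocks_in_cache = true;
	return table_options;
}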

rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_small_table_options () const
{
::rocksdb::BlockBasedTableOptions table_options;
// Improve point lookup performance by using the data block hash index (uses about 5% more space).
table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
table_options.data_block_hash_table_util_ratio = 0.75;
table_options.block_size = 1024ULL;
return table_options;
}

rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const
{
auto const memtable_size_bytes = 10000;
auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 1;

// L1 size, compaction is triggered for L0 at this size (1 SST file in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes;

return cf_options;
}

::rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
{
auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes_a);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 4;

// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes_a * 4;

// Size target of levels are changed dynamically based on size of the last level
cf_options.level_compaction_dynamic_level_bytes = true;

return cf_options;
}

void nano::store::rocksdb::component::on_flush (::rocksdb::FlushJobInfo const & flush_job_info_a)
{
// Reset appropriate tombstone counters
@@ -1109,16 +937,6 @@ void nano::store::rocksdb::component::serialize_memory_stats (boost::property_tr
json.put ("block-cache-usage", val);
}

unsigned long long nano::store::rocksdb::component::blocks_memtable_size_bytes () const
{
return base_memtable_size_bytes ();
}

unsigned long long nano::store::rocksdb::component::base_memtable_size_bytes () const
{
return 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
}

// This is a ratio of the blocks memtable size to keep total write transaction commit size down.
unsigned nano::store::rocksdb::component::max_block_write_batch_num () const
{
12 changes: 1 addition & 11 deletions nano/store/rocksdb/rocksdb.hpp
@@ -108,7 +108,6 @@ class component : public nano::store::component
::rocksdb::TransactionDB * transaction_db = nullptr;
std::unique_ptr<::rocksdb::DB> db;
std::vector<std::unique_ptr<::rocksdb::ColumnFamilyHandle>> handles;
std::shared_ptr<::rocksdb::TableFactory> small_table_factory;
std::unordered_map<nano::tables, nano::mutex> write_lock_mutexes;
nano::rocksdb_config rocksdb_config;
unsigned const max_block_write_batch_num_m;
@@ -155,11 +154,7 @@

void construct_column_family_mutexes ();
::rocksdb::Options get_db_options ();
::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
::rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
::rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
::rocksdb::BlockBasedTableOptions get_active_table_options (std::size_t lru_size) const;
::rocksdb::BlockBasedTableOptions get_small_table_options () const;
::rocksdb::BlockBasedTableOptions get_table_options () const;
::rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;

void on_flush (::rocksdb::FlushJobInfo const &);
@@ -169,11 +164,6 @@
std::unordered_map<char const *, nano::tables> create_cf_name_table_map () const;

std::vector<::rocksdb::ColumnFamilyDescriptor> create_column_families ();
unsigned long long base_memtable_size_bytes () const;
unsigned long long blocks_memtable_size_bytes () const;

constexpr static int base_memtable_size = 16;
constexpr static int base_block_cache_size = 8;

friend class nano::rocksdb_block_store_tombstone_count_Test;
friend class rocksdb_block_store_upgrade_v21_v22_Test;
