Rocksdb cleanup #4717

Closed
wants to merge 11 commits
6 changes: 3 additions & 3 deletions nano/core_test/toml.cpp
@@ -239,7 +239,7 @@ TEST (toml, daemon_config_deserialize_defaults)
ASSERT_EQ (conf.node.lmdb_config.map_size, defaults.node.lmdb_config.map_size);

ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_EQ (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
ASSERT_EQ (conf.node.rocksdb_config.cache_size, defaults.node.rocksdb_config.cache_size);
ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);

ASSERT_EQ (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
@@ -573,7 +573,7 @@ TEST (toml, daemon_config_deserialize_no_defaults)

[node.rocksdb]
enable = true
memory_multiplier = 3
cache_size = 3
io_threads = 99

[node.experimental]
@@ -743,7 +743,7 @@ TEST (toml, daemon_config_deserialize_no_defaults)

ASSERT_TRUE (conf.node.rocksdb_config.enable);
ASSERT_EQ (nano::rocksdb_config::using_rocksdb_in_tests (), defaults.node.rocksdb_config.enable);
ASSERT_NE (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
ASSERT_NE (conf.node.rocksdb_config.cache_size, defaults.node.rocksdb_config.cache_size);
ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);

ASSERT_NE (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
8 changes: 4 additions & 4 deletions nano/lib/rocksdbconfig.cpp
@@ -5,25 +5,25 @@
nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
{
toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database.\ntype:bool");
toml.put ("memory_multiplier", memory_multiplier, "This will modify how much memory is used represented by 1 (low), 2 (medium), 3 (high). Default is 2.\ntype:uint8");
toml.put ("cache_size", cache_size, "Amount of memory in MB used for caching for each table. Valid values are from 1 to 1024. Default is 64.\ntype:uint8");
toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing.\ntype:uint32");
return toml.get_error ();
}

nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
{
toml.get_optional<bool> ("enable", enable);
toml.get_optional<uint8_t> ("memory_multiplier", memory_multiplier);
toml.get_optional<uint16_t> ("cache_size", cache_size);
toml.get_optional<unsigned> ("io_threads", io_threads);

// Validate ranges
if (io_threads == 0)
{
toml.get_error ().set ("io_threads must be non-zero");
}
if (memory_multiplier < 1 || memory_multiplier > 3)
if (cache_size < 1 || cache_size > 1024)
{
toml.get_error ().set ("memory_multiplier must be either 1, 2 or 3");
toml.get_error ().set ("cache_size must be between 1 and 1024 MB");
}

return toml.get_error ();
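
Note: the new cache_size value is given in MB and converted to bytes where the per-table LRU block cache is created (see get_active_table_options further down). A minimal sketch, not part of this diff, comparing the old memory_multiplier-derived cache with the new setting; the constants are the defaults visible in this PR (base_block_cache_size = 8, multiplier default 2, cache_size default 64):

```cpp
// Sketch only: contrasts old and new block cache sizing. All constants
// are assumptions taken from the defaults shown in this diff.
#include <cstdint>
#include <iostream>

int main ()
{
	// Old scheme: 1 MB * memory_multiplier (default 2) * base_block_cache_size (8)
	uint64_t old_cache_bytes = 1024ULL * 1024 * 2 * 8;
	// New scheme: cache_size is specified directly in MB (default 64)
	uint64_t new_cache_bytes = 1024ULL * 1024 * 64;
	std::cout << old_cache_bytes << " -> " << new_cache_bytes << '\n'; // 16777216 -> 67108864
}
```

One caveat: the old value was further scaled per table (x1, x2, or x4 depending on the column family, as the deleted get_cf_options below shows), whereas the new cache_size applies uniformly to every table.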
2 changes: 1 addition & 1 deletion nano/lib/rocksdbconfig.hpp
@@ -25,7 +25,7 @@ class rocksdb_config final
static bool using_rocksdb_in_tests ();

bool enable{ false };
uint8_t memory_multiplier{ 2 };
uint16_t cache_size{ 64 };
unsigned io_threads{ std::max (nano::hardware_concurrency () / 2, 1u) };
};
}
214 changes: 17 additions & 197 deletions nano/store/rocksdb/rocksdb.cpp
@@ -64,7 +64,7 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
logger{ logger_a },
constants{ constants },
rocksdb_config{ rocksdb_config_a },
max_block_write_batch_num_m{ nano::narrow_cast<unsigned> (blocks_memtable_size_bytes () / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
max_block_write_batch_num_m{ calculate_max_block_write_batch_num () },
cf_name_table_map{ create_cf_name_table_map () }
{
boost::system::error_code error_mkdir, error_chmod;
@@ -80,7 +80,6 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
debug_assert (path_a.filename () == "rocksdb");

generate_tombstone_map ();
small_table_factory.reset (::rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));

// TODO: get_db_options () registers a listener for resetting tombstones, needs to check if it is a problem calling it more than once.
auto options = get_db_options ();
@@ -400,118 +399,13 @@ void nano::store::rocksdb::component::generate_tombstone_map ()
tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
}

rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
{
::rocksdb::ColumnFamilyOptions cf_options;
cf_options.table_factory = table_factory_a;

// (1 active, 1 inactive)
auto num_memtables = 2;

// Each level is a multiple of the above. If L1 is 512MB, L2 will be 512MB * 8 = 4GB, L3 will be 4GB * 8 = 32GB, and so on...
cf_options.max_bytes_for_level_multiplier = 8;

// Although this should be the default provided by RocksDB, not setting this is causing sequence conflict checks if not using
cf_options.max_write_buffer_size_to_maintain = memtable_size_bytes_a * num_memtables;

// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
cf_options.ttl = 1 * 24 * 60 * 60;

// Multiplier for each level
cf_options.target_file_size_multiplier = 10;

// Size of level 1 sst files
cf_options.target_file_size_base = memtable_size_bytes_a;

// Size of each memtable
cf_options.write_buffer_size = memtable_size_bytes_a;

// Number of memtables to keep in memory
cf_options.max_write_buffer_number = num_memtables;

return cf_options;
}

rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_cf_options (std::string const & cf_name_a) const
{
::rocksdb::ColumnFamilyOptions cf_options;
auto const memtable_size_bytes = base_memtable_size_bytes ();
auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
if (cf_name_a == "blocks")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
cf_options = get_active_cf_options (table_factory, blocks_memtable_size_bytes ());
}
else if (cf_name_a == "confirmation_height")
{
// Entries will not be deleted in the normal case, so can make memtables a lot bigger
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
}
else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
{
// Meta - It contains just version key
// Online weight - Periodically deleted
// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
cf_options = get_small_cf_options (small_table_factory);
}
else if (cf_name_a == "cached_counts")
{
// Really small (keys are blocks tables, value is uint64_t)
cf_options = get_small_cf_options (small_table_factory);
}
else if (cf_name_a == "pending")
if (cf_name_a != ::rocksdb::kDefaultColumnFamilyName)
{
// Pending can have a lot of deletions too
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 2;

// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
}
else if (cf_name_a == "frontiers")
{
// Frontiers is only needed during bootstrap for legacy blocks
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "accounts")
{
// Can have deletions from rollbacks
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "vote")
{
// No deletes it seems, only overwrites.
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "pruned")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "final_votes")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "rep_weights")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == ::rocksdb::kDefaultColumnFamilyName)
{
// Do nothing.
}
else
{
debug_assert (false);
cf_options.table_factory = std::shared_ptr<::rocksdb::TableFactory> (::rocksdb::NewBlockBasedTableFactory (get_active_table_options ()));
cf_options.write_buffer_size = memtable_size_bytes;
}

return cf_options;
@@ -836,6 +730,8 @@ int nano::store::rocksdb::component::clear (::rocksdb::ColumnFamilyHandle * colu
::rocksdb::ReadOptions read_options;
::rocksdb::WriteOptions write_options;
::rocksdb::WriteBatch write_batch;
read_options.readahead_size = 0; // Readahead only adds overhead on SSD drives

std::unique_ptr<::rocksdb::Iterator> it (db->NewIterator (read_options, column_family));

for (it->SeekToFirst (); it->Valid (); it->Next ())
@@ -862,32 +758,10 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
::rocksdb::Options db_options;
db_options.create_if_missing = true;
db_options.create_missing_column_families = true;

// TODO: review if this should be changed due to the unchecked table removal.
// Enable whole key bloom filter in memtables for ones with memtable_prefix_bloom_size_ratio set (unchecked table currently).
// It can potentially reduce CPU usage for point-look-ups.
db_options.memtable_whole_key_filtering = true;

// Sets the compaction priority
db_options.compaction_pri = ::rocksdb::CompactionPri::kMinOverlappingRatio;

// Start aggressively flushing WAL files when they reach over 1GB
db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;

// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
// Set number of threads to use
db_options.IncreaseParallelism (rocksdb_config.io_threads);
// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
db_options.OptimizeLevelStyleCompaction ();

// Adds a separate write queue for memtable/WAL
db_options.enable_pipelined_write = true;

// Default is 16, setting to -1 allows faster startup times for SSDs by allowing more files to be read in parallel.
db_options.max_file_opening_threads = -1;

// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
// Default is 1GB, lowering this to avoid replaying for too long (100MB)
db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;

// Not compressing any SST files for compatibility reasons.
db_options.compression = ::rocksdb::kNoCompression;

@@ -899,75 +773,24 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
return db_options;
}
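
Note: after the deletions above, the surviving database options reduce to a handful of lines. A sketch of the resulting shape, as read from this hunk (an assumption about the final function, not the verbatim new code):

```cpp
// Sketch only: the DB options that remain after this cleanup, per the diff.
#include <rocksdb/options.h>

rocksdb::Options make_db_options (unsigned io_threads)
{
	rocksdb::Options db_options;
	db_options.create_if_missing = true;
	db_options.create_missing_column_families = true;
	db_options.IncreaseParallelism (io_threads); // background flush/compaction threads
	db_options.OptimizeLevelStyleCompaction (); // stock level-compaction tuning
	db_options.compression = rocksdb::kNoCompression; // kept uncompressed for compatibility
	return db_options;
}
```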

rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_active_table_options (std::size_t lru_size) const
rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_active_table_options () const
{
::rocksdb::BlockBasedTableOptions table_options;

// Improve point lookup performance by using the data block hash index (uses about 5% more space).
table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
table_options.data_block_hash_table_util_ratio = 0.75;

// Using format_version=4 significantly reduces the index block size, in some cases around 4-5x.
// This frees more space in block cache, which would result in higher hit rate for data and filter blocks,
// or offer the same performance with a smaller block cache size.
table_options.format_version = 4;
table_options.index_block_restart_interval = 16;

// Block cache for reads
table_options.block_cache = ::rocksdb::NewLRUCache (lru_size);

// Bloom filter to help with point reads. 10bits gives 1% false positive rate.
table_options.filter_policy.reset (::rocksdb::NewBloomFilterPolicy (10, false));

// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
table_options.block_size = 16 * 1024ULL;

// Whether level 0 index and filter blocks are stored in block_cache
table_options.pin_l0_filter_and_index_blocks_in_cache = true;
table_options.block_cache = ::rocksdb::NewLRUCache (1024ULL * 1024 * rocksdb_config.cache_size);

return table_options;
}
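
Note: combined with the simplified get_cf_options above, every non-default column family now shares one table factory and one write buffer size, in place of the many per-table configurations that were deleted. A self-contained sketch of that shape; make_cf_options and cache_size_mb are illustrative names, not identifiers from this PR:

```cpp
// Sketch only: the unified per-column-family setup this PR converges on.
#include <cstdint>
#include <rocksdb/cache.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

rocksdb::ColumnFamilyOptions make_cf_options (uint16_t cache_size_mb, uint64_t write_buffer_bytes)
{
	rocksdb::BlockBasedTableOptions table_options;
	// One LRU block cache per table, sized by the new cache_size option (MB)
	table_options.block_cache = rocksdb::NewLRUCache (1024ULL * 1024 * cache_size_mb);

	rocksdb::ColumnFamilyOptions cf_options;
	cf_options.table_factory.reset (rocksdb::NewBlockBasedTableFactory (table_options));
	cf_options.write_buffer_size = write_buffer_bytes; // memtable size
	return cf_options;
}
```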

rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_small_table_options () const
{
::rocksdb::BlockBasedTableOptions table_options;
// Improve point lookup performance by using the data block hash index (uses about 5% more space).
table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
table_options.data_block_hash_table_util_ratio = 0.75;
table_options.block_size = 1024ULL;
return table_options;
}

rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const
{
auto const memtable_size_bytes = 10000;
auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 1;

// L1 size, compaction is triggered for L0 at this size (1 SST file in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes;

return cf_options;
}

::rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
{
auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes_a);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 4;

// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes_a * 4;

// Size target of levels are changed dynamically based on size of the last level
cf_options.level_compaction_dynamic_level_bytes = true;

return cf_options;
}

void nano::store::rocksdb::component::on_flush (::rocksdb::FlushJobInfo const & flush_job_info_a)
{
// Reset appropriate tombstone counters
@@ -1109,22 +932,19 @@ void nano::store::rocksdb::component::serialize_memory_stats (boost::property_tr
json.put ("block-cache-usage", val);
}

unsigned long long nano::store::rocksdb::component::blocks_memtable_size_bytes () const
{
return base_memtable_size_bytes ();
}

unsigned long long nano::store::rocksdb::component::base_memtable_size_bytes () const
{
return 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
}

// This is a ratio of the blocks memtable size to keep total write transaction commit size down.
unsigned nano::store::rocksdb::component::max_block_write_batch_num () const
{
return max_block_write_batch_num_m;
}

unsigned nano::store::rocksdb::component::calculate_max_block_write_batch_num () const
{
// Calculates the max write batch size from the memtable_size (write buffer) and the size of a block.
// With a memtable_size of 32 MB we will get 125672 as max block write batch
return nano::narrow_cast<unsigned> (memtable_size_bytes / (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)));
}
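
Note: to make the in-code comment concrete, a state block entry works out to roughly 267 bytes (1-byte block_type tag + 216-byte state block + 50-byte sideband; these component sizes are assumptions based on nano's state block layout, not stated in this diff), which reproduces the 125672 figure:

```cpp
// Worked check of the batch-size formula. Entry size components are assumed.
#include <cstdint>

int main ()
{
	uint64_t memtable_size_bytes = 32ULL * 1024 * 1024; // the 32 MB case from the comment
	uint64_t entry_size = 1 + 216 + 50; // block_type + state_block + sideband = 267
	uint64_t max_batch = memtable_size_bytes / entry_size; // 33554432 / 267
	return max_batch == 125672 ? 0 : 1; // exits 0: the comment's figure checks out
}
```

With the 64 MB memtable_size_bytes constant declared in rocksdb.hpp, the same formula would give roughly double that, around 251344.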

std::string nano::store::rocksdb::component::error_string (int status) const
{
return std::to_string (status);
15 changes: 5 additions & 10 deletions nano/store/rocksdb/rocksdb.hpp
@@ -108,10 +108,10 @@ class component : public nano::store::component
::rocksdb::TransactionDB * transaction_db = nullptr;
std::unique_ptr<::rocksdb::DB> db;
std::vector<std::unique_ptr<::rocksdb::ColumnFamilyHandle>> handles;
std::shared_ptr<::rocksdb::TableFactory> small_table_factory;
std::unordered_map<nano::tables, nano::mutex> write_lock_mutexes;
nano::rocksdb_config rocksdb_config;
unsigned const max_block_write_batch_num_m;
unsigned calculate_max_block_write_batch_num () const;

class tombstone_info
{
@@ -155,25 +155,20 @@ class component : public nano::store::component

void construct_column_family_mutexes ();
::rocksdb::Options get_db_options ();
::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
::rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
::rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
::rocksdb::BlockBasedTableOptions get_active_table_options (std::size_t lru_size) const;
::rocksdb::BlockBasedTableOptions get_small_table_options () const;
::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
::rocksdb::BlockBasedTableOptions get_active_table_options () const;
::rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;

void on_flush (::rocksdb::FlushJobInfo const &);
void flush_table (nano::tables table_a);
void flush_tombstones_check (nano::tables table_a);
void generate_tombstone_map ();

std::unordered_map<char const *, nano::tables> create_cf_name_table_map () const;

std::vector<::rocksdb::ColumnFamilyDescriptor> create_column_families ();
unsigned long long base_memtable_size_bytes () const;
unsigned long long blocks_memtable_size_bytes () const;

constexpr static int base_memtable_size = 16;
constexpr static int base_block_cache_size = 8;
constexpr static int memtable_size_bytes = 1024ULL * 1024 * 64; // 64 MB write buffer

friend class nano::rocksdb_block_store_tombstone_count_Test;
friend class rocksdb_block_store_upgrade_v21_v22_Test;