From 5d5a8ceda8fbd3ab60a5e284f74b0219cebca109 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 21 Sep 2024 23:06:24 +0200 Subject: [PATCH 01/90] KeyRange::empty --- src/key_range.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/key_range.rs b/src/key_range.rs index eea2f839..c7f862f6 100644 --- a/src/key_range.rs +++ b/src/key_range.rs @@ -29,6 +29,10 @@ impl KeyRange { Self(range) } + pub fn empty() -> Self { + Self((Slice::new(b""), Slice::new(b""))) + } + /// Returns `true` if the list of key ranges is disjoint pub fn is_disjoint(ranges: &[&Self]) -> bool { for (idx, a) in ranges.iter().enumerate() { From 521e84c62b15021c70e58094535da437a5f9c1fe Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 21 Sep 2024 23:07:02 +0200 Subject: [PATCH 02/90] import --- src/level_manifest/level.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 00b345f9..2464b18f 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use crate::{key_range::KeyRange, segment::meta::SegmentId, Segment}; -use std::sync::Arc; +use std::{ops::Bound, sync::Arc}; /// Level of an LSM-tree #[derive(Clone, Debug)] From a782200381ab2c5ce5e61480cdb6ca5b0de249c4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 21 Sep 2024 23:07:26 +0200 Subject: [PATCH 03/90] import --- src/level_manifest/level.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 2464b18f..3166b009 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::{key_range::KeyRange, segment::meta::SegmentId, Segment}; +use crate::{key_range::KeyRange, segment::meta::SegmentId, Segment, UserKey}; use std::{ops::Bound, sync::Arc}; /// Level of an LSM-tree From eeb51a1bb8d540d71709c64be3c38317fe483971 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 22 Sep 2024 00:30:24 +0200 Subject: [PATCH 04/90] #51 (L1 segments also need to get full index?) --- src/compaction/fifo.rs | 7 +- src/compaction/leveled.rs | 7 +- src/compaction/maintenance.rs | 11 ++- src/compaction/tiered.rs | 7 +- src/compaction/worker.rs | 12 ++- src/level_manifest/level.rs | 30 ++++-- src/segment/block_index/full_index.rs | 74 +++++++++++++++ src/segment/block_index/mod.rs | 35 ++++++- src/segment/block_index/top_level.rs | 35 +++---- src/segment/block_index/two_level_index.rs | 103 +++++++++++++-------- src/segment/mod.rs | 38 ++++---- src/segment/range.rs | 55 ++++++----- src/segment/writer/mod.rs | 6 +- src/tree/mod.rs | 20 ++-- 14 files changed, 311 insertions(+), 129 deletions(-) create mode 100644 src/segment/block_index/full_index.rs diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 3fa5f3ce..755ef69e 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -126,7 +126,7 @@ mod tests { key_range::KeyRange, level_manifest::LevelManifest, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, + block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, Segment, @@ -144,10 +144,13 @@ mod tests { fn fixture_segment(id: SegmentId, created_at: u128) -> Arc { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); + let block_index = TwoLevelBlockIndex::new((0, id).into(), block_cache.clone()); + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); + Arc::new(Segment { tree_id: 0, descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - block_index: Arc::new(TwoLevelBlockIndex::new((0, id).into(), block_cache.clone())), + block_index, offsets: FileOffsets { bloom_ptr: 0, diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 87427132..f4342c09 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -238,7 +238,7 @@ mod tests { key_range::KeyRange, level_manifest::LevelManifest, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, + block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, Segment, @@ -269,10 +269,13 @@ mod tests { ) -> Arc { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); + let block_index = TwoLevelBlockIndex::new((0, id).into(), block_cache.clone()); + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); + Arc::new(Segment { tree_id: 0, descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - block_index: Arc::new(TwoLevelBlockIndex::new((0, id).into(), block_cache.clone())), + block_index, offsets: FileOffsets { bloom_ptr: 0, diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index d2960739..a479be4d 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -86,8 +86,10 @@ mod tests { key_range::KeyRange, level_manifest::LevelManifest, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, file_offsets::FileOffsets, - meta::Metadata, Segment, + block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, + file_offsets::FileOffsets, + meta::Metadata, + Segment, }, }; use std::sync::Arc; @@ -100,10 +102,13 @@ mod tests { fn fixture_segment(id: SegmentId, created_at: u128) -> Arc { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); + let block_index = TwoLevelBlockIndex::new((0, id).into(), block_cache.clone()); + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); + Arc::new(Segment { tree_id: 0, descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - block_index: Arc::new(TwoLevelBlockIndex::new((0, id).into(), block_cache.clone())), + block_index, offsets: FileOffsets { bloom_ptr: 0, diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 6fe5ca95..0a8324ed 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -131,7 +131,7 @@ mod tests { key_range::KeyRange, level_manifest::LevelManifest, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, + block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, Segment, @@ -148,10 +148,13 @@ mod tests { fn fixture_segment(id: SegmentId, size_mib: u64, max_seqno: SeqNo) -> Arc { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); + let block_index = TwoLevelBlockIndex::new((0, id).into(), block_cache.clone()); + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); + Arc::new(Segment { tree_id: 0, descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - block_index: Arc::new(TwoLevelBlockIndex::new((0, id).into(), block_cache.clone())), + block_index, offsets: FileOffsets { bloom_ptr: 0, diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 05eb549b..d4787d57 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -229,21 +229,23 @@ fn merge_segments( let segment_id = trailer.metadata.id; let segment_file_path = segments_base_folder.join(segment_id.to_string()); - let tli_ptr = trailer.offsets.tli_ptr; - #[cfg(feature = "bloom")] let bloom_ptr = trailer.offsets.bloom_ptr; // NOTE: Need to allow because of false positive in Clippy // because of "bloom" feature #[allow(clippy::needless_borrows_for_generic_args)] - let block_index = Arc::new(TwoLevelBlockIndex::from_file( + let block_index = TwoLevelBlockIndex::from_file( &segment_file_path, - tli_ptr, + &trailer.metadata, + &trailer.offsets, (opts.tree_id, segment_id).into(), opts.config.descriptor_table.clone(), opts.config.block_cache.clone(), - )?); + )?; + let block_index = Arc::new(crate::segment::block_index::BlockIndexImpl::TwoLevel( + block_index, + )); Ok(Arc::new(Segment { tree_id: opts.tree_id, diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 3166b009..7c86bf77 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -17,6 +17,8 @@ pub struct Level { /// is only recomputed when the level is changed /// to avoid unnecessary CPU work pub is_disjoint: bool, + + pub key_range: KeyRange, } impl std::fmt::Display for Level { @@ -41,22 +43,32 @@ impl Default for Level { fn default() -> Self { Self { is_disjoint: true, - segments: Vec::with_capacity(10), + segments: Vec::new(), + key_range: KeyRange::empty(), } } } impl Level { - pub fn insert(&mut self, segment: Arc) { - self.segments.push(segment); + // TODO: unit test + fn set_key_range(&mut self) { + todo!() + } + + fn update_metadata(&mut self) { self.set_disjoint_flag(); self.sort(); + // self.set_key_range(); + } + + pub fn insert(&mut self, segment: Arc) { + self.segments.push(segment); + self.update_metadata(); } pub fn remove(&mut self, segment_id: SegmentId) { self.segments.retain(|x| segment_id != x.metadata.id); - self.set_disjoint_flag(); - self.sort(); + self.update_metadata(); } pub(crate) fn sort(&mut self) { @@ -182,7 +194,7 @@ mod tests { descriptor_table::FileDescriptorTable, key_range::KeyRange, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, + block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, file_offsets::FileOffsets, meta::{Metadata, SegmentId}, Segment, @@ -199,10 +211,13 @@ mod tests { fn fixture_segment(id: SegmentId, key_range: KeyRange) -> Arc { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); + let block_index = TwoLevelBlockIndex::new((0, id).into(), block_cache.clone()); + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); + Arc::new(Segment { tree_id: 0, descriptor_table: Arc::new(FileDescriptorTable::new(512, 1)), - block_index: Arc::new(TwoLevelBlockIndex::new((0, id).into(), block_cache.clone())), + block_index, offsets: FileOffsets { bloom_ptr: 0, @@ -244,6 +259,7 @@ mod tests { fn level_disjoint_cull() { let level = Level { is_disjoint: true, + key_range: KeyRange::empty(), segments: vec![ fixture_segment(0, KeyRange::new((Slice::from("a"), Slice::from("c")))), fixture_segment(1, KeyRange::new((Slice::from("d"), Slice::from("g")))), diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs new file mode 100644 index 00000000..a571be2d --- /dev/null +++ b/src/segment/block_index/full_index.rs @@ -0,0 +1,74 @@ +use super::{block_handle::KeyedBlockHandle, BlockIndex}; +use crate::segment::{block_index::IndexBlock, value_block::CachePolicy}; +use std::{fs::File, io::Seek, path::Path}; + +/// Index that translates item keys to block handles +/// +/// The index is fully loaded into memory. +pub struct FullBlockIndex(Box<[KeyedBlockHandle]>); + +impl FullBlockIndex { + pub fn from_file>( + path: P, + metadata: &crate::segment::meta::Metadata, + offsets: &crate::segment::file_offsets::FileOffsets, + ) -> crate::Result { + let path = path.as_ref(); + let cnt = metadata.index_block_count as usize; + + log::trace!( + "reading full block index from {path:?} at idx_ptr={} ({cnt} index blocks)", + offsets.index_block_ptr, + ); + + let mut file = File::open(path)?; + file.seek(std::io::SeekFrom::Start(offsets.index_block_ptr))?; + + let mut block_handles = Vec::with_capacity(cnt); + + for _ in 0..cnt { + let idx_block = IndexBlock::from_reader(&mut file)?.items; + block_handles.extend(idx_block); + } + + debug_assert!(!block_handles.is_empty()); + debug_assert_eq!(cnt, block_handles.len()); + + Ok(Self(block_handles.into_boxed_slice())) + } +} + +impl BlockIndex for FullBlockIndex { + fn get_lowest_block_containing_key( + &self, + key: &[u8], + _: CachePolicy, + ) -> crate::Result> { + use super::RawBlockIndex; + + self.0 + .get_lowest_block_containing_key(key, CachePolicy::Read) + .map(|x| x.map(|x| x.offset)) + } + + /// Gets the last block handle that may contain the given item + fn get_last_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result> { + use super::RawBlockIndex; + + self.0 + .get_last_block_containing_key(key, cache_policy) + .map(|x| x.map(|x| x.offset)) + } + + fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { + use super::RawBlockIndex; + + self.0 + .get_last_block_handle(CachePolicy::Read) + .map(|x| x.offset) + } +} diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index c791d892..f6a955ba 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -3,16 +3,19 @@ // (found in the LICENSE-* files in the repository) pub mod block_handle; +pub mod full_index; pub mod top_level; pub mod two_level_index; pub mod writer; use super::{block::Block, value_block::CachePolicy}; use block_handle::KeyedBlockHandle; +use full_index::FullBlockIndex; +use two_level_index::TwoLevelBlockIndex; pub type IndexBlock = Block; -impl BlockIndex for [KeyedBlockHandle] { +impl RawBlockIndex for [KeyedBlockHandle] { fn get_lowest_block_containing_key( &self, key: &[u8], @@ -54,7 +57,28 @@ impl BlockIndex for [KeyedBlockHandle] { } } +#[enum_dispatch::enum_dispatch] pub trait BlockIndex { + /// Gets the lowest block handle that may contain the given item + fn get_lowest_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result>; + + /// Gets the last block handle that may contain the given item + fn get_last_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result>; + + /// Returns a handle to the last block + fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result; +} + +#[allow(clippy::module_name_repetitions)] +pub trait RawBlockIndex { /// Gets the lowest block handle that may contain the given item fn get_lowest_block_containing_key( &self, @@ -73,11 +97,18 @@ pub trait BlockIndex { fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result<&KeyedBlockHandle>; } +#[enum_dispatch::enum_dispatch(BlockIndex)] +#[allow(clippy::module_name_repetitions)] +pub enum BlockIndexImpl { + Full(FullBlockIndex), + TwoLevel(TwoLevelBlockIndex), +} + #[cfg(test)] #[allow(clippy::expect_used)] mod tests { use super::*; - use crate::{segment::block_index::BlockIndex, Slice}; + use crate::Slice; use test_log::test; fn bh>(end_key: K, offset: u64) -> KeyedBlockHandle { diff --git a/src/segment/block_index/top_level.rs b/src/segment/block_index/top_level.rs index c4a1be5c..15ad1606 100644 --- a/src/segment/block_index/top_level.rs +++ b/src/segment/block_index/top_level.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{block_handle::KeyedBlockHandle, BlockIndex}; +use super::{block_handle::KeyedBlockHandle, RawBlockIndex}; use crate::segment::{block_index::IndexBlock, value_block::CachePolicy}; use std::{fs::File, path::Path}; @@ -29,27 +29,30 @@ use std::{fs::File, path::Path}; pub struct TopLevelIndex(Box<[KeyedBlockHandle]>); impl TopLevelIndex { - /// Creates a top-level block index - #[must_use] - pub fn from_boxed_slice(handles: Box<[KeyedBlockHandle]>) -> Self { - Self(handles) - } - - /// Loads a top-level index from disk - pub fn from_file>(path: P, offset: u64) -> crate::Result { + pub fn from_file>( + path: P, + _: &crate::segment::meta::Metadata, + offsets: &crate::segment::file_offsets::FileOffsets, + ) -> crate::Result { let path = path.as_ref(); - log::trace!("reading TLI from {path:?}, offset={offset}"); + + log::trace!("reading TLI from {path:?} at tli_ptr={}", offsets.tli_ptr); let mut file = File::open(path)?; + let items = IndexBlock::from_file(&mut file, offsets.tli_ptr)?.items; - let items = IndexBlock::from_file(&mut file, offset)?.items; log::trace!("loaded TLI ({path:?}): {items:?}"); - debug_assert!(!items.is_empty()); Ok(Self::from_boxed_slice(items)) } + /// Creates a top-level block index + #[must_use] + pub fn from_boxed_slice(handles: Box<[KeyedBlockHandle]>) -> Self { + Self(handles) + } + #[must_use] pub fn len(&self) -> usize { self.0.len() @@ -63,10 +66,8 @@ impl TopLevelIndex { pub fn iter(&self) -> impl Iterator { self.0.iter() } -} -impl BlockIndex for TopLevelIndex { - fn get_lowest_block_containing_key( + pub fn get_lowest_block_containing_key( &self, key: &[u8], _: CachePolicy, @@ -76,7 +77,7 @@ impl BlockIndex for TopLevelIndex { } /// Gets the last block handle that may contain the given item - fn get_last_block_containing_key( + pub fn get_last_block_containing_key( &self, key: &[u8], cache_policy: CachePolicy, @@ -84,7 +85,7 @@ impl BlockIndex for TopLevelIndex { self.0.get_last_block_containing_key(key, cache_policy) } - fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<&KeyedBlockHandle> { + pub fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<&KeyedBlockHandle> { self.0.get_last_block_handle(CachePolicy::Read) } } diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs index ffdb284f..df74037a 100644 --- a/src/segment/block_index/two_level_index.rs +++ b/src/segment/block_index/two_level_index.rs @@ -4,11 +4,14 @@ use super::{ super::{id::GlobalSegmentId, value_block::CachePolicy}, - block_handle::KeyedBlockHandle, top_level::TopLevelIndex, BlockIndex, IndexBlock, }; -use crate::{block_cache::BlockCache, descriptor_table::FileDescriptorTable}; +use crate::{ + block_cache::BlockCache, + descriptor_table::FileDescriptorTable, + segment::{file_offsets::FileOffsets, meta::Metadata}, +}; use std::{path::Path, sync::Arc}; /// Allows reading index blocks - just a wrapper around a block cache @@ -52,13 +55,35 @@ pub struct TwoLevelBlockIndex { index_block_fetcher: IndexBlockFetcher, } +impl BlockIndex for TwoLevelBlockIndex { + fn get_lowest_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result> { + self.get_lowest_data_block_handle_containing_item(key, cache_policy) + } + + fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result { + self.get_last_data_block_handle(cache_policy) + } + + fn get_last_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result> { + self.get_last_data_block_handle_containing_item(key, cache_policy) + } +} + impl TwoLevelBlockIndex { /// Gets the lowest block handle that may contain the given item pub fn get_lowest_data_block_handle_containing_item( &self, key: &[u8], cache_policy: CachePolicy, - ) -> crate::Result> { + ) -> crate::Result> { let Some(index_block_handle) = self .top_level_index .get_lowest_block_containing_key(key, cache_policy) @@ -67,13 +92,17 @@ impl TwoLevelBlockIndex { return Ok(None); }; - let index_block = self.load_index_block(index_block_handle, cache_policy)?; + let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; - Ok(index_block - .items - .get_lowest_block_containing_key(key, cache_policy) - .expect("cannot fail") - .cloned()) + Ok({ + use super::RawBlockIndex; + + index_block + .items + .get_lowest_block_containing_key(key, cache_policy) + .expect("cannot fail") + .map(|x| x.offset) + }) } /// Gets the last block handle that may contain the given item @@ -81,7 +110,7 @@ impl TwoLevelBlockIndex { &self, key: &[u8], cache_policy: CachePolicy, - ) -> crate::Result> { + ) -> crate::Result> { let Some(index_block_handle) = self .top_level_index .get_last_block_containing_key(key, cache_policy) @@ -90,45 +119,43 @@ impl TwoLevelBlockIndex { return Ok(Some(self.get_last_data_block_handle(cache_policy)?)); }; - let index_block = self.load_index_block(index_block_handle, cache_policy)?; + let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; - Ok(index_block - .items - .get_last_block_containing_key(key, cache_policy) - .expect("cannot fail") - .cloned()) + Ok({ + use super::RawBlockIndex; + + index_block + .items + .get_last_block_containing_key(key, cache_policy) + .expect("cannot fail") + .map(|x| x.offset) + }) } - pub fn get_last_data_block_handle( - &self, - cache_policy: CachePolicy, - ) -> crate::Result { + pub fn get_last_data_block_handle(&self, cache_policy: CachePolicy) -> crate::Result { let index_block_handle = self .top_level_index .get_last_block_handle(cache_policy) .expect("cannot fail"); - let index_block = self.load_index_block(index_block_handle, cache_policy)?; + let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; Ok(index_block .items .last() .expect("index block should not be empty") - .clone()) + .offset) } /// Loads an index block from disk pub fn load_index_block( &self, - block_handle: &KeyedBlockHandle, + offset: u64, cache_policy: CachePolicy, ) -> crate::Result> { - log::trace!("loading index block {:?}/{block_handle:?}", self.segment_id); + log::trace!("loading index block {:?}/{offset:?}", self.segment_id); - if let Some(block) = self - .index_block_fetcher - .get(self.segment_id, block_handle.offset) - { + if let Some(block) = self.index_block_fetcher.get(self.segment_id, offset) { // Cache hit: Copy from block Ok(block) @@ -142,13 +169,13 @@ impl TwoLevelBlockIndex { let block = IndexBlock::from_file( &mut *file_guard.file.lock().expect("lock is poisoned"), - block_handle.offset, + offset, ) .map_err(|e| { log::error!( "Failed to load index block {:?}/{:?}: {e:?}", self.segment_id, - block_handle.offset + offset ); e })?; @@ -159,11 +186,8 @@ impl TwoLevelBlockIndex { let block = Arc::new(block); if cache_policy == CachePolicy::Write { - self.index_block_fetcher.insert( - self.segment_id, - block_handle.offset, - block.clone(), - ); + self.index_block_fetcher + .insert(self.segment_id, offset, block.clone()); } Ok(block) @@ -184,16 +208,17 @@ impl TwoLevelBlockIndex { } pub fn from_file>( - file_path: P, - offset: u64, + path: P, + metadata: &Metadata, + offsets: &FileOffsets, segment_id: GlobalSegmentId, descriptor_table: Arc, block_cache: Arc, ) -> crate::Result { - let file_path = file_path.as_ref(); + let file_path = path.as_ref(); log::trace!("Reading block index from {file_path:?}"); - let top_level_index = TopLevelIndex::from_file(file_path, offset)?; + let top_level_index = TopLevelIndex::from_file(file_path, metadata, offsets)?; Ok(Self { descriptor_table, diff --git a/src/segment/mod.rs b/src/segment/mod.rs index ff482fde..83916ee2 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -26,7 +26,7 @@ use crate::{ ValueType, }; use block::checksum::Checksum; -use block_index::two_level_index::TwoLevelBlockIndex; +use block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}; use file_offsets::FileOffsets; use range::Range; use std::{ops::Bound, path::Path, sync::Arc}; @@ -57,7 +57,7 @@ pub struct Segment { /// Translates key (first item of a block) to block offset (address inside file) and (compressed) size #[doc(hidden)] - pub block_index: Arc, + pub block_index: Arc, /// Block cache /// @@ -92,7 +92,8 @@ impl Segment { let mut file = guard.file.lock().expect("lock is poisoned"); - // NOTE: TODO: because of 1.74.0 + todo!(); + /* // NOTE: TODO: because of 1.74.0 #[allow(clippy::explicit_iter_loop)] for handle in self.block_index.top_level_index.iter() { let block = match IndexBlock::from_file(&mut *file, handle.offset) { @@ -137,7 +138,7 @@ impl Segment { log::debug!("Checked {data_block_count} data blocks"); } } - } + } */ assert_eq!( data_block_count, self.metadata.data_block_count, @@ -147,6 +148,8 @@ impl Segment { Ok(broken_count) } + // TODO: need to give recovery a flag to choose which block index to load + /// Tries to recover a segment from a file. pub(crate) fn recover>( file_path: P, @@ -167,11 +170,13 @@ impl Segment { ); let block_index = TwoLevelBlockIndex::from_file( file_path, - trailer.offsets.tli_ptr, + &trailer.metadata, + &trailer.offsets, (tree_id, trailer.metadata.id).into(), descriptor_table.clone(), block_cache.clone(), )?; + let block_index = BlockIndexImpl::TwoLevel(block_index); #[cfg(feature = "bloom")] let bloom_ptr = trailer.offsets.bloom_ptr; @@ -218,6 +223,10 @@ impl Segment { seqno: Option, hash: CompositeHash, ) -> crate::Result> { + if !self.bloom_filter.contains_hash(hash) { + return Ok(None); + } + if let Some(seqno) = seqno { if self.metadata.seqnos.0 >= seqno { return Ok(None); @@ -228,12 +237,6 @@ impl Segment { return Ok(None); } - { - if !self.bloom_filter.contains_hash(hash) { - return Ok(None); - } - } - self.point_read(key, seqno) } @@ -242,13 +245,14 @@ impl Segment { key: K, seqno: Option, ) -> crate::Result> { + use block_index::BlockIndex; use value_block::{CachePolicy, ValueBlock}; let key = key.as_ref(); let Some(first_block_handle) = self .block_index - .get_lowest_data_block_handle_containing_item(key.as_ref(), CachePolicy::Write)? + .get_lowest_block_containing_key(key, CachePolicy::Write)? else { return Ok(None); }; @@ -257,7 +261,7 @@ impl Segment { &self.descriptor_table, &self.block_cache, (self.tree_id, self.metadata.id).into(), - first_block_handle.offset, + first_block_handle, CachePolicy::Write, )? else { @@ -286,7 +290,7 @@ impl Segment { self.descriptor_table.clone(), (self.tree_id, self.metadata.id).into(), self.block_cache.clone(), - first_block_handle.offset, + first_block_handle, None, ); reader.lo_block_size = block.header.data_length.into(); @@ -355,18 +359,18 @@ impl Segment { key: K, seqno: Option, ) -> crate::Result> { + let key = key.as_ref(); + if let Some(seqno) = seqno { if self.metadata.seqnos.0 >= seqno { return Ok(None); } } - if !self.metadata.key_range.contains_key(&key) { + if !self.metadata.key_range.contains_key(key) { return Ok(None); } - let key = key.as_ref(); - #[cfg(feature = "bloom")] { debug_assert!(false, "Use Segment::get_with_hash instead"); diff --git a/src/segment/range.rs b/src/segment/range.rs index c7dc9d39..172c2b50 100644 --- a/src/segment/range.rs +++ b/src/segment/range.rs @@ -2,7 +2,8 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::block_index::two_level_index::TwoLevelBlockIndex; +use super::block_index::BlockIndex; +use super::block_index::BlockIndexImpl; use super::id::GlobalSegmentId; use super::reader::Reader; use super::value_block::CachePolicy; @@ -16,7 +17,7 @@ use std::ops::RangeBounds; use std::sync::Arc; pub struct Range { - block_index: Arc, + block_index: Arc, is_initialized: bool, @@ -33,7 +34,7 @@ impl Range { descriptor_table: Arc, segment_id: GlobalSegmentId, block_cache: Arc, - block_index: Arc, + block_index: Arc, range: (Bound, Bound), ) -> Self { let reader = Reader::new( @@ -72,9 +73,9 @@ impl Range { Bound::Included(start) | Bound::Excluded(start) => { if let Some(lower_bound) = self .block_index - .get_lowest_data_block_handle_containing_item(start, CachePolicy::Write)? + .get_lowest_block_containing_key(start, CachePolicy::Write)? { - self.reader.lo_block_offset = lower_bound.offset; + self.reader.lo_block_offset = lower_bound; } Some(start) @@ -85,20 +86,18 @@ impl Range { // would make short ranges 1.5-2x faster if only one direction is used let end_key: Option<&Slice> = match self.range.end_bound() { Bound::Unbounded => { - let upper_bound = self - .block_index - .get_last_data_block_handle(CachePolicy::Write)?; + let upper_bound = self.block_index.get_last_block_handle(CachePolicy::Write)?; - self.reader.hi_block_offset = Some(upper_bound.offset); + self.reader.hi_block_offset = Some(upper_bound); None } Bound::Included(end) | Bound::Excluded(end) => { if let Some(upper_bound) = self .block_index - .get_last_data_block_handle_containing_item(end, CachePolicy::Write)? + .get_last_block_containing_key(end, CachePolicy::Write)? { - self.reader.hi_block_offset = Some(upper_bound.offset); + self.reader.hi_block_offset = Some(upper_bound); } Some(end) @@ -233,7 +232,7 @@ mod tests { block_cache::BlockCache, descriptor_table::FileDescriptorTable, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, + block_index::{two_level_index::TwoLevelBlockIndex, BlockIndexImpl}, range::Range, writer::{Options, Writer}, }, @@ -285,13 +284,15 @@ mod tests { table.insert(&segment_file_path, (0, 0).into()); let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = Arc::new(TwoLevelBlockIndex::from_file( + let block_index = TwoLevelBlockIndex::from_file( segment_file_path, - trailer.offsets.tli_ptr, + &trailer.metadata, + &trailer.offsets, (0, 0).into(), table.clone(), block_cache.clone(), - )?); + )?; + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); let iter = Range::new( trailer.offsets.index_block_ptr, @@ -384,13 +385,15 @@ mod tests { table.insert(&segment_file_path, (0, 0).into()); let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = Arc::new(TwoLevelBlockIndex::from_file( + let block_index = TwoLevelBlockIndex::from_file( segment_file_path, - trailer.offsets.tli_ptr, + &trailer.metadata, + &trailer.offsets, (0, 0).into(), table.clone(), block_cache.clone(), - )?); + )?; + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); { let mut iter = Range::new( @@ -584,13 +587,15 @@ mod tests { table.insert(&segment_file_path, (0, 0).into()); let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = Arc::new(TwoLevelBlockIndex::from_file( + let block_index = TwoLevelBlockIndex::from_file( segment_file_path, - trailer.offsets.tli_ptr, + &trailer.metadata, + &trailer.offsets, (0, 0).into(), table.clone(), block_cache.clone(), - )?); + )?; + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); let ranges: Vec<(Bound, Bound)> = vec![ range_bounds_to_tuple(&(0..1_000)), @@ -687,13 +692,15 @@ mod tests { table.insert(&segment_file_path, (0, 0).into()); let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); - let block_index = Arc::new(TwoLevelBlockIndex::from_file( + let block_index = TwoLevelBlockIndex::from_file( segment_file_path, - trailer.offsets.tli_ptr, + &trailer.metadata, + &trailer.offsets, (0, 0).into(), table.clone(), block_cache.clone(), - )?); + )?; + let block_index = Arc::new(BlockIndexImpl::TwoLevel(block_index)); for (i, &start_char) in chars.iter().enumerate() { for &end_char in chars.iter().skip(i + 1) { diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 08876537..84aea211 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -392,7 +392,11 @@ mod tests { // the TLI length fits into u32 as well #[allow(clippy::cast_possible_truncation)] { - let tli = TopLevelIndex::from_file(&segment_file_path, trailer.offsets.tli_ptr)?; + use crate::segment::block_index::BlockIndex; + + let tli = + TopLevelIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; + assert_eq!(tli.len() as u32, trailer.metadata.index_block_count); } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index be0a7012..11ec87e6 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -13,7 +13,13 @@ use crate::{ manifest::Manifest, memtable::Memtable, range::{prefix_to_range, MemtableLockGuard, TreeIter}, - segment::{block_index::two_level_index::TwoLevelBlockIndex, meta::TableType, Segment}, + segment::{ + block_index::{ + full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex, BlockIndexImpl, + }, + meta::TableType, + Segment, + }, stop_signal::StopSignal, value::InternalValue, version::Version, @@ -474,13 +480,10 @@ impl Tree { log::debug!("Finalized segment write at {segment_folder:?}"); - let block_index = Arc::new(TwoLevelBlockIndex::from_file( - &segment_file_path, - trailer.offsets.tli_ptr, - (self.id, segment_id).into(), - self.config.descriptor_table.clone(), - self.config.block_cache.clone(), - )?); + // TODO: full block index + let block_index = + FullBlockIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; + let block_index = Arc::new(BlockIndexImpl::Full(block_index)); #[cfg(feature = "bloom")] let bloom_ptr = trailer.offsets.bloom_ptr; @@ -672,6 +675,7 @@ impl Tree { } return Ok(Some(entry)); }; + drop(memtable_lock); // Now look in sealed memtables From 888b11f27a4a5187033c53bd2b04d860530b3a61 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 28 Sep 2024 02:28:21 +0200 Subject: [PATCH 05/90] also use full index for L1 --- src/compaction/worker.rs | 47 ++++++++++++++++++++++++------------- src/level_manifest/level.rs | 7 +++--- src/tree/mod.rs | 5 +--- 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index d4787d57..fa19ae16 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -9,8 +9,12 @@ use crate::{ level_manifest::LevelManifest, merge::{BoxedIterator, Merger}, segment::{ - block_index::two_level_index::TwoLevelBlockIndex, id::GlobalSegmentId, - multi_writer::MultiWriter, Segment, + block_index::{ + full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex, BlockIndexImpl, + }, + id::GlobalSegmentId, + multi_writer::MultiWriter, + Segment, }, stop_signal::StopSignal, tree::inner::{SealedMemtables, TreeId}, @@ -232,20 +236,31 @@ fn merge_segments( #[cfg(feature = "bloom")] let bloom_ptr = trailer.offsets.bloom_ptr; - // NOTE: Need to allow because of false positive in Clippy - // because of "bloom" feature - #[allow(clippy::needless_borrows_for_generic_args)] - let block_index = TwoLevelBlockIndex::from_file( - &segment_file_path, - &trailer.metadata, - &trailer.offsets, - (opts.tree_id, segment_id).into(), - opts.config.descriptor_table.clone(), - opts.config.block_cache.clone(), - )?; - let block_index = Arc::new(crate::segment::block_index::BlockIndexImpl::TwoLevel( - block_index, - )); + let block_index = match payload.dest_level { + 0 | 1 => { + let block_index = FullBlockIndex::from_file( + &segment_file_path, + &trailer.metadata, + &trailer.offsets, + )?; + BlockIndexImpl::Full(block_index) + } + _ => { + // NOTE: Need to allow because of false positive in Clippy + // because of "bloom" feature + #[allow(clippy::needless_borrows_for_generic_args)] + let block_index = TwoLevelBlockIndex::from_file( + &segment_file_path, + &trailer.metadata, + &trailer.offsets, + (opts.tree_id, segment_id).into(), + opts.config.descriptor_table.clone(), + opts.config.block_cache.clone(), + )?; + BlockIndexImpl::TwoLevel(block_index) + } + }; + let block_index = Arc::new(block_index); Ok(Arc::new(Segment { tree_id: opts.tree_id, diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 64ae691b..0c970052 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -17,8 +17,7 @@ pub struct Level { /// is only recomputed when the level is changed /// to avoid unnecessary CPU work pub is_disjoint: bool, - - pub key_range: KeyRange, + // pub key_range: KeyRange, } impl std::fmt::Display for Level { @@ -44,7 +43,7 @@ impl Default for Level { Self { is_disjoint: true, segments: Vec::new(), - key_range: KeyRange::empty(), + // key_range: KeyRange::empty(), } } } @@ -289,7 +288,7 @@ mod tests { fn level_disjoint_cull() { let level = Level { is_disjoint: true, - key_range: KeyRange::empty(), + // key_range: KeyRange::empty(), segments: vec![ fixture_segment(0, KeyRange::new((Slice::from("a"), Slice::from("c")))), fixture_segment(1, KeyRange::new((Slice::from("d"), Slice::from("g")))), diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 11ec87e6..cda703c2 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -14,9 +14,7 @@ use crate::{ memtable::Memtable, range::{prefix_to_range, MemtableLockGuard, TreeIter}, segment::{ - block_index::{ - full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex, BlockIndexImpl, - }, + block_index::{full_index::FullBlockIndex, BlockIndexImpl}, meta::TableType, Segment, }, @@ -480,7 +478,6 @@ impl Tree { log::debug!("Finalized segment write at {segment_folder:?}"); - // TODO: full block index let block_index = FullBlockIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; let block_index = Arc::new(BlockIndexImpl::Full(block_index)); From d18b4cbecbe167bbf7e2b983cac4495ca7c25e00 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 2 Nov 2024 18:25:44 +0100 Subject: [PATCH 06/90] fix bench --- benches/tree.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benches/tree.rs b/benches/tree.rs index 1929db2b..3cef9588 100644 --- a/benches/tree.rs +++ b/benches/tree.rs @@ -181,7 +181,7 @@ fn tree_get_pairs(c: &mut Criterion) { { let folder = tempfile::tempdir().unwrap(); let tree = Config::new(folder) - .block_size(1_024) + .data_block_size(1_024) .block_cache(Arc::new(BlockCache::with_capacity_bytes(0))) .open() .unwrap(); @@ -219,7 +219,7 @@ fn tree_get_pairs(c: &mut Criterion) { { let folder = tempfile::tempdir().unwrap(); let tree = Config::new(folder) - .block_size(1_024) + .data_block_size(1_024) .block_cache(Arc::new(BlockCache::with_capacity_bytes(0))) .open() .unwrap(); @@ -262,7 +262,7 @@ fn disk_point_read(c: &mut Criterion) { let folder = tempdir().unwrap(); let tree = Config::new(folder) - .block_size(1_024) + .data_block_size(1_024) .block_cache(Arc::new(BlockCache::with_capacity_bytes(0))) .open() .unwrap(); @@ -300,7 +300,7 @@ fn disjoint_tree_minmax(c: &mut Criterion) { let folder = tempfile::tempdir().unwrap(); let tree = Config::new(folder) - .block_size(1_024) + .data_block_size(1_024) .block_cache(Arc::new(BlockCache::with_capacity_bytes(0))) .open() .unwrap(); From 6b34e135631bd15af668634a814c5b17bac692a3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 2 Nov 2024 19:43:13 +0100 Subject: [PATCH 07/90] refactor: bench --- benches/tree.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/benches/tree.rs b/benches/tree.rs index 3cef9588..e3cb96a0 100644 --- a/benches/tree.rs +++ b/benches/tree.rs @@ -149,7 +149,7 @@ fn scan_vs_prefix(c: &mut Criterion) { group.bench_function(format!("scan {} (uncached)", size), |b| { b.iter(|| { let iter = tree.iter(); - let iter = iter.into_iter().filter(|x| match x { + let iter = iter.filter(|x| match x { Ok((key, _)) => key.starts_with(prefix.as_bytes()), Err(_) => false, }); @@ -159,14 +159,12 @@ fn scan_vs_prefix(c: &mut Criterion) { group.bench_function(format!("prefix {} (uncached)", size), |b| { b.iter(|| { let iter = tree.prefix(prefix); - let iter = iter.into_iter(); assert_eq!(iter.count(), 10); }); }); group.bench_function(format!("prefix rev {} (uncached)", size), |b| { b.iter(|| { let iter = tree.prefix(prefix); - let iter = iter.into_iter(); assert_eq!(iter.rev().count(), 10); }); }); From 58635c7ba45d8fed067b6f35cf1c023f1cbd3fef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 2 Nov 2024 19:43:17 +0100 Subject: [PATCH 08/90] add test case --- tests/segment_range.rs | 50 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/segment_range.rs diff --git a/tests/segment_range.rs b/tests/segment_range.rs new file mode 100644 index 00000000..6acbba00 --- /dev/null +++ b/tests/segment_range.rs @@ -0,0 +1,50 @@ +use lsm_tree::{AbstractTree, Config}; +use test_log::test; + +const ITEM_COUNT: usize = 1_000_000; + +#[test] +fn segment_ranges() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?.into_path(); + + let tree = Config::new(folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; + + for x in 0..ITEM_COUNT as u64 { + let key = x.to_be_bytes(); + let value = nanoid::nanoid!(); + tree.insert(key, value.as_bytes(), 0); + } + tree.flush_active_memtable(0)?; + + let iter = tree.range(1_000u64.to_be_bytes()..11_000u64.to_be_bytes()); + assert_eq!(10_000, iter.count()); + + let iter = tree.range(1_000u64.to_be_bytes()..11_000u64.to_be_bytes()); + assert_eq!(10_000, iter.rev().count()); + + let mut iter = tree.range(1_000u64.to_be_bytes()..11_000u64.to_be_bytes()); + let mut count = 0; + + for x in 0.. { + if x % 2 == 0 { + let Some(_) = iter.next() else { + break; + }; + + count += 1; + } else { + let Some(_) = iter.next_back() else { + break; + }; + + count += 1; + } + } + + assert_eq!(10_000, count); + + Ok(()) +} From 3f1482f577f81d65a990eda6cf47d41944f9e645 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 2 Nov 2024 19:43:33 +0100 Subject: [PATCH 09/90] closes #62 --- src/segment/range.rs | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/segment/range.rs b/src/segment/range.rs index af2c8378..1a5169cf 100644 --- a/src/segment/range.rs +++ b/src/segment/range.rs @@ -19,7 +19,8 @@ use std::sync::Arc; pub struct Range { block_index: Arc, - is_initialized: bool, + lo_initialized: bool, + hi_initialized: bool, pub(crate) range: (Bound, Bound), @@ -45,7 +46,8 @@ impl Range { ); Self { - is_initialized: false, + lo_initialized: false, + hi_initialized: false, block_index, @@ -75,9 +77,13 @@ impl Range { Some(start) } }; + if let Some(key) = start_key.cloned() { self.reader.set_lower_bound(key); } + + self.lo_initialized = true; + Ok(()) } @@ -107,19 +113,8 @@ impl Range { if let Some(key) = end_key.cloned() { self.reader.set_upper_bound(key); } - Ok(()) - } - fn initialize(&mut self) -> crate::Result<()> { - // TODO: can we skip searching for lower bound until next is called at least once...? - // would make short ranges 1.5-2x faster (if cache miss) if only one direction is used - self.initialize_lo_bound()?; - - // TODO: can we skip searching for upper bound until next_back is called at least once...? - // would make short ranges 1.5-2x faster (if cache miss) if only one direction is used - self.initialize_hi_bound()?; - - self.is_initialized = true; + self.hi_initialized = true; Ok(()) } @@ -129,8 +124,8 @@ impl Iterator for Range { type Item = crate::Result; fn next(&mut self) -> Option { - if !self.is_initialized { - if let Err(e) = self.initialize() { + if !self.lo_initialized { + if let Err(e) = self.initialize_lo_bound() { return Some(Err(e)); }; } @@ -182,16 +177,14 @@ impl Iterator for Range { impl DoubleEndedIterator for Range { fn next_back(&mut self) -> Option { - if !self.is_initialized { - if let Err(e) = self.initialize() { + if !self.hi_initialized { + if let Err(e) = self.initialize_hi_bound() { return Some(Err(e)); }; } loop { - let entry_result = self.reader.next_back()?; - - match entry_result { + match self.reader.next_back()? { Ok(entry) => { match self.range.start_bound() { Bound::Included(start) => { From 124b9ff2126c309f685dfd058c6c42e0e47edc66 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 6 Nov 2024 21:38:41 +0100 Subject: [PATCH 10/90] change size tiered base size to 64M --- src/compaction/tiered.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 74f5bdd2..367c2548 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -43,7 +43,7 @@ impl Strategy { impl Default for Strategy { fn default() -> Self { Self { - base_size: 16 * 1_024 * 1_024, + base_size: 64 * 1_024 * 1_024, level_ratio: 4, } } From 224558c06ad18f88145a585e00f36a096de42020 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 10 Nov 2024 17:24:45 +0100 Subject: [PATCH 11/90] perf: make Memtable::get_highest_seqno O(1) --- benches/memtable.rs | 42 +++++++++++++++++++++++++++++++----------- src/memtable/mod.rs | 36 +++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/benches/memtable.rs b/benches/memtable.rs index 41e3ad6c..fedf2d14 100644 --- a/benches/memtable.rs +++ b/benches/memtable.rs @@ -3,22 +3,42 @@ use lsm_tree::{InternalValue, Memtable}; use nanoid::nanoid; fn memtable_get_upper_bound(c: &mut Criterion) { - let memtable = Memtable::default(); + c.bench_function("memtable get", |b| { + let memtable = Memtable::default(); - for _ in 0..1_000_000 { - memtable.insert(InternalValue::from_components( - format!("abc_{}", nanoid!()).as_bytes(), - vec![], - 0, - lsm_tree::ValueType::Value, - )); - } + for _ in 0..1_000_000 { + memtable.insert(InternalValue::from_components( + format!("abc_{}", nanoid!()).as_bytes(), + vec![], + 0, + lsm_tree::ValueType::Value, + )); + } - c.bench_function("memtable get", |b| { b.iter(|| { memtable.get("abc", None); }); }); } -criterion_group!(benches, memtable_get_upper_bound); + +fn memtable_highest_seqno(c: &mut Criterion) { + c.bench_function("memtable highest seqno", |b| { + let memtable = Memtable::default(); + + for x in 0..100_000 { + memtable.insert(InternalValue::from_components( + format!("abc_{}", nanoid!()).as_bytes(), + vec![], + x, + lsm_tree::ValueType::Value, + )); + } + + b.iter(|| { + assert_eq!(Some(99_999), memtable.get_highest_seqno()); + }); + }); +} + +criterion_group!(benches, memtable_get_upper_bound, memtable_highest_seqno); criterion_main!(benches); diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index d755cc77..57bfdfca 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -8,7 +8,7 @@ use crate::segment::block::ItemSize; use crate::value::{InternalValue, SeqNo, UserValue, ValueType}; use crossbeam_skiplist::SkipMap; use std::ops::RangeBounds; -use std::sync::atomic::AtomicU32; +use std::sync::atomic::{AtomicU32, AtomicU64}; struct DoubleEndedWrapper(I); @@ -32,16 +32,26 @@ where } } -/// The memtable serves as an intermediary storage for new items +/// The memtable serves as an intermediary, ephemeral, sorted storage for new items +/// +/// When the Memtable exceeds some size, it should be flushed to a disk segment. +/// +/// For durability, the Memtable is backed by a write-ahead log. #[derive(Default)] pub struct Memtable { + /// The actual content, stored in a lock-free skiplist. #[doc(hidden)] pub items: SkipMap, - /// Approximate active memtable size + /// Approximate active memtable size. /// - /// If this grows too large, a flush is triggered + /// If this grows too large, a flush is triggered. pub(crate) approximate_size: AtomicU32, + + /// Highest encountered sequence number. + /// + /// This is used so that `get_highest_seqno` has O(1) complexity. + pub(crate) highest_seqno: AtomicU64, } impl Memtable { @@ -167,18 +177,22 @@ impl Memtable { let key = InternalKey::new(item.key.user_key, item.key.seqno, item.key.value_type); self.items.insert(key, item.value); + self.highest_seqno + .fetch_max(item.key.seqno, std::sync::atomic::Ordering::AcqRel); + (item_size, size_before + item_size) } /// Returns the highest sequence number in the memtable. pub fn get_highest_seqno(&self) -> Option { - self.items - .iter() - .map(|x| { - let key = x.key(); - key.seqno - }) - .max() + if self.is_empty() { + None + } else { + Some( + self.highest_seqno + .load(std::sync::atomic::Ordering::Acquire), + ) + } } } From b1d141919a59f19d99b7c8252d5086dfd3799369 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 10 Nov 2024 17:24:53 +0100 Subject: [PATCH 12/90] add test cases --- tests/segment_range.rs | 80 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/segment_range.rs b/tests/segment_range.rs index 6acbba00..3fa6b3ff 100644 --- a/tests/segment_range.rs +++ b/tests/segment_range.rs @@ -48,3 +48,83 @@ fn segment_ranges() -> lsm_tree::Result<()> { Ok(()) } + +#[test] +fn segment_range_last_back() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?.into_path(); + + let tree = Config::new(folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; + + let value = (0..2_000).map(|_| 0).collect::>(); + + for x in 0..10_u64 { + let key = x.to_be_bytes(); + tree.insert(key, &value, 0); + } + tree.flush_active_memtable(0)?; + + let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes()); + assert_eq!(10, iter.count()); + + let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes()); + assert_eq!(10, iter.rev().count()); + + let mut iter = tree.range(0u64.to_be_bytes()..5u64.to_be_bytes()); + + assert_eq!(0u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(1u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(2u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(3u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(4u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert!(iter.next_back().is_none()); + + Ok(()) +} + +#[test] +fn segment_range_last_back_2() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?.into_path(); + + let tree = Config::new(folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; + + let value = (0..2_000).map(|_| 0).collect::>(); + + for x in 0..10_u64 { + let key = x.to_be_bytes(); + tree.insert(key, &value, 0); + } + tree.insert(10u64.to_be_bytes(), [], 0); + tree.insert(11u64.to_be_bytes(), [], 0); + tree.flush_active_memtable(0)?; + + let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes()); + assert_eq!(10, iter.count()); + + let iter = tree.range(0u64.to_be_bytes()..10u64.to_be_bytes()); + assert_eq!(10, iter.rev().count()); + + let mut iter = tree.range(0u64.to_be_bytes()..12u64.to_be_bytes()); + + assert_eq!(0u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(1u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(2u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(3u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(4u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(5u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(6u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(7u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(8u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(9u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(10u64.to_be_bytes(), &*iter.next().unwrap().unwrap().0); + assert_eq!(11u64.to_be_bytes(), &*iter.next_back().unwrap().unwrap().0); + assert!(iter.next().is_none()); + assert!(iter.next_back().is_none()); + + Ok(()) +} From dd19d96d6bbcfdf2a033db5981bfe602c59099c4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 20 Nov 2024 19:13:14 +0100 Subject: [PATCH 13/90] remove unused memtable clear method --- src/memtable/mod.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index ddd7b06d..653eb4d2 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -30,13 +30,6 @@ pub struct Memtable { } impl Memtable { - /// Clears the memtable. - pub fn clear(&mut self) { - self.items.clear(); - self.approximate_size - .store(0, std::sync::atomic::Ordering::Release); - } - /// Creates an iterator over all items. pub fn iter(&self) -> impl DoubleEndedIterator + '_ { self.items.iter().map(|entry| InternalValue { From 5cf470d860b5a72d63827073a0152a4ae6e8879b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 20 Nov 2024 19:14:04 +0100 Subject: [PATCH 14/90] Revert "remove unused memtable clear method" This reverts commit dd19d96d6bbcfdf2a033db5981bfe602c59099c4. --- src/memtable/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index 653eb4d2..ddd7b06d 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -30,6 +30,13 @@ pub struct Memtable { } impl Memtable { + /// Clears the memtable. + pub fn clear(&mut self) { + self.items.clear(); + self.approximate_size + .store(0, std::sync::atomic::Ordering::Release); + } + /// Creates an iterator over all items. pub fn iter(&self) -> impl DoubleEndedIterator + '_ { self.items.iter().map(|entry| InternalValue { From f5e2ccb578d0bb670208dc12ea4c7c809d25c3e4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 20 Nov 2024 19:14:52 +0100 Subject: [PATCH 15/90] fix: Memtable::clear --- src/memtable/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index ddd7b06d..b0abb51a 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -33,6 +33,7 @@ impl Memtable { /// Clears the memtable. pub fn clear(&mut self) { self.items.clear(); + self.highest_seqno = AtomicU64::new(0); self.approximate_size .store(0, std::sync::atomic::Ordering::Release); } From 50badf70799678e17ba5cf923061a5d29ae8ea37 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 22 Nov 2024 14:44:18 +0100 Subject: [PATCH 16/90] fix: range upper bound --- src/segment/range.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/segment/range.rs b/src/segment/range.rs index 27b524d6..0da45f6e 100644 --- a/src/segment/range.rs +++ b/src/segment/range.rs @@ -103,6 +103,9 @@ impl Range { .get_last_block_containing_key(end, CachePolicy::Write)? { self.reader.hi_block_offset = Some(upper_bound); + } else { + self.reader.hi_block_offset = + Some(self.block_index.get_last_block_handle(CachePolicy::Write)?); } Some(end) From 333964687970055e08a5185e2e2c2671f430ff99 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 22 Nov 2024 14:51:56 +0100 Subject: [PATCH 17/90] fix: workaround for 1.74 rust --- src/segment/block_index/full_index.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs index 3b1cab4c..08bc5d8d 100644 --- a/src/segment/block_index/full_index.rs +++ b/src/segment/block_index/full_index.rs @@ -31,7 +31,8 @@ impl FullBlockIndex { for _ in 0..cnt { let idx_block = IndexBlock::from_reader(&mut file)?.items; - block_handles.extend(idx_block); + // TODO: 1.80? IntoIter impl for Box<[T]> + block_handles.extend(idx_block.into_vec()); } debug_assert!(!block_handles.is_empty()); From f8082941d5730e1bcbce0b10edf281a5d9848db8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 22 Nov 2024 15:00:46 +0100 Subject: [PATCH 18/90] fix(full block index): can't debug assert len --- src/segment/block_index/full_index.rs | 1 - src/segment/block_index/mod.rs | 5 +---- src/segment/writer/mod.rs | 2 -- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs index 08bc5d8d..0b5060f5 100644 --- a/src/segment/block_index/full_index.rs +++ b/src/segment/block_index/full_index.rs @@ -36,7 +36,6 @@ impl FullBlockIndex { } debug_assert!(!block_handles.is_empty()); - debug_assert_eq!(cnt, block_handles.len()); Ok(Self(block_handles.into_boxed_slice())) } diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 7fd369ea..7fcbd35c 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -111,10 +111,7 @@ pub enum BlockIndexImpl { #[allow(clippy::expect_used)] mod tests { use super::*; - use crate::{ - segment::{block_index::BlockIndex, value_block::BlockOffset}, - Slice, - }; + use crate::{segment::value_block::BlockOffset, Slice}; use test_log::test; fn bh>(end_key: K, offset: BlockOffset) -> KeyedBlockHandle { diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 238af5db..5afa3f1e 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -457,8 +457,6 @@ mod tests { // the TLI length fits into u32 as well #[allow(clippy::cast_possible_truncation)] { - use crate::segment::block_index::BlockIndex; - let tli = TopLevelIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; From d2cedba663c8cdfc1589c5d688df7d3ea98ccc0b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 23 Nov 2024 02:00:47 +0100 Subject: [PATCH 19/90] reimplement segment verify --- src/segment/mod.rs | 159 +++++++++++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 56 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 12456d96..09287232 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -88,58 +88,98 @@ impl Segment { let mut file = guard.file.lock().expect("lock is poisoned"); - todo!(); - /* // NOTE: TODO: because of 1.74.0 - #[allow(clippy::explicit_iter_loop)] - for handle in self.block_index.top_level_index.iter() { - let block = match IndexBlock::from_file(&mut *file, handle.offset) { - Ok(v) => v, - Err(e) => { - log::error!( - "index block {handle:?} could not be loaded, it is probably corrupted: {e:?}" - ); - broken_count += 1; - continue; - } - }; - - for handle in &*block.items { - let value_block = match ValueBlock::from_file(&mut *file, handle.offset) { - Ok(v) => v, - Err(e) => { - log::error!( - "data block {handle:?} could not be loaded, it is probably corrupted: {e:?}" - ); + // TODO: maybe move to BlockIndexImpl::verify + match &*self.block_index { + BlockIndexImpl::Full(block_index) => { + for handle in block_index.iter() { + let value_block = match ValueBlock::from_file(&mut *file, handle.offset) { + Ok(v) => v, + Err(e) => { + log::error!( + "data block {handle:?} could not be loaded, it is probably corrupted: {e:?}" + ); + broken_count += 1; + data_block_count += 1; + continue; + } + }; + + let (_, data) = ValueBlock::to_bytes_compressed( + &value_block.items, + value_block.header.previous_block_offset, + value_block.header.compression, + )?; + let actual_checksum = Checksum::from_bytes(&data); + + if value_block.header.checksum != actual_checksum { + log::error!("{handle:?} is corrupted, invalid checksum value"); broken_count += 1; - data_block_count += 1; - continue; } - }; - - let (_, data) = ValueBlock::to_bytes_compressed( - &value_block.items, - value_block.header.previous_block_offset, - value_block.header.compression, - )?; - let actual_checksum = Checksum::from_bytes(&data); - - if value_block.header.checksum != actual_checksum { - log::error!("{handle:?} is corrupted, invalid checksum value"); - broken_count += 1; + + data_block_count += 1; + + if data_block_count % 1_000 == 0 { + log::debug!("Checked {data_block_count} data blocks"); + } } + } + BlockIndexImpl::TwoLevel(block_index) => { + // NOTE: TODO: because of 1.74.0 + #[allow(clippy::explicit_iter_loop)] + for handle in block_index.top_level_index.iter() { + let block = match IndexBlock::from_file(&mut *file, handle.offset) { + Ok(v) => v, + Err(e) => { + log::error!( + "index block {handle:?} could not be loaded, it is probably corrupted: {e:?}" + ); + broken_count += 1; + continue; + } + }; + + for handle in &*block.items { + let value_block = match ValueBlock::from_file(&mut *file, handle.offset) { + Ok(v) => v, + Err(e) => { + log::error!( + "data block {handle:?} could not be loaded, it is probably corrupted: {e:?}" + ); + broken_count += 1; + data_block_count += 1; + continue; + } + }; + + let (_, data) = ValueBlock::to_bytes_compressed( + &value_block.items, + value_block.header.previous_block_offset, + value_block.header.compression, + )?; + let actual_checksum = Checksum::from_bytes(&data); + + if value_block.header.checksum != actual_checksum { + log::error!("{handle:?} is corrupted, invalid checksum value"); + broken_count += 1; + } - data_block_count += 1; + data_block_count += 1; - if data_block_count % 1_000 == 0 { - log::debug!("Checked {data_block_count} data blocks"); + if data_block_count % 1_000 == 0 { + log::debug!("Checked {data_block_count} data blocks"); + } + } } } - } */ + } - assert_eq!( - data_block_count, self.metadata.data_block_count, - "not all data blocks were visited" - ); + if data_block_count != self.metadata.data_block_count { + log::error!( + "Not all data blocks were visited during verification of disk segment {:?}", + self.metadata.id + ); + broken_count += 1; + } Ok(broken_count) } @@ -164,16 +204,15 @@ impl Segment { }) } - // TODO: need to give recovery a flag to choose which block index to load - /// Tries to recover a segment from a file. pub(crate) fn recover>( file_path: P, tree_id: TreeId, block_cache: Arc, descriptor_table: Arc, + use_full_block_index: bool, ) -> crate::Result { - use block_index::two_level_index::TwoLevelBlockIndex; + use block_index::{full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex}; use trailer::SegmentFileTrailer; let file_path = file_path.as_ref(); @@ -190,15 +229,23 @@ impl Segment { "Creating block index, with tli_ptr={}", trailer.offsets.tli_ptr ); - let block_index = TwoLevelBlockIndex::from_file( - file_path, - &trailer.metadata, - &trailer.offsets, - (tree_id, trailer.metadata.id).into(), - descriptor_table.clone(), - block_cache.clone(), - )?; - let block_index = BlockIndexImpl::TwoLevel(block_index); + + let block_index = if use_full_block_index { + let block_index = + FullBlockIndex::from_file(file_path, &trailer.metadata, &trailer.offsets)?; + + BlockIndexImpl::Full(block_index) + } else { + let block_index = TwoLevelBlockIndex::from_file( + file_path, + &trailer.metadata, + &trailer.offsets, + (tree_id, trailer.metadata.id).into(), + descriptor_table.clone(), + block_cache.clone(), + )?; + BlockIndexImpl::TwoLevel(block_index) + }; #[cfg(feature = "bloom")] let bloom_ptr = trailer.offsets.bloom_ptr; From 5de0290b17e60a6f1646ceafaefebbacfff73024 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 23 Nov 2024 02:01:43 +0100 Subject: [PATCH 20/90] recover L0/L1 with full block index --- src/level_manifest/mod.rs | 23 ++++++++++++++++++----- src/tree/mod.rs | 14 +++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 30395ac1..248f27b9 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -175,11 +175,24 @@ impl LevelManifest { Ok(levels) } - pub(crate) fn recover_ids>(path: P) -> crate::Result> { - Ok(Self::load_level_manifest(path)? - .into_iter() - .flatten() - .collect()) + pub(crate) fn recover_ids>( + path: P, + ) -> crate::Result> { + let manifest = Self::load_level_manifest(path)?; + let mut result = crate::HashMap::default(); + + for (level_idx, segment_ids) in manifest.into_iter().enumerate() { + for segment_id in segment_ids { + result.insert( + segment_id, + level_idx + .try_into() + .expect("there are less than 256 levels"), + ); + } + } + + Ok(result) } fn resolve_levels( diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 8d120258..05a95efa 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -914,8 +914,8 @@ impl Tree { let level_manifest_path = tree_path.join(LEVELS_MANIFEST_FILE); - let segment_ids_to_recover = LevelManifest::recover_ids(&level_manifest_path)?; - let cnt = segment_ids_to_recover.len(); + let segment_id_map = LevelManifest::recover_ids(&level_manifest_path)?; + let cnt = segment_id_map.len(); log::debug!("Recovering {cnt} disk segments from {tree_path:?}"); @@ -960,12 +960,13 @@ impl Tree { crate::Error::Unrecoverable })?; - if segment_ids_to_recover.contains(&segment_id) { + if let Some(&level_idx) = segment_id_map.get(&segment_id) { let segment = Segment::recover( &segment_file_path, tree_id, block_cache.clone(), descriptor_table.clone(), + level_idx == 0 || level_idx == 1, )?; descriptor_table.insert(&segment_file_path, (tree_id, segment.metadata.id).into()); @@ -982,8 +983,11 @@ impl Tree { } } - if segments.len() < segment_ids_to_recover.len() { - log::error!("Recovered less segments than expected: {segment_ids_to_recover:?}"); + if segments.len() < cnt { + log::error!( + "Recovered less segments than expected: {:?}", + segment_id_map.keys(), + ); return Err(crate::Error::Unrecoverable); } From 4be47cc1a8bf2409c7dc4b8241b97d27af46ee41 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 23 Nov 2024 02:02:11 +0100 Subject: [PATCH 21/90] stage missing file --- src/segment/block_index/full_index.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs index 0b5060f5..683dc269 100644 --- a/src/segment/block_index/full_index.rs +++ b/src/segment/block_index/full_index.rs @@ -10,6 +10,14 @@ use std::{fs::File, io::Seek, path::Path}; /// The index is fully loaded into memory. pub struct FullBlockIndex(Box<[KeyedBlockHandle]>); +impl std::ops::Deref for FullBlockIndex { + type Target = Box<[KeyedBlockHandle]>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + impl FullBlockIndex { pub fn from_file>( path: P, From 9671f3d92588c81179ad1a6be88cc5aaa91f132e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 23 Nov 2024 02:13:58 +0100 Subject: [PATCH 22/90] refactor --- src/segment/block_index/top_level.rs | 11 +++++++---- src/segment/block_index/two_level_index.rs | 2 +- src/segment/writer/mod.rs | 7 +++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/segment/block_index/top_level.rs b/src/segment/block_index/top_level.rs index 15ad1606..7ba10bea 100644 --- a/src/segment/block_index/top_level.rs +++ b/src/segment/block_index/top_level.rs @@ -3,7 +3,10 @@ // (found in the LICENSE-* files in the repository) use super::{block_handle::KeyedBlockHandle, RawBlockIndex}; -use crate::segment::{block_index::IndexBlock, value_block::CachePolicy}; +use crate::segment::{ + block_index::IndexBlock, + value_block::{BlockOffset, CachePolicy}, +}; use std::{fs::File, path::Path}; /// The block index stores references to the positions of blocks on a file and their size @@ -32,14 +35,14 @@ impl TopLevelIndex { pub fn from_file>( path: P, _: &crate::segment::meta::Metadata, - offsets: &crate::segment::file_offsets::FileOffsets, + tli_ptr: BlockOffset, ) -> crate::Result { let path = path.as_ref(); - log::trace!("reading TLI from {path:?} at tli_ptr={}", offsets.tli_ptr); + log::trace!("reading TLI from {path:?} at tli_ptr={tli_ptr}"); let mut file = File::open(path)?; - let items = IndexBlock::from_file(&mut file, offsets.tli_ptr)?.items; + let items = IndexBlock::from_file(&mut file, tli_ptr)?.items; log::trace!("loaded TLI ({path:?}): {items:?}"); debug_assert!(!items.is_empty()); diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs index 87ed3e3d..165ff9da 100644 --- a/src/segment/block_index/two_level_index.rs +++ b/src/segment/block_index/two_level_index.rs @@ -221,7 +221,7 @@ impl TwoLevelBlockIndex { let file_path = path.as_ref(); log::trace!("Reading block index from {file_path:?}"); - let top_level_index = TopLevelIndex::from_file(file_path, metadata, offsets)?; + let top_level_index = TopLevelIndex::from_file(file_path, metadata, offsets.tli_ptr)?; Ok(Self { descriptor_table, diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 5afa3f1e..e7181cf3 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -457,8 +457,11 @@ mod tests { // the TLI length fits into u32 as well #[allow(clippy::cast_possible_truncation)] { - let tli = - TopLevelIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; + let tli = TopLevelIndex::from_file( + &segment_file_path, + &trailer.metadata, + trailer.offsets.tli_ptr, + )?; assert_eq!(tli.len() as u32, trailer.metadata.index_block_count); } From fa766d07b7b7ed7d824391cc2a4a5635b7e9d0ff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 23 Nov 2024 02:21:30 +0100 Subject: [PATCH 23/90] closes #51 From 9674bd055424a17ed5d22a9d48717cb0ba59ae10 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 15:11:45 +0100 Subject: [PATCH 24/90] wip --- src/compaction/worker.rs | 2 +- src/segment/block_index/two_level_index.rs | 6 +++--- src/segment/mod.rs | 2 +- src/segment/range.rs | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index fef8a83c..31da3649 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -252,7 +252,7 @@ fn merge_segments( let block_index = TwoLevelBlockIndex::from_file( &segment_file_path, &trailer.metadata, - &trailer.offsets, + trailer.offsets.tli_ptr, (opts.tree_id, segment_id).into(), opts.config.descriptor_table.clone(), opts.config.block_cache.clone(), diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs index 165ff9da..a28c685a 100644 --- a/src/segment/block_index/two_level_index.rs +++ b/src/segment/block_index/two_level_index.rs @@ -10,7 +10,7 @@ use super::{ use crate::{ block_cache::BlockCache, descriptor_table::FileDescriptorTable, - segment::{file_offsets::FileOffsets, meta::Metadata, value_block::BlockOffset}, + segment::{meta::Metadata, value_block::BlockOffset}, }; use std::{path::Path, sync::Arc}; @@ -213,7 +213,7 @@ impl TwoLevelBlockIndex { pub fn from_file>( path: P, metadata: &Metadata, - offsets: &FileOffsets, + tli_ptr: BlockOffset, segment_id: GlobalSegmentId, descriptor_table: Arc, block_cache: Arc, @@ -221,7 +221,7 @@ impl TwoLevelBlockIndex { let file_path = path.as_ref(); log::trace!("Reading block index from {file_path:?}"); - let top_level_index = TopLevelIndex::from_file(file_path, metadata, offsets.tli_ptr)?; + let top_level_index = TopLevelIndex::from_file(file_path, metadata, tli_ptr)?; Ok(Self { descriptor_table, diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 09287232..bd820ba5 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -239,7 +239,7 @@ impl Segment { let block_index = TwoLevelBlockIndex::from_file( file_path, &trailer.metadata, - &trailer.offsets, + trailer.offsets.tli_ptr, (tree_id, trailer.metadata.id).into(), descriptor_table.clone(), block_cache.clone(), diff --git a/src/segment/range.rs b/src/segment/range.rs index 0da45f6e..2c0af968 100644 --- a/src/segment/range.rs +++ b/src/segment/range.rs @@ -289,7 +289,7 @@ mod tests { let block_index = TwoLevelBlockIndex::from_file( segment_file_path, &trailer.metadata, - &trailer.offsets, + trailer.offsets.tli_ptr, (0, 0).into(), table.clone(), block_cache.clone(), @@ -389,7 +389,7 @@ mod tests { let block_index = TwoLevelBlockIndex::from_file( segment_file_path, &trailer.metadata, - &trailer.offsets, + trailer.offsets.tli_ptr, (0, 0).into(), table.clone(), block_cache.clone(), @@ -590,7 +590,7 @@ mod tests { let block_index = TwoLevelBlockIndex::from_file( segment_file_path, &trailer.metadata, - &trailer.offsets, + trailer.offsets.tli_ptr, (0, 0).into(), table.clone(), block_cache.clone(), @@ -694,7 +694,7 @@ mod tests { let block_index = TwoLevelBlockIndex::from_file( segment_file_path, &trailer.metadata, - &trailer.offsets, + trailer.offsets.tli_ptr, (0, 0).into(), table.clone(), block_cache.clone(), From f04b7a8e2f78e6b81205f34e48582aae38243172 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 15:14:14 +0100 Subject: [PATCH 25/90] rename --- src/segment/block_index/full_index.rs | 6 +++--- src/segment/block_index/mod.rs | 4 ++-- src/segment/block_index/top_level.rs | 2 +- src/segment/block_index/two_level_index.rs | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs index 683dc269..6b9f69cb 100644 --- a/src/segment/block_index/full_index.rs +++ b/src/segment/block_index/full_index.rs @@ -55,7 +55,7 @@ impl BlockIndex for FullBlockIndex { key: &[u8], _: CachePolicy, ) -> crate::Result> { - use super::RawBlockIndex; + use super::KeyedBlockIndex; self.0 .get_lowest_block_containing_key(key, CachePolicy::Read) @@ -68,7 +68,7 @@ impl BlockIndex for FullBlockIndex { key: &[u8], cache_policy: CachePolicy, ) -> crate::Result> { - use super::RawBlockIndex; + use super::KeyedBlockIndex; self.0 .get_last_block_containing_key(key, cache_policy) @@ -76,7 +76,7 @@ impl BlockIndex for FullBlockIndex { } fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result { - use super::RawBlockIndex; + use super::KeyedBlockIndex; self.0 .get_last_block_handle(CachePolicy::Read) diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 7fcbd35c..67fcea27 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -18,7 +18,7 @@ use two_level_index::TwoLevelBlockIndex; pub type IndexBlock = Block; -impl RawBlockIndex for [KeyedBlockHandle] { +impl KeyedBlockIndex for [KeyedBlockHandle] { fn get_lowest_block_containing_key( &self, key: &[u8], @@ -81,7 +81,7 @@ pub trait BlockIndex { } #[allow(clippy::module_name_repetitions)] -pub trait RawBlockIndex { +pub trait KeyedBlockIndex { /// Gets the lowest block handle that may contain the given item fn get_lowest_block_containing_key( &self, diff --git a/src/segment/block_index/top_level.rs b/src/segment/block_index/top_level.rs index 7ba10bea..8a490d60 100644 --- a/src/segment/block_index/top_level.rs +++ b/src/segment/block_index/top_level.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{block_handle::KeyedBlockHandle, RawBlockIndex}; +use super::{block_handle::KeyedBlockHandle, KeyedBlockIndex}; use crate::segment::{ block_index::IndexBlock, value_block::{BlockOffset, CachePolicy}, diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs index a28c685a..a246c016 100644 --- a/src/segment/block_index/two_level_index.rs +++ b/src/segment/block_index/two_level_index.rs @@ -95,7 +95,7 @@ impl TwoLevelBlockIndex { let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; Ok({ - use super::RawBlockIndex; + use super::KeyedBlockIndex; index_block .items @@ -122,7 +122,7 @@ impl TwoLevelBlockIndex { let index_block = self.load_index_block(index_block_handle.offset, cache_policy)?; Ok({ - use super::RawBlockIndex; + use super::KeyedBlockIndex; index_block .items From 489ffd4890aa303546a1491401e96572c3e49f53 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 15:21:18 +0100 Subject: [PATCH 26/90] refactor --- src/segment/block_index/top_level.rs | 8 +++++--- src/segment/block_index/two_level_index.rs | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/segment/block_index/top_level.rs b/src/segment/block_index/top_level.rs index 8a490d60..c37bbc3c 100644 --- a/src/segment/block_index/top_level.rs +++ b/src/segment/block_index/top_level.rs @@ -69,8 +69,10 @@ impl TopLevelIndex { pub fn iter(&self) -> impl Iterator { self.0.iter() } +} - pub fn get_lowest_block_containing_key( +impl KeyedBlockIndex for TopLevelIndex { + fn get_lowest_block_containing_key( &self, key: &[u8], _: CachePolicy, @@ -80,7 +82,7 @@ impl TopLevelIndex { } /// Gets the last block handle that may contain the given item - pub fn get_last_block_containing_key( + fn get_last_block_containing_key( &self, key: &[u8], cache_policy: CachePolicy, @@ -88,7 +90,7 @@ impl TopLevelIndex { self.0.get_last_block_containing_key(key, cache_policy) } - pub fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<&KeyedBlockHandle> { + fn get_last_block_handle(&self, _: CachePolicy) -> crate::Result<&KeyedBlockHandle> { self.0.get_last_block_handle(CachePolicy::Read) } } diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs index a246c016..eaa0bc1e 100644 --- a/src/segment/block_index/two_level_index.rs +++ b/src/segment/block_index/two_level_index.rs @@ -84,6 +84,8 @@ impl TwoLevelBlockIndex { key: &[u8], cache_policy: CachePolicy, ) -> crate::Result> { + use super::KeyedBlockIndex; + let Some(index_block_handle) = self .top_level_index .get_lowest_block_containing_key(key, cache_policy) @@ -111,6 +113,8 @@ impl TwoLevelBlockIndex { key: &[u8], cache_policy: CachePolicy, ) -> crate::Result> { + use super::KeyedBlockIndex; + let Some(index_block_handle) = self .top_level_index .get_last_block_containing_key(key, cache_policy) @@ -136,6 +140,8 @@ impl TwoLevelBlockIndex { &self, cache_policy: CachePolicy, ) -> crate::Result { + use super::KeyedBlockIndex; + let index_block_handle = self .top_level_index .get_last_block_handle(cache_policy) From 8d96736f0f34778230d35eca97a352d2a71e7af5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 18:28:38 +0100 Subject: [PATCH 27/90] fix: leveled compaction --- benches/tli.rs | 2 +- src/compaction/leveled.rs | 364 +++++++++++------------------------- src/compaction/worker.rs | 2 +- src/key_range.rs | 21 ++- src/level_manifest/level.rs | 23 ++- src/level_manifest/mod.rs | 42 ++++- src/tree/mod.rs | 4 +- 7 files changed, 185 insertions(+), 273 deletions(-) diff --git a/benches/tli.rs b/benches/tli.rs index d81c2715..8afc445d 100644 --- a/benches/tli.rs +++ b/benches/tli.rs @@ -1,6 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; use lsm_tree::segment::{ - block_index::BlockIndex, value_block::BlockOffset, value_block::CachePolicy, + block_index::KeyedBlockIndex, value_block::BlockOffset, value_block::CachePolicy, }; fn tli_find_item(c: &mut Criterion) { diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 32b5cec1..0ed01b59 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -11,6 +11,78 @@ use crate::{ HashSet, SegmentId, }; +fn aggregate_key_range(segments: &[Segment]) -> KeyRange { + KeyRange::aggregate(segments.iter().map(|x| &x.metadata.key_range)) +} + +// TODO: Currently does not take in `overshoot` +// TODO: Need to make sure compactions are not too small +fn pick_minimal_overlap(curr_level: &Level, next_level: &Level) -> (HashSet, bool) { + // assert!(curr_level.is_disjoint, "Lx is not disjoint"); + // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); + + let mut choices = vec![]; + + for size in 1..=next_level.len() { + let windows = next_level.windows(size); + + for window in windows { + let key_range = aggregate_key_range(window); + + // Pull in all segments in current level into compaction + let curr_level_pull_in: Vec<_> = if curr_level.is_disjoint { + curr_level.contained_segments(&key_range).collect() + } else { + curr_level.overlapping_segments(&key_range).collect() + }; + + let curr_level_size = curr_level_pull_in + .iter() + .map(|x| x.metadata.file_size) + .sum::(); + + let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); + + let mut segment_ids: HashSet<_> = window.iter().map(|x| x.metadata.id).collect(); + segment_ids.extend(curr_level_pull_in.iter().map(|x| x.metadata.id)); + + let write_amp = (next_level_size as f32) / (curr_level_size as f32); + + choices.push((write_amp, segment_ids, false)); + } + } + + // NOTE: Find trivial moves + for size in (1..=curr_level.len()).rev() { + let windows = curr_level.windows(size); + + for window in windows { + let segment_ids: HashSet = window.iter().map(|x| x.metadata.id).collect(); + + let key_range = aggregate_key_range(window); + + if next_level.overlapping_segments(&key_range).next().is_none() { + choices.push((0.0, segment_ids, true)); + } + } + } + + // NOTE: Keep compactions with 25 or less segments + // to make compactions not too large + // + // TODO: ideally, if a level has a lot of compaction debt + // compactions could be parallelized as long as they don't overlap in key range + choices.retain(|(_, segments, _)| segments.len() <= 25); + + let minimum_effort_choice = choices + .into_iter() + .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); + + let (_, set, can_trivial_move) = minimum_effort_choice.expect("should exist"); + + (set, can_trivial_move) +} + /// Levelled compaction strategy (LCS) /// /// If a level reaches some threshold size, parts of it are merged into overlapping segments in the next level. @@ -25,18 +97,18 @@ use crate::{ #[derive(Clone)] pub struct Strategy { /// When the number of segments in L0 reaches this threshold, - /// they are merged into L1 + /// they are merged into L1. /// /// Default = 4 /// - /// Same as `level0_file_num_compaction_trigger` in `RocksDB` + /// Same as `level0_file_num_compaction_trigger` in `RocksDB`. pub l0_threshold: u8, - /// Target segment size (compressed) + /// The target segment size as disk (possibly compressed). /// /// Default = 64 MiB /// - /// Same as `target_file_size_base` in `RocksDB` + /// Same as `target_file_size_base` in `RocksDB`. pub target_size: u32, /// Size ratio between levels of the LSM tree (a.k.a fanout, growth rate) @@ -44,114 +116,37 @@ pub struct Strategy { /// This is the exponential growth of the from one. /// level to the next /// - /// A level target size is: max_memtable_size * level_ratio.pow(#level + 1) + /// A level target size is: max_memtable_size * level_ratio.pow(#level + 1). #[allow(clippy::doc_markdown)] pub level_ratio: u8, + + /// The target size of L1. + /// + /// Currently hard coded to 256 MiB. + /// + /// Default = 256 MiB + pub level_base_size: u32, } impl Default for Strategy { fn default() -> Self { Self { l0_threshold: 4, - target_size: 64 * 1_024 * 1_024, - level_ratio: 8, // TODO: benchmark vs 10 + target_size:/* 64 Mib */ 64 * 1_024 * 1_024, + level_ratio: 10, + level_base_size:/* 256 MiB */ 256 * 1_024 * 1_024, } } } -fn aggregate_key_range(segments: &[Segment]) -> KeyRange { - KeyRange::aggregate(segments.iter().map(|x| &x.metadata.key_range)) -} - -fn desired_level_size_in_bytes(level_idx: u8, ratio: u8, target_size: u32) -> usize { - (ratio as usize).pow(u32::from(level_idx)) * (target_size as usize) -} - -fn pick_minimal_overlap( - curr_level: &Level, - next_level: &Level, - overshoot: u64, -) -> (HashSet, bool) { - let mut choices = vec![]; +impl Strategy { + fn level_target_size(&self, level_idx: u8) -> u64 { + assert!(level_idx >= 1, "level_target_size does not apply to L0"); - for size in 1..=curr_level.len() { - let windows = curr_level.windows(size); - - for window in windows { - let size_sum = window.iter().map(|x| x.metadata.file_size).sum::(); + let power = (self.level_ratio as usize).pow(u32::from(level_idx) - 1); - if size_sum >= overshoot { - // NOTE: Consider this window - - let mut segment_ids: HashSet = - window.iter().map(|x| x.metadata.id).collect(); - - // Get overlapping segments in next level - let key_range = aggregate_key_range(window); - - let next_level_overlapping_segments: Vec<_> = next_level - .overlapping_segments(&key_range) - .cloned() - .collect(); - - // Get overlapping segments in same level - let key_range = aggregate_key_range(&next_level_overlapping_segments); - - let curr_level_overlapping_segment_ids: Vec<_> = curr_level - .overlapping_segments(&key_range) - .filter(|x| !segment_ids.contains(&x.metadata.id)) - .collect(); - - // Calculate effort - let size_next_level = next_level_overlapping_segments - .iter() - .map(|x| x.metadata.file_size) - .sum::(); - - let size_curr_level = curr_level_overlapping_segment_ids - .iter() - .map(|x| x.metadata.file_size) - .sum::(); - - let effort = size_sum + size_next_level + size_curr_level; - - segment_ids.extend( - next_level_overlapping_segments - .iter() - .map(|x| x.metadata.id), - ); - - segment_ids.extend( - curr_level_overlapping_segment_ids - .iter() - .map(|x| x.metadata.id), - ); - - // TODO: need to calculate write_amp and choose minimum write_amp instead - // - // consider the segments in La = A to be the ones in the window - // and the segments in La+1 B to be the ones that overlap - // and r = A / B - // we want to avoid compactions that have a low ratio r - // because that means we don't clear out a lot of segments in La - // but have to rewrite a lot of segments in La+1 - // - // ultimately, we want the highest ratio - // to maximize the amount of segments we are getting rid of in La - // for the least amount of effort - choices.push(( - effort, - segment_ids, - next_level_overlapping_segments.is_empty(), - )); - } - } + (power * (self.level_base_size as usize)) as u64 } - - let minimum_effort_choice = choices.into_iter().min_by(|a, b| a.0.cmp(&b.0)); - let (_, set, can_trivial_move) = minimum_effort_choice.expect("should exist"); - - (set, can_trivial_move) } impl CompactionStrategy for Strategy { @@ -173,7 +168,7 @@ impl CompactionStrategy for Strategy { .enumerate() .skip(1) .take(resolved_view.len() - 2) - .rev() + //.rev() { // NOTE: Level count is 255 max #[allow(clippy::cast_possible_truncation)] @@ -189,18 +184,22 @@ impl CompactionStrategy for Strategy { continue; } - let desired_bytes = - desired_level_size_in_bytes(curr_level_index, self.level_ratio, self.target_size); + let desired_bytes = self.level_target_size(curr_level_index); - let overshoot = level.size().saturating_sub(desired_bytes as u64); + let overshoot = level.size().saturating_sub(desired_bytes); if overshoot > 0 { let Some(next_level) = &resolved_view.get(next_level_index as usize) else { break; }; - let (segment_ids, can_trivial_move) = - pick_minimal_overlap(level, next_level, overshoot); + let (segment_ids, can_trivial_move) = pick_minimal_overlap(level, next_level); + + // eprintln!( + // "merge {} segments, L{}->L{next_level_index}: {segment_ids:?}", + // segment_ids.len(), + // next_level_index - 1, + // ); let choice = CompactionInput { segment_ids, @@ -584,166 +583,11 @@ mod tests { Ok(()) } - #[test] - fn leveled_deeper_level_with_overlap() -> crate::Result<()> { - let tempdir = tempfile::tempdir()?; - let compactor = Strategy { - target_size: 64 * 1_024 * 1_024, - level_ratio: 2, - ..Default::default() - }; - let config = Config::default(); - - #[rustfmt::skip] - let levels = build_levels(tempdir.path(), vec![ - vec![], - vec![(1, "a", "g", 64), (2, "h", "t", 64), (3, "x", "z", 64)], - vec![(4, "f", "l", 64)], - vec![], - ])?; - - assert_eq!( - compactor.choose(&levels, &config), - Choice::Merge(CompactionInput { - dest_level: 2, - segment_ids: set![3], - target_size: 64 * 1_024 * 1_024 - }) - ); - - Ok(()) - } - - #[test] - fn leveled_deeper_level_no_overlap() -> crate::Result<()> { - let tempdir = tempfile::tempdir()?; - let compactor = Strategy { - target_size: 64 * 1_024 * 1_024, - level_ratio: 2, - ..Default::default() - }; - let config = Config::default(); - - #[rustfmt::skip] - let levels = build_levels(tempdir.path(), vec![ - vec![], - vec![(1, "a", "g", 64), (2, "h", "j", 64), (3, "k", "t", 64)], - vec![(4, "k", "l", 64)], - vec![], - ])?; - - assert_eq!( - compactor.choose(&levels, &config), - // NOTE: We merge because segments are demoted into "cold" levels - // see https://github.com/fjall-rs/lsm-tree/issues/63 - Choice::Merge(CompactionInput { - dest_level: 2, - segment_ids: set![1], - target_size: 64 * 1_024 * 1_024 - }) - ); - - Ok(()) - } - - #[test] - fn leveled_last_level_with_overlap() -> crate::Result<()> { - let tempdir = tempfile::tempdir()?; - let compactor = Strategy { - target_size: 64 * 1_024 * 1_024, - level_ratio: 2, - ..Default::default() - }; - let config = Config::default(); - - #[rustfmt::skip] - let levels = build_levels(tempdir.path(), vec![ - vec![], - vec![], - vec![(1, "a", "g", 64), (2, "a", "g", 64), (3, "a", "g", 64), (4, "a", "g", 64), (5, "y", "z", 64)], - vec![(6, "f", "l", 64)], - ])?; - - assert_eq!( - compactor.choose(&levels, &config), - Choice::Merge(CompactionInput { - dest_level: 3, - // NOTE: 5 is the only segment that has no overlap with #3 - segment_ids: set![5], - target_size: 64 * 1_024 * 1_024 - }) - ); - - Ok(()) - } - - #[test] - fn levelled_last_level_with_overlap_invariant() -> crate::Result<()> { - let tempdir = tempfile::tempdir()?; - let compactor = Strategy { - target_size: 64 * 1_024 * 1_024, - level_ratio: 2, - ..Default::default() - }; - let config = Config::default(); - - #[rustfmt::skip] - let levels = build_levels(tempdir.path(), vec![ - vec![], - vec![], - vec![(1, "a", "g", 64), (2, "h", "j", 64), (3, "k", "l", 64), (4, "m", "n", 64), (5, "y", "z", 64)], - vec![(6, "f", "l", 64)], - ])?; - - assert_eq!( - compactor.choose(&levels, &config), - Choice::Move(CompactionInput { - dest_level: 3, - // NOTE: segment #4 is the left-most segment that has no overlap with L3 - segment_ids: set![4], - target_size: 64 * 1_024 * 1_024 - }) - ); - - Ok(()) - } - - #[test] - fn levelled_last_level_without_overlap_invariant() -> crate::Result<()> { - let tempdir = tempfile::tempdir()?; - let compactor = Strategy { - target_size: 64 * 1_024 * 1_024, - level_ratio: 2, - ..Default::default() - }; - let config = Config::default(); - - #[rustfmt::skip] - let levels = build_levels(tempdir.path(), vec![ - vec![], - vec![], - vec![(1, "a", "g", 64), (2, "h", "j", 64), (3, "k", "l", 64), (4, "m", "n", 64), (5, "y", "z", 64)], - vec![(6, "w", "x", 64)], - ])?; - - assert_eq!( - compactor.choose(&levels, &config), - Choice::Move(CompactionInput { - dest_level: 3, - segment_ids: set![1], - target_size: 64 * 1_024 * 1_024 - }) - ); - - Ok(()) - } - #[test] fn levelled_from_tiered() -> crate::Result<()> { let tempdir = tempfile::tempdir()?; let compactor = Strategy { target_size: 64 * 1_024 * 1_024, - level_ratio: 2, ..Default::default() }; let config = Config::default(); @@ -751,7 +595,7 @@ mod tests { #[rustfmt::skip] let levels = build_levels(tempdir.path(), vec![ vec![], - vec![(1, "a", "z", 64), (2, "a", "z", 64), (3, "g", "z", 64)], + vec![(1, "a", "z", 64), (2, "a", "z", 64), (3, "g", "z", 64), (5, "g", "z", 64), (6, "g", "z", 64)], vec![(4, "a", "g", 64)], vec![], ])?; @@ -760,7 +604,7 @@ mod tests { compactor.choose(&levels, &config), Choice::Merge(CompactionInput { dest_level: 2, - segment_ids: [1, 2, 3, 4].into_iter().collect::>(), + segment_ids: [1, 2, 3, 4, 5, 6].into_iter().collect::>(), target_size: 64 * 1_024 * 1_024 }) ); diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 31da3649..9b42bcf8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -224,7 +224,7 @@ fn merge_segments( log::debug!( "Compacted in {}ms ({} segments created)", start.elapsed().as_millis(), - writer_results.len() + writer_results.len(), ); let created_segments = writer_results diff --git a/src/key_range.rs b/src/key_range.rs index d2b9aeea..0c87881c 100644 --- a/src/key_range.rs +++ b/src/key_range.rs @@ -71,6 +71,12 @@ impl KeyRange { key >= *start && key <= *end } + pub fn contains_range(&self, other: &Self) -> bool { + let (start1, end1) = &self.0; + let (start2, end2) = &other.0; + start1 <= start2 && end1 >= end2 + } + pub fn overlaps_with_key_range(&self, other: &Self) -> bool { let (start1, end1) = &self.0; let (start2, end2) = &other.0; @@ -183,7 +189,7 @@ mod tests { } #[test] - fn key_range_aggregate() { + fn key_range_aggregate_1() { let ranges = [ int_key_range(2, 4), int_key_range(0, 4), @@ -195,6 +201,19 @@ mod tests { assert_eq!([0, 0, 0, 0, 0, 0, 0, 10], &*max); } + #[test] + fn key_range_aggregate_2() { + let ranges = [ + int_key_range(6, 7), + int_key_range(0, 2), + int_key_range(0, 10), + ]; + let aggregated = KeyRange::aggregate(ranges.iter()); + let (min, max) = aggregated.0; + assert_eq!([0, 0, 0, 0, 0, 0, 0, 0], &*min); + assert_eq!([0, 0, 0, 0, 0, 0, 0, 10], &*max); + } + mod is_disjoint { use super::*; use test_log::test; diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 488bb363..c3867608 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -58,7 +58,7 @@ impl Level { self.segments.iter().map(|x| x.metadata.id).collect() } - fn update_metadata(&mut self) { + pub fn update_metadata(&mut self) { self.set_disjoint_flag(); self.sort(); // self.set_key_range(); @@ -125,15 +125,19 @@ impl Level { self.segments.iter().map(|x| x.metadata.file_size).sum() } - /// Checks if the level is disjoint and caches the result in `is_disjoint`. - fn set_disjoint_flag(&mut self) { + pub(crate) fn compute_is_disjoint(&self) -> bool { let ranges = self .segments .iter() .map(|x| &x.metadata.key_range) .collect::>(); - self.is_disjoint = KeyRange::is_disjoint(&ranges); + KeyRange::is_disjoint(&ranges) + } + + /// Checks if the level is disjoint and caches the result in `is_disjoint`. + fn set_disjoint_flag(&mut self) { + self.is_disjoint = self.compute_is_disjoint(); } /// Returns an iterator over segments in the level that have a key range @@ -147,6 +151,17 @@ impl Level { .filter(|x| x.metadata.key_range.overlaps_with_key_range(key_range)) } + /// Returns an iterator over segments in the level that have a key range + /// fully contained in the input key range. + pub fn contained_segments<'a>( + &'a self, + key_range: &'a KeyRange, + ) -> impl Iterator { + self.segments + .iter() + .filter(|x| key_range.contains_range(&x.metadata.key_range)) + } + pub fn as_disjoint(&self) -> Option> { if self.is_disjoint { Some(DisjointLevel(self)) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 248f27b9..45d4f0e3 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -46,11 +46,43 @@ pub struct LevelManifest { impl std::fmt::Display for LevelManifest { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { for (idx, level) in self.levels.iter().enumerate() { - write!(f, "{idx}: ")?; + write!( + f, + "{idx} [{}]: ", + match (level.is_empty(), level.compute_is_disjoint()) { + (true, _) => ".", + (false, true) => "D", + + (false, false) => { + // #[allow(clippy::all)] + // if idx > 0 { + // eprintln!("wtf??!"); + // eprintln!( + // "disjoint: {}", + // KeyRange::is_disjoint( + // &level + // .segments + // .iter() + // .map(|x| &x.metadata.key_range) + // .collect::>() + // ) + // ); + + // for segment in &level.segments { + // eprintln!( + // "{}: {:?}", + // segment.metadata.id, segment.metadata.key_range + // ); + // } + // } + "_" + } + } + )?; if level.segments.is_empty() { write!(f, "")?; - } else if level.segments.len() >= 10 { + } else if level.segments.len() >= 30 { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().take(2) { let id = segment.metadata.id; @@ -283,7 +315,7 @@ impl LevelManifest { Self::write_to_disk(&self.path, &working_copy)?; self.levels = working_copy.into_iter().map(Arc::new).collect(); - self.sort_levels(); + self.update_metadata(); self.set_disjoint_flag(); log::trace!("Swapped level manifest to:\n{self}"); @@ -297,11 +329,11 @@ impl LevelManifest { self.insert_into_level(0, segment); } - pub(crate) fn sort_levels(&mut self) { + pub fn update_metadata(&mut self) { for level in &mut self.levels { Arc::get_mut(level) .expect("could not get mutable Arc - this is a bug") - .sort(); + .update_metadata(); } } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 05a95efa..cb149bf0 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -208,6 +208,8 @@ impl AbstractTree for Tree { } })?; + // eprintln!("{original_levels}"); + for segment in segments { log::trace!("releasing sealed memtable {}", segment.metadata.id); sealed_memtables.remove(segment.metadata.id); @@ -838,7 +840,7 @@ impl Tree { &config.block_cache, &config.descriptor_table, )?; - levels.sort_levels(); + levels.update_metadata(); let highest_segment_id = levels .iter() From d543daddd31c23d9c9868629af7cedea6bf939a4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 18:35:30 +0100 Subject: [PATCH 28/90] filter out 0 compactions and add explanatory comments --- src/compaction/leveled.rs | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 0ed01b59..639a9ec3 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -31,8 +31,27 @@ fn pick_minimal_overlap(curr_level: &Level, next_level: &Level) -> (HashSet = if curr_level.is_disjoint { + // IMPORTANT: Avoid "infectious spread" of key ranges + // Imagine these levels: + // + // A B C D E F + // L1 | ----- ----- ----- ----- ----- ----- + // L2 | ----- ----- ----- ----- ----- + // 1 2 3 4 5 + // + // If we took 1, we would also have to include A, + // but then we would also have to include 2, + // but then we would also have to include B, + // but then we would also have to include 3, + // ... + // + // Instead, we consider a window like 1 - 3 + // and then take A & B, because they are *contained* in that range + // Not including C is fine, because we are not shadowing data unexpectedly curr_level.contained_segments(&key_range).collect() } else { + // If the level is not disjoint, we just merge everything that overlaps + // to try and "repair" the level curr_level.overlapping_segments(&key_range).collect() }; @@ -41,14 +60,17 @@ fn pick_minimal_overlap(curr_level: &Level, next_level: &Level) -> (HashSet(); - let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); + // NOTE: Only consider compactions where we actually do some merging + if curr_level_size > 0 { + let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); - let mut segment_ids: HashSet<_> = window.iter().map(|x| x.metadata.id).collect(); - segment_ids.extend(curr_level_pull_in.iter().map(|x| x.metadata.id)); + let mut segment_ids: HashSet<_> = window.iter().map(|x| x.metadata.id).collect(); + segment_ids.extend(curr_level_pull_in.iter().map(|x| x.metadata.id)); - let write_amp = (next_level_size as f32) / (curr_level_size as f32); + let write_amp = (next_level_size as f32) / (curr_level_size as f32); - choices.push((write_amp, segment_ids, false)); + choices.push((write_amp, segment_ids, false)); + } } } From 952703649e10c3b6a0cc6d4e52afaaa5c1b451c8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 18:39:08 +0100 Subject: [PATCH 29/90] wip --- src/level_manifest/mod.rs | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 45d4f0e3..e7e0d3c3 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -52,31 +52,7 @@ impl std::fmt::Display for LevelManifest { match (level.is_empty(), level.compute_is_disjoint()) { (true, _) => ".", (false, true) => "D", - - (false, false) => { - // #[allow(clippy::all)] - // if idx > 0 { - // eprintln!("wtf??!"); - // eprintln!( - // "disjoint: {}", - // KeyRange::is_disjoint( - // &level - // .segments - // .iter() - // .map(|x| &x.metadata.key_range) - // .collect::>() - // ) - // ); - - // for segment in &level.segments { - // eprintln!( - // "{}: {:?}", - // segment.metadata.id, segment.metadata.key_range - // ); - // } - // } - "_" - } + (false, false) => "_", } )?; @@ -128,7 +104,7 @@ impl std::fmt::Display for LevelManifest { f, " | # = {}, {} MiB", level.len(), - level.size() / 1_024 / 1_024 + level.size() / 1_024 / 1_024, )?; } From 95bdeb0c94dd0752f8fcf827974275f36f5d02e3 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 26 Nov 2024 22:23:57 +0100 Subject: [PATCH 30/90] preliminary parallel compactions --- src/compaction/leveled.rs | 77 +++++++++++++++++++++++--------------- src/compaction/worker.rs | 22 +++++++++-- src/level_manifest/mod.rs | 2 +- src/segment/value_block.rs | 5 +++ 4 files changed, 71 insertions(+), 35 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 639a9ec3..2c8a69db 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -6,7 +6,7 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ config::Config, key_range::KeyRange, - level_manifest::{level::Level, LevelManifest}, + level_manifest::{level::Level, HiddenSet, LevelManifest}, segment::Segment, HashSet, SegmentId, }; @@ -16,8 +16,12 @@ fn aggregate_key_range(segments: &[Segment]) -> KeyRange { } // TODO: Currently does not take in `overshoot` -// TODO: Need to make sure compactions are not too small -fn pick_minimal_overlap(curr_level: &Level, next_level: &Level) -> (HashSet, bool) { +// TODO: Need to make sure compactions are not too small either +fn pick_minimal_compaction( + curr_level: &Level, + next_level: &Level, + hidden_set: &HiddenSet, +) -> Option<(HashSet, bool)> { // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); @@ -27,6 +31,16 @@ fn pick_minimal_overlap(curr_level: &Level, next_level: &Level) -> (HashSet (HashSet (HashSet Choice { - let resolved_view = levels.resolved_view(); - - // If there are any levels that already have a compactor working on it - // we can't touch those, because that could cause a race condition - // violating the leveled compaction invariance of having a single sorted - // run per level - // - // TODO: However, this can probably improved by checking two compaction - // workers just don't cross key ranges - let busy_levels = levels.busy_levels(); - - for (curr_level_index, level) in resolved_view - .iter() - .enumerate() - .skip(1) - .take(resolved_view.len() - 2) - //.rev() - { + let view = &levels.levels; + + // L1+ compactions + for (curr_level_index, level) in view.iter().enumerate().skip(1).take(view.len() - 2) { // NOTE: Level count is 255 max #[allow(clippy::cast_possible_truncation)] let curr_level_index = curr_level_index as u8; @@ -202,20 +210,24 @@ impl CompactionStrategy for Strategy { continue; } - if busy_levels.contains(&curr_level_index) || busy_levels.contains(&next_level_index) { + /* if busy_levels.contains(&curr_level_index) || busy_levels.contains(&next_level_index) { continue; - } + } */ let desired_bytes = self.level_target_size(curr_level_index); let overshoot = level.size().saturating_sub(desired_bytes); if overshoot > 0 { - let Some(next_level) = &resolved_view.get(next_level_index as usize) else { + let Some(next_level) = &view.get(next_level_index as usize) else { break; }; - let (segment_ids, can_trivial_move) = pick_minimal_overlap(level, next_level); + let Some((segment_ids, can_trivial_move)) = + pick_minimal_compaction(level, next_level, &levels.hidden_set) + else { + break; + }; // eprintln!( // "merge {} segments, L{}->L{next_level_index}: {segment_ids:?}", @@ -250,8 +262,11 @@ impl CompactionStrategy for Strategy { } } + // L0->L1 compactions { - let Some(first_level) = resolved_view.first() else { + let busy_levels = levels.busy_levels(); + + let Some(first_level) = view.first() else { return Choice::DoNothing; }; @@ -296,10 +311,10 @@ impl CompactionStrategy for Strategy { } if !busy_levels.contains(&1) { - let mut level = first_level.clone(); + let mut level = (**first_level).clone(); level.sort_by_key_range(); - let Some(next_level) = &resolved_view.get(1) else { + let Some(next_level) = &view.get(1) else { return Choice::DoNothing; }; diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 9b42bcf8..100b89d1 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -125,22 +125,38 @@ fn merge_segments( ) -> crate::Result<()> { if opts.stop_signal.is_stopped() { log::debug!("compactor: stopping before compaction because of stop signal"); + return Ok(()); + } + + // TODO: this sometimes runs, but shouldn't be possible + // TODO: because we have a mutex when hiding & showing segments and checking compaction strategy... + if payload + .segment_ids + .iter() + .any(|id| levels.hidden_set.contains(id)) + { + log::warn!("Compaction task contained hidden segments, declining to run it"); + return Ok(()); } let segments_base_folder = opts.config.path.join(SEGMENTS_FOLDER); let merge_iter = { - let to_merge: Vec<_> = { + let to_merge: Option> = { let segments = levels.get_all_segments(); payload .segment_ids .iter() - .filter_map(|x| segments.get(x)) - .cloned() + .map(|x| segments.get(x).cloned()) .collect() }; + let Some(to_merge) = to_merge else { + log::warn!("Compaction task contained segments that do not exist, declining to run it"); + return Ok(()); + }; + let mut segment_readers: Vec> = Vec::with_capacity(to_merge.len()); for segment in to_merge { diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index e7e0d3c3..bf4e25cd 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -38,7 +38,7 @@ pub struct LevelManifest { /// /// While consuming segments (because of compaction) they will not appear in the list of segments /// as to not cause conflicts between multiple compaction threads (compacting the same segments) - hidden_set: HiddenSet, + pub hidden_set: HiddenSet, is_disjoint: bool, } diff --git a/src/segment/value_block.rs b/src/segment/value_block.rs index 48832716..40c923dc 100644 --- a/src/segment/value_block.rs +++ b/src/segment/value_block.rs @@ -80,7 +80,12 @@ impl ValueBlock { let file_guard = descriptor_table .access(&segment_id)? + .ok_or(()) + .map_err(|()| { + log::error!("Failed to get file guard for segment {segment_id:?}"); + }) .expect("should acquire file handle"); + // TODO: ^ use inspect instead: 1.76 let block = Self::from_file( &mut *file_guard.file.lock().expect("lock is poisoned"), From 4dff1d2ce15435767c179beb5c7ba3e2f4afade9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Nov 2024 00:40:16 +0100 Subject: [PATCH 31/90] refactor --- src/compaction/worker.rs | 5 +---- src/segment/file_offsets.rs | 2 +- src/tree/mod.rs | 5 +---- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 100b89d1..2425139a 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -249,9 +249,6 @@ fn merge_segments( let segment_id = trailer.metadata.id; let segment_file_path = segments_base_folder.join(segment_id.to_string()); - #[cfg(feature = "bloom")] - let bloom_ptr = trailer.offsets.bloom_ptr; - let block_index = match payload.dest_level { 0 | 1 => { let block_index = FullBlockIndex::from_file( @@ -291,7 +288,7 @@ fn merge_segments( block_index, #[cfg(feature = "bloom")] - bloom_filter: Segment::load_bloom(&segment_file_path, bloom_ptr)?, + bloom_filter: Segment::load_bloom(&segment_file_path, trailer.offsets.bloom_ptr)?, } .into()) }) diff --git a/src/segment/file_offsets.rs b/src/segment/file_offsets.rs index f25a1910..e96fcc34 100644 --- a/src/segment/file_offsets.rs +++ b/src/segment/file_offsets.rs @@ -7,7 +7,7 @@ use crate::coding::{Decode, DecodeError, Encode, EncodeError}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use std::io::{Read, Write}; -#[derive(Debug, Default, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)] pub struct FileOffsets { pub metadata_ptr: BlockOffset, pub index_block_ptr: BlockOffset, diff --git a/src/tree/mod.rs b/src/tree/mod.rs index cb149bf0..57a2e669 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -512,9 +512,6 @@ impl Tree { FullBlockIndex::from_file(&segment_file_path, &trailer.metadata, &trailer.offsets)?; let block_index = Arc::new(BlockIndexImpl::Full(block_index)); - #[cfg(feature = "bloom")] - let bloom_ptr = trailer.offsets.bloom_ptr; - let created_segment: Segment = SegmentInner { tree_id: self.id, @@ -526,7 +523,7 @@ impl Tree { block_cache: self.config.block_cache.clone(), #[cfg(feature = "bloom")] - bloom_filter: Segment::load_bloom(&segment_file_path, bloom_ptr)?, + bloom_filter: Segment::load_bloom(&segment_file_path, trailer.offsets.bloom_ptr)?, } .into(); From 85a815120039f0f1e5bccde6821e57cb5d44010c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Nov 2024 00:40:25 +0100 Subject: [PATCH 32/90] add comments --- src/compaction/worker.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 2425139a..0b961524 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -176,6 +176,8 @@ fn merge_segments( levels.hide_segments(payload.segment_ids.iter().copied()); + // IMPORTANT: Free lock so the compaction (which may go on for a while) + // does not block possible other compactions and reads drop(levels); // NOTE: Only evict tombstones when reaching the last level, @@ -203,10 +205,11 @@ fn merge_segments( if opts.config.bloom_bits_per_key >= 0 { // NOTE: Apply some MONKEY to have very high FPR on small levels // because it's cheap + // + // See https://nivdayan.github.io/monkeykeyvaluestore.pdf let bloom_policy = match payload.dest_level { - // TODO: increase to 0.00001 when https://github.com/fjall-rs/lsm-tree/issues/63 is fixed - 0 => BloomConstructionPolicy::FpRate(0.0001), - 1 => BloomConstructionPolicy::FpRate(0.001), + 0 => BloomConstructionPolicy::FpRate(0.00001), + 1 => BloomConstructionPolicy::FpRate(0.0005), _ => BloomConstructionPolicy::BitsPerKey( opts.config.bloom_bits_per_key.unsigned_abs(), ), From 9081447ae51192274b546440362647bd42505929 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Nov 2024 00:40:41 +0100 Subject: [PATCH 33/90] refactor: rename --- src/compaction/worker.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 0b961524..97888bb9 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -299,14 +299,14 @@ fn merge_segments( // NOTE: Mind lock order L -> M -> S log::trace!("compactor: acquiring levels manifest write lock"); - let mut original_levels = opts.levels.write().expect("lock is poisoned"); + let mut levels = opts.levels.write().expect("lock is poisoned"); // IMPORTANT: Write lock memtable(s), otherwise segments may get deleted while a range read is happening // NOTE: Mind lock order L -> M -> S log::trace!("compactor: acquiring sealed memtables write lock"); let sealed_memtables_guard = opts.sealed_memtables.write().expect("lock is poisoned"); - let swap_result = original_levels.atomic_swap(|recipe| { + let swap_result = levels.atomic_swap(|recipe| { for segment in created_segments.iter().cloned() { log::trace!("Persisting segment {}", segment.metadata.id); @@ -327,7 +327,7 @@ fn merge_segments( if let Err(e) = swap_result { // IMPORTANT: Show the segments again, because compaction failed - original_levels.show_segments(payload.segment_ids.iter().copied()); + levels.show_segments(payload.segment_ids.iter().copied()); return Err(e); }; @@ -363,9 +363,9 @@ fn merge_segments( .remove((opts.tree_id, *segment_id).into()); } - original_levels.show_segments(payload.segment_ids.iter().copied()); + levels.show_segments(payload.segment_ids.iter().copied()); - drop(original_levels); + drop(levels); log::debug!("compactor: done"); From f49d4766e738f07210196e0a9f58b4b00e1082ff Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Nov 2024 21:09:53 +0100 Subject: [PATCH 34/90] clippy --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index de3f075a..73ef002e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,6 +99,7 @@ #![allow(clippy::missing_const_for_fn)] #![warn(clippy::multiple_crate_versions)] #![allow(clippy::option_if_let_else)] +#![warn(clippy::needless_lifetimes)] pub(crate) type HashMap = std::collections::HashMap; pub(crate) type HashSet = std::collections::HashSet; From e661a0254e9ff16f848b0400cfbde9e882ccba4d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 28 Nov 2024 21:09:53 +0100 Subject: [PATCH 35/90] clippy --- src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib.rs b/src/lib.rs index de3f075a..73ef002e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,6 +99,7 @@ #![allow(clippy::missing_const_for_fn)] #![warn(clippy::multiple_crate_versions)] #![allow(clippy::option_if_let_else)] +#![warn(clippy::needless_lifetimes)] pub(crate) type HashMap = std::collections::HashMap; pub(crate) type HashSet = std::collections::HashSet; From bcdbff15d809608ffac6c75d3259384dca88af0c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 29 Nov 2024 17:32:13 +0100 Subject: [PATCH 36/90] change L0 bloom filter FPR on flush as well --- src/tree/mod.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 57a2e669..f2dca810 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -169,11 +169,8 @@ impl AbstractTree for Tree { use crate::segment::writer::BloomConstructionPolicy; if self.config.bloom_bits_per_key >= 0 { - segment_writer = segment_writer.use_bloom_policy( - // TODO: increase to 0.00001 when https://github.com/fjall-rs/lsm-tree/issues/63 - // is fixed - BloomConstructionPolicy::FpRate(0.0001), - ); + segment_writer = + segment_writer.use_bloom_policy(BloomConstructionPolicy::FpRate(0.00001)); } else { segment_writer = segment_writer.use_bloom_policy(BloomConstructionPolicy::BitsPerKey(0)); From 27b773b41b15708b5ea8fc787113d19629a2e4c5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 30 Nov 2024 02:50:12 +0100 Subject: [PATCH 37/90] take compacted bytes into account --- src/compaction/leveled.rs | 16 +++++++++++----- src/compaction/tiered.rs | 11 +++++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 2c8a69db..76449424 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -199,7 +199,8 @@ impl CompactionStrategy for Strategy { let view = &levels.levels; // L1+ compactions - for (curr_level_index, level) in view.iter().enumerate().skip(1).take(view.len() - 2) { + for (curr_level_index, level) in view.iter().enumerate().skip(1).take(view.len() - 2).rev() + { // NOTE: Level count is 255 max #[allow(clippy::cast_possible_truncation)] let curr_level_index = curr_level_index as u8; @@ -210,13 +211,18 @@ impl CompactionStrategy for Strategy { continue; } - /* if busy_levels.contains(&curr_level_index) || busy_levels.contains(&next_level_index) { - continue; - } */ + let level_size: u64 = level + .segments + .iter() + // NOTE: Take bytes that are already being compacted into account, + // otherwise we may be overcompensating + .filter(|x| !levels.hidden_set.contains(&x.metadata.id)) + .map(|x| x.metadata.file_size) + .sum(); let desired_bytes = self.level_target_size(curr_level_index); - let overshoot = level.size().saturating_sub(desired_bytes); + let overshoot = level_size.saturating_sub(desired_bytes); if overshoot > 0 { let Some(next_level) = &view.get(next_level_index as usize) else { diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 056186cb..1c093e36 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -69,13 +69,20 @@ impl CompactionStrategy for Strategy { continue; } - let curr_level_bytes = level.size(); + let level_size: u64 = level + .segments + .iter() + // NOTE: Take bytes that are already being compacted into account, + // otherwise we may be overcompensating + .filter(|x| !levels.hidden_set.contains(&x.metadata.id)) + .map(|x| x.metadata.file_size) + .sum(); let desired_bytes = desired_level_size_in_bytes(curr_level_index, self.level_ratio, self.base_size) as u64; - if curr_level_bytes >= desired_bytes { + if level_size >= desired_bytes { // NOTE: Take desired_bytes because we are in tiered mode // We want to take N segments, not just the overshoot (like in leveled) let mut overshoot = desired_bytes; From 6dd9abfd08711be362f492695b2ad1135dc7c4b4 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Sat, 30 Nov 2024 03:15:37 +0100 Subject: [PATCH 38/90] Update memtable.rs --- benches/memtable.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/benches/memtable.rs b/benches/memtable.rs index eb8ec32d..e25c986d 100644 --- a/benches/memtable.rs +++ b/benches/memtable.rs @@ -21,6 +21,7 @@ fn memtable_get_hit(c: &mut Criterion) { )); } + c.bench_function("memtable get", |b| { b.iter(|| { assert_eq!( [1, 2, 3], From 168d4e040bcc440155702e10bbb5f455345ec68f Mon Sep 17 00:00:00 2001 From: Carl Sverre <82591+carlsverre@users.noreply.github.com> Date: Tue, 3 Dec 2024 16:32:14 -0800 Subject: [PATCH 39/90] better zero copy support from types that implement Into --- src/abstract.rs | 13 +++++++++---- src/blob_tree/mod.rs | 19 ++++++++++++------- src/tree/mod.rs | 22 +++++++++++++--------- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index b2c6bad6..e4d1a8b8 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -581,10 +581,15 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn insert, V: AsRef<[u8]>>(&self, key: K, value: V, seqno: SeqNo) -> (u32, u32); + fn insert, V: Into>( + &self, + key: K, + value: V, + seqno: SeqNo, + ) -> (u32, u32); /// Inserts a key-value pair. - fn raw_insert_with_lock, V: AsRef<[u8]>>( + fn raw_insert_with_lock, V: Into>( &self, lock: &RwLockWriteGuard<'_, Memtable>, key: K, @@ -620,7 +625,7 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32); + fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32); /// Removes an item from the tree. /// @@ -654,5 +659,5 @@ pub trait AbstractTree { /// # Errors /// /// Will return `Err` if an IO error occurs. - fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32); + fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32); } diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index c2e183db..ae8a8e84 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -600,7 +600,7 @@ impl AbstractTree for BlobTree { ) } - fn raw_insert_with_lock, V: AsRef<[u8]>>( + fn raw_insert_with_lock, V: Into>( &self, lock: &RwLockWriteGuard<'_, Memtable>, key: K, @@ -613,21 +613,26 @@ impl AbstractTree for BlobTree { // NOTE: Initially, we always write an inline value // On memtable flush, depending on the values' sizes, they will be separated // into inline or indirect values - let item = MaybeInlineValue::Inline(value.as_ref().into()); + let item = MaybeInlineValue::Inline(value.into()); let value = item.encode_into_vec(); - let value = InternalValue::from_components(key.as_ref(), value, seqno, r#type); + let value = InternalValue::from_components(key, value, seqno, r#type); lock.insert(value) } - fn insert, V: AsRef<[u8]>>(&self, key: K, value: V, seqno: SeqNo) -> (u32, u32) { + fn insert, V: Into>( + &self, + key: K, + value: V, + seqno: SeqNo, + ) -> (u32, u32) { use value::MaybeInlineValue; // NOTE: Initially, we always write an inline value // On memtable flush, depending on the values' sizes, they will be separated // into inline or indirect values - let item = MaybeInlineValue::Inline(value.as_ref().into()); + let item = MaybeInlineValue::Inline(value.into()); let value = item.encode_into_vec(); @@ -684,11 +689,11 @@ impl AbstractTree for BlobTree { } } - fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32) { self.index.remove(key, seqno) } - fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32) { self.index.remove_weak(key, seqno) } } diff --git a/src/tree/mod.rs b/src/tree/mod.rs index dc1e245e..c0a0a3f8 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -407,13 +407,17 @@ impl AbstractTree for Tree { Box::new(self.create_prefix(prefix, None, None)) } - fn insert, V: AsRef<[u8]>>(&self, key: K, value: V, seqno: SeqNo) -> (u32, u32) { - let value = - InternalValue::from_components(key.as_ref(), value.as_ref(), seqno, ValueType::Value); + fn insert, V: Into>( + &self, + key: K, + value: V, + seqno: SeqNo, + ) -> (u32, u32) { + let value = InternalValue::from_components(key, value, seqno, ValueType::Value); self.append_entry(value) } - fn raw_insert_with_lock, V: AsRef<[u8]>>( + fn raw_insert_with_lock, V: Into>( &self, lock: &RwLockWriteGuard<'_, Memtable>, key: K, @@ -421,17 +425,17 @@ impl AbstractTree for Tree { seqno: SeqNo, r#type: ValueType, ) -> (u32, u32) { - let value = InternalValue::from_components(key.as_ref(), value.as_ref(), seqno, r#type); + let value = InternalValue::from_components(key, value, seqno, r#type); lock.insert(value) } - fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32) { - let value = InternalValue::new_tombstone(key.as_ref(), seqno); + fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + let value = InternalValue::new_tombstone(key, seqno); self.append_entry(value) } - fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32) { - let value = InternalValue::new_weak_tombstone(key.as_ref(), seqno); + fn remove_weak>(&self, key: K, seqno: SeqNo) -> (u32, u32) { + let value = InternalValue::new_weak_tombstone(key, seqno); self.append_entry(value) } } From 2d675ecd318c862e486d7da51cf3db538182e7ce Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Dec 2024 19:36:53 +0100 Subject: [PATCH 40/90] wip --- src/compaction/leveled.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 76449424..bb68d137 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -21,6 +21,7 @@ fn pick_minimal_compaction( curr_level: &Level, next_level: &Level, hidden_set: &HiddenSet, + overshoot: u64, ) -> Option<(HashSet, bool)> { // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); @@ -84,8 +85,9 @@ fn pick_minimal_compaction( .map(|x| x.metadata.file_size) .sum::(); - // NOTE: Only consider compactions where we actually do some merging - if curr_level_size > 0 { + // NOTE: Only consider compactions where we actually reach the amount + // of bytes we need to merge + if curr_level_size > overshoot { let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); let mut segment_ids: HashSet<_> = window.iter().map(|x| x.metadata.id).collect(); @@ -230,7 +232,7 @@ impl CompactionStrategy for Strategy { }; let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, &levels.hidden_set) + pick_minimal_compaction(level, next_level, &levels.hidden_set, overshoot) else { break; }; From 315d5584b81261d2213ed35eb4843f6964e0f740 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Dec 2024 19:54:58 +0100 Subject: [PATCH 41/90] remove some ? in compaction worker --- src/compaction/worker.rs | 46 +++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 2d1d6932..a33972f8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -279,7 +279,17 @@ fn merge_segments( continue; } - segment_writer.write(item)?; + if segment_writer.write(item).is_err() { + log::error!("Compaction failed"); + + // IMPORTANT: Show the segments again, because compaction failed + opts.levels + .write() + .expect("lock is poisoned") + .show_segments(payload.segment_ids.iter().copied()); + + return Ok(()); + }; if idx % 100_000 == 0 && opts.stop_signal.is_stopped() { log::debug!("compactor: stopping amidst compaction because of stop signal"); @@ -287,7 +297,17 @@ fn merge_segments( } } - let writer_results = segment_writer.finish()?; + let Ok(writer_results) = segment_writer.finish() else { + log::error!("Compaction failed"); + + // IMPORTANT: Show the segments again, because compaction failed + opts.levels + .write() + .expect("lock is poisoned") + .show_segments(payload.segment_ids.iter().copied()); + + return Ok(()); + }; log::debug!( "Compacted in {}ms ({} segments created)", @@ -295,7 +315,7 @@ fn merge_segments( writer_results.len(), ); - let created_segments = writer_results + let Ok(created_segments) = writer_results .into_iter() .map(|trailer| -> crate::Result { let segment_id = trailer.metadata.id; @@ -340,11 +360,27 @@ fn merge_segments( block_index, #[cfg(feature = "bloom")] - bloom_filter: Segment::load_bloom(&segment_file_path, trailer.offsets.bloom_ptr)?, + bloom_filter: { + match Segment::load_bloom(&segment_file_path, trailer.offsets.bloom_ptr) { + Ok(filter) => filter, + Err(e) => return Err(e), + } + }, } .into()) }) - .collect::>>()?; + .collect::>>() + else { + log::error!("Compaction failed"); + + // IMPORTANT: Show the segments again, because compaction failed + opts.levels + .write() + .expect("lock is poisoned") + .show_segments(payload.segment_ids.iter().copied()); + + return Ok(()); + }; // NOTE: Mind lock order L -> M -> S log::trace!("compactor: acquiring levels manifest write lock"); From c7ebaec133674d8cde2d1dc7729153c8b4ac3066 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Dec 2024 19:55:59 +0100 Subject: [PATCH 42/90] replace another ? in compaction worker --- src/compaction/worker.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index a33972f8..cb353baf 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -272,7 +272,17 @@ fn merge_segments( } for (idx, item) in merge_iter.enumerate() { - let item = item?; + let Ok(item) = item else { + log::error!("Compaction failed"); + + // IMPORTANT: Show the segments again, because compaction failed + opts.levels + .write() + .expect("lock is poisoned") + .show_segments(payload.segment_ids.iter().copied()); + + return Ok(()); + }; // IMPORTANT: We can only drop tombstones when writing into last level if is_last_level && item.is_tombstone() { From 37bf57ddc52764467a530b3110314a0fb6d84259 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 4 Dec 2024 21:42:35 +0100 Subject: [PATCH 43/90] wip --- src/compaction/leveled.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index bb68d137..e2a491c9 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -87,7 +87,7 @@ fn pick_minimal_compaction( // NOTE: Only consider compactions where we actually reach the amount // of bytes we need to merge - if curr_level_size > overshoot { + if curr_level_size >= overshoot { let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); let mut segment_ids: HashSet<_> = window.iter().map(|x| x.metadata.id).collect(); From d3e2f775c5e55903a396ce3a5b29d9a4abe1329c Mon Sep 17 00:00:00 2001 From: Carl Sverre <82591+carlsverre@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:59:47 -0800 Subject: [PATCH 44/90] Improve HiddenSet ergonomics and fix a bug in the leveled compaction strategy where it didn't consider hidden segments when computing the minimal compaction job. --- src/compaction/leveled.rs | 43 +++++++++++++-------- src/compaction/tiered.rs | 2 +- src/compaction/worker.rs | 64 ++++++++++++++++---------------- src/level_manifest/hidden_set.rs | 36 ++++++++++++++++++ src/level_manifest/mod.rs | 60 +++++++++++++----------------- src/memtable/mod.rs | 2 +- 6 files changed, 125 insertions(+), 82 deletions(-) create mode 100644 src/level_manifest/hidden_set.rs diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 76449424..10c75467 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -6,7 +6,7 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ config::Config, key_range::KeyRange, - level_manifest::{level::Level, HiddenSet, LevelManifest}, + level_manifest::{level::Level, LevelManifest}, segment::Segment, HashSet, SegmentId, }; @@ -20,13 +20,33 @@ fn aggregate_key_range(segments: &[Segment]) -> KeyRange { fn pick_minimal_compaction( curr_level: &Level, next_level: &Level, - hidden_set: &HiddenSet, + levels: &LevelManifest, ) -> Option<(HashSet, bool)> { // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); let mut choices = vec![]; + let mut add_choice = + |write_amp: f32, segment_ids: HashSet, can_trivial_move: bool| { + let mut valid_choice = true; + + // IMPORTANT: Compaction is blocked because of other + // on-going compaction + valid_choice &= !segment_ids.iter().any(|x| levels.segment_hidden(*x)); + + // NOTE: Keep compactions with 25 or less segments + // to make compactions not too large + // + // TODO: ideally, if a level has a lot of compaction debt + // compactions could be parallelized as long as they don't overlap in key range + valid_choice &= segment_ids.len() <= 25; + + if valid_choice { + choices.push((write_amp, segment_ids, can_trivial_move)); + } + }; + for size in 1..=next_level.len() { let windows = next_level.windows(size); @@ -34,7 +54,7 @@ fn pick_minimal_compaction( if window .iter() .map(|x| x.metadata.id) - .any(|x| hidden_set.contains(&x)) + .any(|x| levels.segment_hidden(x)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction @@ -72,7 +92,7 @@ fn pick_minimal_compaction( if curr_level_pull_in .iter() .map(|x| x.metadata.id) - .any(|x| hidden_set.contains(&x)) + .any(|x| levels.segment_hidden(x)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction @@ -93,7 +113,7 @@ fn pick_minimal_compaction( let write_amp = (next_level_size as f32) / (curr_level_size as f32); - choices.push((write_amp, segment_ids, false)); + add_choice(write_amp, segment_ids, false); } } } @@ -108,18 +128,11 @@ fn pick_minimal_compaction( let key_range = aggregate_key_range(window); if next_level.overlapping_segments(&key_range).next().is_none() { - choices.push((0.0, segment_ids, true)); + add_choice(0.0, segment_ids, true); } } } - // NOTE: Keep compactions with 25 or less segments - // to make compactions not too large - // - // TODO: ideally, if a level has a lot of compaction debt - // compactions could be parallelized as long as they don't overlap in key range - choices.retain(|(_, segments, _)| segments.len() <= 25); - let minimum_effort_choice = choices .into_iter() .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); @@ -216,7 +229,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set.contains(&x.metadata.id)) + .filter(|x| !levels.segment_hidden(x.metadata.id)) .map(|x| x.metadata.file_size) .sum(); @@ -230,7 +243,7 @@ impl CompactionStrategy for Strategy { }; let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, &levels.hidden_set) + pick_minimal_compaction(level, next_level, levels) else { break; }; diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 1c093e36..92c9990c 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -74,7 +74,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set.contains(&x.metadata.id)) + .filter(|x| !levels.segment_hidden(x.metadata.id)) .map(|x| x.metadata.file_size) .sum(); diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 2d1d6932..a8bb580a 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -73,7 +73,7 @@ impl Options { /// This will block until the compactor is fully finished. pub fn do_compaction(opts: &Options) -> crate::Result<()> { log::trace!("compactor: acquiring levels manifest lock"); - let mut original_levels = opts.levels.write().expect("lock is poisoned"); + let original_levels = opts.levels.write().expect("lock is poisoned"); log::trace!("compactor: consulting compaction strategy"); let choice = opts.strategy.choose(&original_levels, &opts.config); @@ -82,35 +82,15 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { match choice { Choice::Merge(payload) => merge_segments(original_levels, opts, &payload), - Choice::Move(payload) => { - let segment_map = original_levels.get_all_segments(); - - original_levels.atomic_swap(|recipe| { - for segment_id in payload.segment_ids { - if let Some(segment) = segment_map.get(&segment_id).cloned() { - for level in recipe.iter_mut() { - level.remove(segment_id); - } - - recipe - .get_mut(payload.dest_level as usize) - .expect("destination level should exist") - .insert(segment); - } - } - }) - } - Choice::Drop(payload) => { - drop_segments( - original_levels, - opts, - &payload - .into_iter() - .map(|x| (opts.tree_id, x).into()) - .collect::>(), - )?; - Ok(()) - } + Choice::Move(payload) => move_segments(original_levels, payload), + Choice::Drop(payload) => drop_segments( + original_levels, + opts, + &payload + .into_iter() + .map(|x| (opts.tree_id, x).into()) + .collect::>(), + ), Choice::DoNothing => { log::trace!("Compactor chose to do nothing"); Ok(()) @@ -186,6 +166,28 @@ fn create_compaction_stream<'a>( } } +fn move_segments( + mut levels: RwLockWriteGuard<'_, LevelManifest>, + payload: CompactionPayload, +) -> crate::Result<()> { + let segment_map = levels.get_all_segments(); + + levels.atomic_swap(|recipe| { + for segment_id in payload.segment_ids { + if let Some(segment) = segment_map.get(&segment_id).cloned() { + for level in recipe.iter_mut() { + level.remove(segment_id); + } + + recipe + .get_mut(payload.dest_level as usize) + .expect("destination level should exist") + .insert(segment); + } + } + }) +} + #[allow(clippy::too_many_lines)] fn merge_segments( mut levels: RwLockWriteGuard<'_, LevelManifest>, @@ -202,7 +204,7 @@ fn merge_segments( if payload .segment_ids .iter() - .any(|id| levels.hidden_set.contains(id)) + .any(|id| levels.segment_hidden(*id)) { log::warn!("Compaction task contained hidden segments, declining to run it"); return Ok(()); diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs new file mode 100644 index 00000000..41045c7a --- /dev/null +++ b/src/level_manifest/hidden_set.rs @@ -0,0 +1,36 @@ +use crate::segment::meta::SegmentId; + +use crate::HashSet; + +#[derive(Clone)] +pub(super) struct HiddenSet { + pub(crate) set: HashSet, +} + +impl Default for HiddenSet { + fn default() -> Self { + Self { + set: HashSet::with_capacity_and_hasher(10, xxhash_rust::xxh3::Xxh3Builder::new()), + } + } +} + +impl HiddenSet { + pub(crate) fn hide>(&mut self, keys: T) { + self.set.extend(keys); + } + + pub(crate) fn show>(&mut self, keys: T) { + for key in keys { + self.set.remove(&key); + } + } + + pub(crate) fn contains(&self, key: SegmentId) -> bool { + self.set.contains(&key) + } + + pub(crate) fn is_empty(&self) -> bool { + self.set.is_empty() + } +} diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index bf4e25cd..e75b6860 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -2,6 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) +mod hidden_set; pub mod iter; pub(crate) mod level; @@ -21,8 +22,6 @@ use std::{ sync::Arc, }; -pub type HiddenSet = HashSet; - type Levels = Vec>; /// Represents the levels of a log-structured merge tree. @@ -38,7 +37,7 @@ pub struct LevelManifest { /// /// While consuming segments (because of compaction) they will not appear in the list of segments /// as to not cause conflicts between multiple compaction threads (compacting the same segments) - pub hidden_set: HiddenSet, + hidden_set: hidden_set::HiddenSet, is_disjoint: bool, } @@ -62,7 +61,7 @@ impl std::fmt::Display for LevelManifest { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().take(2) { let id = segment.metadata.id; - let is_hidden = self.hidden_set.contains(&id); + let is_hidden = self.segment_hidden(id); write!( f, @@ -76,7 +75,7 @@ impl std::fmt::Display for LevelManifest { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().rev().take(2).rev() { let id = segment.metadata.id; - let is_hidden = self.hidden_set.contains(&id); + let is_hidden = self.segment_hidden(id); write!( f, @@ -88,7 +87,7 @@ impl std::fmt::Display for LevelManifest { } else { for segment in &level.segments { let id = segment.metadata.id; - let is_hidden = self.hidden_set.contains(&id); + let is_hidden = self.segment_hidden(id); write!( f, @@ -126,10 +125,7 @@ impl LevelManifest { let mut manifest = Self { path: path.as_ref().to_path_buf(), levels, - hidden_set: HashSet::with_capacity_and_hasher( - 10, - xxhash_rust::xxh3::Xxh3Builder::new(), - ), + hidden_set: Default::default(), is_disjoint: true, }; Self::write_to_disk(path, &manifest.deep_clone())?; @@ -235,10 +231,7 @@ impl LevelManifest { let mut manifest = Self { levels, - hidden_set: HashSet::with_capacity_and_hasher( - 10, - xxhash_rust::xxh3::Xxh3Builder::new(), - ), + hidden_set: Default::default(), path: path.as_ref().to_path_buf(), is_disjoint: false, }; @@ -379,14 +372,10 @@ impl LevelManifest { HashSet::with_capacity_and_hasher(self.len(), xxhash_rust::xxh3::Xxh3Builder::new()); for (idx, level) in self.levels.iter().enumerate() { - for segment_id in level.ids() { - if self.hidden_set.contains(&segment_id) { - // NOTE: Level count is u8 - #[allow(clippy::cast_possible_truncation)] - let idx = idx as u8; - - output.insert(idx); - } + if level.ids().any(|id| self.segment_hidden(id)) { + // NOTE: Level count is u8 + #[allow(clippy::cast_possible_truncation)] + output.insert(idx as u8); } } @@ -400,7 +389,7 @@ impl LevelManifest { for raw_level in &self.levels { let mut level = raw_level.iter().cloned().collect::>(); - level.retain(|x| !self.hidden_set.contains(&x.metadata.id)); + level.retain(|x| !self.segment_hidden(x.metadata.id)); output.push(Level { segments: level, @@ -425,16 +414,16 @@ impl LevelManifest { output } - pub(crate) fn show_segments(&mut self, keys: impl Iterator) { - for key in keys { - self.hidden_set.remove(&key); - } + pub(crate) fn segment_hidden(&self, key: SegmentId) -> bool { + self.hidden_set.contains(key) } - pub(crate) fn hide_segments(&mut self, keys: impl Iterator) { - for key in keys { - self.hidden_set.insert(key); - } + pub(crate) fn hide_segments>(&mut self, keys: T) { + self.hidden_set.hide(keys); + } + + pub(crate) fn show_segments>(&mut self, keys: T) { + self.hidden_set.show(keys); } } @@ -464,8 +453,11 @@ impl Encode for Vec { #[cfg(test)] #[allow(clippy::expect_used)] mod tests { - use crate::{coding::Encode, level_manifest::LevelManifest, AbstractTree}; - use std::collections::HashSet; + use crate::{ + coding::Encode, + level_manifest::{hidden_set::HiddenSet, LevelManifest}, + AbstractTree, + }; use test_log::test; #[test] @@ -513,7 +505,7 @@ mod tests { #[test] fn level_manifest_raw_empty() -> crate::Result<()> { let manifest = LevelManifest { - hidden_set: HashSet::default(), + hidden_set: HiddenSet::default(), levels: Vec::default(), path: "a".into(), is_disjoint: false, diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index 07f9b204..a5b390f1 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -50,7 +50,7 @@ impl Memtable { pub(crate) fn range<'a, R: RangeBounds + 'a>( &'a self, range: R, - ) -> impl DoubleEndedIterator + '_ { + ) -> impl DoubleEndedIterator + 'a { self.items.range(range).map(|entry| InternalValue { key: entry.key().clone(), value: entry.value().clone(), From d98c0ebc173cd45ef063adc18fd5429bcdf5f73e Mon Sep 17 00:00:00 2001 From: Carl Sverre <82591+carlsverre@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:18:59 -0800 Subject: [PATCH 45/90] pass hidden set directly to pick_minimal_computation, but still keep it somewhat restricted --- src/compaction/leveled.rs | 12 ++++++------ src/level_manifest/hidden_set.rs | 6 +++--- src/level_manifest/mod.rs | 7 ++++++- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 10c75467..18bf88b2 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -6,7 +6,7 @@ use super::{Choice, CompactionStrategy, Input as CompactionInput}; use crate::{ config::Config, key_range::KeyRange, - level_manifest::{level::Level, LevelManifest}, + level_manifest::{hidden_set::HiddenSet, level::Level, LevelManifest}, segment::Segment, HashSet, SegmentId, }; @@ -20,7 +20,7 @@ fn aggregate_key_range(segments: &[Segment]) -> KeyRange { fn pick_minimal_compaction( curr_level: &Level, next_level: &Level, - levels: &LevelManifest, + hidden_set: &HiddenSet, ) -> Option<(HashSet, bool)> { // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); @@ -33,7 +33,7 @@ fn pick_minimal_compaction( // IMPORTANT: Compaction is blocked because of other // on-going compaction - valid_choice &= !segment_ids.iter().any(|x| levels.segment_hidden(*x)); + valid_choice &= !segment_ids.iter().any(|x| hidden_set.contains(*x)); // NOTE: Keep compactions with 25 or less segments // to make compactions not too large @@ -54,7 +54,7 @@ fn pick_minimal_compaction( if window .iter() .map(|x| x.metadata.id) - .any(|x| levels.segment_hidden(x)) + .any(|x| hidden_set.contains(x)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction @@ -92,7 +92,7 @@ fn pick_minimal_compaction( if curr_level_pull_in .iter() .map(|x| x.metadata.id) - .any(|x| levels.segment_hidden(x)) + .any(|x| hidden_set.contains(x)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction @@ -243,7 +243,7 @@ impl CompactionStrategy for Strategy { }; let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, levels) + pick_minimal_compaction(level, next_level, levels.hidden_segments()) else { break; }; diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs index 41045c7a..f0b05c42 100644 --- a/src/level_manifest/hidden_set.rs +++ b/src/level_manifest/hidden_set.rs @@ -3,7 +3,7 @@ use crate::segment::meta::SegmentId; use crate::HashSet; #[derive(Clone)] -pub(super) struct HiddenSet { +pub(crate) struct HiddenSet { pub(crate) set: HashSet, } @@ -16,11 +16,11 @@ impl Default for HiddenSet { } impl HiddenSet { - pub(crate) fn hide>(&mut self, keys: T) { + pub(super) fn hide>(&mut self, keys: T) { self.set.extend(keys); } - pub(crate) fn show>(&mut self, keys: T) { + pub(super) fn show>(&mut self, keys: T) { for key in keys { self.set.remove(&key); } diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index e75b6860..1a58908e 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -mod hidden_set; +pub(crate) mod hidden_set; pub mod iter; pub(crate) mod level; @@ -14,6 +14,7 @@ use crate::{ HashMap, HashSet, }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use hidden_set::HiddenSet; use iter::LevelManifestIterator; use level::Level; use std::{ @@ -418,6 +419,10 @@ impl LevelManifest { self.hidden_set.contains(key) } + pub(crate) fn hidden_segments(&self) -> &HiddenSet { + &self.hidden_set + } + pub(crate) fn hide_segments>(&mut self, keys: T) { self.hidden_set.hide(keys); } From 186fd9feb176da3447d45b206cb9179cc7a3b6e7 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Wed, 4 Dec 2024 23:22:14 +0100 Subject: [PATCH 46/90] Update hidden_set.rs --- src/level_manifest/hidden_set.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs index f0b05c42..2aa34e6a 100644 --- a/src/level_manifest/hidden_set.rs +++ b/src/level_manifest/hidden_set.rs @@ -1,7 +1,12 @@ use crate::segment::meta::SegmentId; - use crate::HashSet; +/// The hidden set keeps track of which segments are currently being compacted +/// +/// When a segment is hidden (being compacted), no other compaction task can include that +/// segment, or it will be declined to be run. +/// +/// If a compaction task fails, the segments are shown again (removed from the hidden set). #[derive(Clone)] pub(crate) struct HiddenSet { pub(crate) set: HashSet, From ec3e1b9689baa31020bdbf5d1c6905565955d08b Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Wed, 4 Dec 2024 23:28:00 +0100 Subject: [PATCH 47/90] update value-log --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 19432559..71342e33 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ quick_cache = { version = "0.6.5", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" tempfile = "3.12.0" -value-log = "1.3.0" +value-log = "1.4.0" varint-rs = "2.2.0" xxhash-rust = { version = "0.8.12", features = ["xxh3"] } From 891011cd31134676db189b440d68cf32222352a2 Mon Sep 17 00:00:00 2001 From: Marvin <33938500+marvin-j97@users.noreply.github.com> Date: Wed, 4 Dec 2024 23:29:01 +0100 Subject: [PATCH 48/90] Update mod.rs --- src/level_manifest/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 1a58908e..18802a97 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -25,20 +25,20 @@ use std::{ type Levels = Vec>; -/// Represents the levels of a log-structured merge tree. +/// Represents the levels of a log-structured merge tree pub struct LevelManifest { - /// Path of level manifest file + /// Path of level manifest file. path: PathBuf, - /// Actual levels containing segments + /// Actual levels containing segments. #[doc(hidden)] pub levels: Levels, - /// Set of segment IDs that are masked + /// Set of segment IDs that are masked. /// /// While consuming segments (because of compaction) they will not appear in the list of segments - /// as to not cause conflicts between multiple compaction threads (compacting the same segments) - hidden_set: hidden_set::HiddenSet, + /// as to not cause conflicts between multiple compaction threads (compacting the same segments). + hidden_set: HiddenSet, is_disjoint: bool, } From eeab262e645611e98d16a69e65f27e5cb37abf7e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:10:37 +0100 Subject: [PATCH 49/90] cleanup --- src/compaction/leveled.rs | 87 ++++++++++++++++++++------------ src/compaction/tiered.rs | 2 +- src/compaction/worker.rs | 2 +- src/level_manifest/hidden_set.rs | 6 +-- src/level_manifest/mod.rs | 16 +++--- 5 files changed, 65 insertions(+), 48 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 08295e7a..9a9319b6 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -11,12 +11,13 @@ use crate::{ HashSet, SegmentId, }; +/// Aggregates the key range of a list of segments. fn aggregate_key_range(segments: &[Segment]) -> KeyRange { KeyRange::aggregate(segments.iter().map(|x| &x.metadata.key_range)) } -// TODO: Currently does not take in `overshoot` -// TODO: Need to make sure compactions are not too small either +/// Tries to find the most optimal compaction set from +/// one level into the other. fn pick_minimal_compaction( curr_level: &Level, next_level: &Level, @@ -26,27 +27,32 @@ fn pick_minimal_compaction( // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); + struct Choice { + write_amp: f32, + segment_ids: HashSet, + can_trivial_move: bool, + } + let mut choices = vec![]; - let mut add_choice = - |write_amp: f32, segment_ids: HashSet, can_trivial_move: bool| { - let mut valid_choice = true; + let mut add_choice = |choice: Choice| { + let mut valid_choice = true; - // IMPORTANT: Compaction is blocked because of other - // on-going compaction - valid_choice &= !segment_ids.iter().any(|x| hidden_set.contains(*x)); + // IMPORTANT: Compaction is blocked because of other + // on-going compaction + valid_choice &= !choice.segment_ids.iter().any(|x| hidden_set.is_hidden(*x)); - // NOTE: Keep compactions with 25 or less segments - // to make compactions not too large - // - // TODO: ideally, if a level has a lot of compaction debt - // compactions could be parallelized as long as they don't overlap in key range - valid_choice &= segment_ids.len() <= 25; + // NOTE: Keep compactions with 25 or less segments + // to make compactions not too large + // + // TODO: ideally, if a level has a lot of compaction debt + // compactions could be parallelized as long as they don't overlap in key range + valid_choice &= choice.segment_ids.len() <= 25; - if valid_choice { - choices.push((write_amp, segment_ids, can_trivial_move)); - } - }; + if valid_choice { + choices.push(choice); + } + }; for size in 1..=next_level.len() { let windows = next_level.windows(size); @@ -55,7 +61,7 @@ fn pick_minimal_compaction( if window .iter() .map(|x| x.metadata.id) - .any(|x| hidden_set.contains(x)) + .any(|x| hidden_set.is_hidden(x)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction @@ -93,7 +99,7 @@ fn pick_minimal_compaction( if curr_level_pull_in .iter() .map(|x| x.metadata.id) - .any(|x| hidden_set.contains(x)) + .any(|x| hidden_set.is_hidden(x)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction @@ -115,7 +121,11 @@ fn pick_minimal_compaction( let write_amp = (next_level_size as f32) / (curr_level_size as f32); - add_choice(write_amp, segment_ids, false); + add_choice(Choice { + write_amp, + segment_ids, + can_trivial_move: false, + }); } } } @@ -130,23 +140,29 @@ fn pick_minimal_compaction( let key_range = aggregate_key_range(window); if next_level.overlapping_segments(&key_range).next().is_none() { - add_choice(0.0, segment_ids, true); + add_choice(Choice { + write_amp: 0.0, + segment_ids, + can_trivial_move: true, + }); } } } - let minimum_effort_choice = choices - .into_iter() - .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); + let minimum_effort_choice = choices.into_iter().min_by(|a, b| { + a.write_amp + .partial_cmp(&b.write_amp) + .unwrap_or(std::cmp::Ordering::Equal) + }); - minimum_effort_choice.map(|(_, set, can_trivial_move)| (set, can_trivial_move)) + minimum_effort_choice.map(|c| (c.segment_ids, c.can_trivial_move)) } /// Levelled compaction strategy (LCS) /// /// If a level reaches some threshold size, parts of it are merged into overlapping segments in the next level. /// -/// Each level Ln for n >= 1 can have up to ratio^n segments. +/// Each level Ln for n >= 2 can have up to `level_base_size * ratio^n` segments. /// /// LCS suffers from comparatively high write amplification, but has decent read & space amplification. /// @@ -199,6 +215,14 @@ impl Default for Strategy { } impl Strategy { + /// Calculates the level target size. + /// + /// L1 = `level_base_size` + /// + /// L2 = `level_base_size * ratio` + /// + /// L3 = `level_base_size * ratio * ratio` + /// ... fn level_target_size(&self, level_idx: u8) -> u64 { assert!(level_idx >= 1, "level_target_size does not apply to L0"); @@ -231,7 +255,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.segment_hidden(x.metadata.id)) + .filter(|x| !levels.hidden_set().is_hidden(x.metadata.id)) .map(|x| x.metadata.file_size) .sum(); @@ -245,7 +269,7 @@ impl CompactionStrategy for Strategy { }; let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, levels.hidden_segments(), overshoot) + pick_minimal_compaction(level, next_level, levels.hidden_set(), overshoot) else { break; }; @@ -392,9 +416,6 @@ mod tests { use std::{path::Path, sync::Arc}; use test_log::test; - #[cfg(feature = "bloom")] - use crate::bloom::BloomFilter; - fn string_key_range(a: &str, b: &str) -> KeyRange { KeyRange::new((a.as_bytes().into(), b.as_bytes().into())) } @@ -451,7 +472,7 @@ mod tests { block_cache, #[cfg(feature = "bloom")] - bloom_filter: Some(BloomFilter::with_fp_rate(1, 0.1)), + bloom_filter: Some(crate::bloom::BloomFilter::with_fp_rate(1, 0.1)), } .into() } diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index 92c9990c..bbe32fbd 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -74,7 +74,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.segment_hidden(x.metadata.id)) + .filter(|x| !levels.hidden_set().is_hidden(x.metadata.id)) .map(|x| x.metadata.file_size) .sum(); diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 37ac7d2f..ea8e544c 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -204,7 +204,7 @@ fn merge_segments( if payload .segment_ids .iter() - .any(|id| levels.segment_hidden(*id)) + .any(|id| levels.hidden_set().is_hidden(*id)) { log::warn!("Compaction task contained hidden segments, declining to run it"); return Ok(()); diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs index 2aa34e6a..8c8c64c8 100644 --- a/src/level_manifest/hidden_set.rs +++ b/src/level_manifest/hidden_set.rs @@ -21,17 +21,17 @@ impl Default for HiddenSet { } impl HiddenSet { - pub(super) fn hide>(&mut self, keys: T) { + pub(crate) fn hide>(&mut self, keys: T) { self.set.extend(keys); } - pub(super) fn show>(&mut self, keys: T) { + pub(crate) fn show>(&mut self, keys: T) { for key in keys { self.set.remove(&key); } } - pub(crate) fn contains(&self, key: SegmentId) -> bool { + pub(crate) fn is_hidden(&self, key: SegmentId) -> bool { self.set.contains(&key) } diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 18802a97..2d3f247a 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -62,7 +62,7 @@ impl std::fmt::Display for LevelManifest { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().take(2) { let id = segment.metadata.id; - let is_hidden = self.segment_hidden(id); + let is_hidden = self.hidden_set.is_hidden(id); write!( f, @@ -76,7 +76,7 @@ impl std::fmt::Display for LevelManifest { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().rev().take(2).rev() { let id = segment.metadata.id; - let is_hidden = self.segment_hidden(id); + let is_hidden = self.hidden_set.is_hidden(id); write!( f, @@ -88,7 +88,7 @@ impl std::fmt::Display for LevelManifest { } else { for segment in &level.segments { let id = segment.metadata.id; - let is_hidden = self.segment_hidden(id); + let is_hidden = self.hidden_set.is_hidden(id); write!( f, @@ -373,7 +373,7 @@ impl LevelManifest { HashSet::with_capacity_and_hasher(self.len(), xxhash_rust::xxh3::Xxh3Builder::new()); for (idx, level) in self.levels.iter().enumerate() { - if level.ids().any(|id| self.segment_hidden(id)) { + if level.ids().any(|id| self.hidden_set.is_hidden(id)) { // NOTE: Level count is u8 #[allow(clippy::cast_possible_truncation)] output.insert(idx as u8); @@ -390,7 +390,7 @@ impl LevelManifest { for raw_level in &self.levels { let mut level = raw_level.iter().cloned().collect::>(); - level.retain(|x| !self.segment_hidden(x.metadata.id)); + level.retain(|x| !self.hidden_set.is_hidden(x.metadata.id)); output.push(Level { segments: level, @@ -415,11 +415,7 @@ impl LevelManifest { output } - pub(crate) fn segment_hidden(&self, key: SegmentId) -> bool { - self.hidden_set.contains(key) - } - - pub(crate) fn hidden_segments(&self) -> &HiddenSet { + pub(crate) fn hidden_set(&self) -> &HiddenSet { &self.hidden_set } From 27e7b4d6b6c2e908257d2aefba23aaf20333fed4 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:22:11 +0100 Subject: [PATCH 50/90] refactor --- src/compaction/fifo.rs | 4 +++ src/compaction/leveled.rs | 8 +++-- src/compaction/maintenance.rs | 4 +++ src/compaction/major.rs | 4 +++ src/compaction/mod.rs | 4 +++ src/compaction/pulldown.rs | 4 +++ src/compaction/tiered.rs | 4 +++ src/compaction/worker.rs | 55 +++++++++++++++++++++++------------ src/level_manifest/mod.rs | 7 +++++ 9 files changed, 74 insertions(+), 20 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 0942f34a..4306944f 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -40,6 +40,10 @@ impl Strategy { } impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "FifoStrategy" + } + fn choose(&self, levels: &LevelManifest, config: &Config) -> Choice { let resolved_view = levels.resolved_view(); diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 9a9319b6..217ee1ce 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -160,11 +160,11 @@ fn pick_minimal_compaction( /// Levelled compaction strategy (LCS) /// -/// If a level reaches some threshold size, parts of it are merged into overlapping segments in the next level. +/// When a level reaches some threshold size, parts of it are merged into overlapping segments in the next level. /// /// Each level Ln for n >= 2 can have up to `level_base_size * ratio^n` segments. /// -/// LCS suffers from comparatively high write amplification, but has decent read & space amplification. +/// LCS suffers from comparatively high write amplification, but has decent read amplification and great space amplification (~1.1x). /// /// LCS is the recommended compaction strategy to use. /// @@ -233,6 +233,10 @@ impl Strategy { } impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "LeveledStrategy" + } + #[allow(clippy::too_many_lines)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { let view = &levels.levels; diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index d6c8598c..6dd21edc 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -43,6 +43,10 @@ pub fn choose_least_effort_compaction(segments: &[Segment], n: usize) -> HashSet } impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "MaintenanceStrategy" + } + fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { let resolved_view = levels.resolved_view(); diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 995d7ae2..61110c43 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -35,6 +35,10 @@ impl Default for Strategy { } impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "MajorCompaction" + } + fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { let segment_ids = levels.iter().map(|x| x.metadata.id).collect(); diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 9222fdd9..a25a4291 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -69,6 +69,10 @@ pub enum Choice { /// and emits a choice on what to do. #[allow(clippy::module_name_repetitions)] pub trait CompactionStrategy { + // TODO: could be : Display instead + /// Gets the compaction strategy name. + fn get_name(&self) -> &'static str; + /// Decides on what to do based on the current state of the LSM-tree's levels fn choose(&self, _: &LevelManifest, config: &Config) -> Choice; } diff --git a/src/compaction/pulldown.rs b/src/compaction/pulldown.rs index 826a16fe..af70fc62 100644 --- a/src/compaction/pulldown.rs +++ b/src/compaction/pulldown.rs @@ -11,6 +11,10 @@ use crate::{level_manifest::LevelManifest, Config, HashSet}; pub struct Strategy(pub u8, pub u8); impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "PullDownCompaction" + } + #[allow(clippy::expect_used)] fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { let resolved_view = levels.resolved_view(); diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index bbe32fbd..cc3f431e 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -50,6 +50,10 @@ impl Default for Strategy { } impl CompactionStrategy for Strategy { + fn get_name(&self) -> &'static str { + "TieredStrategy" + } + fn choose(&self, levels: &LevelManifest, config: &Config) -> Choice { let resolved_view = levels.resolved_view(); diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index ea8e544c..dc2fe5be 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -41,15 +41,14 @@ pub struct Options { /// Sealed memtables (required for temporarily locking). pub sealed_memtables: Arc>, - /// Compaction strategy. - /// - /// The one inside `config` is NOT used. + /// Compaction strategy to use. pub strategy: Arc, - /// Stop signal + /// Stop signal to interrupt a compaction worker in case + /// the tree is dropped. pub stop_signal: StopSignal, - /// Evicts items that are older than this seqno + /// Evicts items that are older than this seqno (MVCC GC). pub eviction_seqno: u64, } @@ -75,14 +74,17 @@ pub fn do_compaction(opts: &Options) -> crate::Result<()> { log::trace!("compactor: acquiring levels manifest lock"); let original_levels = opts.levels.write().expect("lock is poisoned"); - log::trace!("compactor: consulting compaction strategy"); + log::trace!( + "compactor: consulting compaction strategy {:?}", + opts.strategy.get_name(), + ); let choice = opts.strategy.choose(&original_levels, &opts.config); log::debug!("compactor: choice: {choice:?}"); match choice { Choice::Merge(payload) => merge_segments(original_levels, opts, &payload), - Choice::Move(payload) => move_segments(original_levels, payload), + Choice::Move(payload) => move_segments(original_levels, opts, payload), Choice::Drop(payload) => drop_segments( original_levels, opts, @@ -168,8 +170,18 @@ fn create_compaction_stream<'a>( fn move_segments( mut levels: RwLockWriteGuard<'_, LevelManifest>, + opts: &Options, payload: CompactionPayload, ) -> crate::Result<()> { + // Fail-safe for buggy compaction strategies + if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { + log::warn!( + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + opts.strategy.get_name(), + ); + return Ok(()); + } + let segment_map = levels.get_all_segments(); levels.atomic_swap(|recipe| { @@ -199,14 +211,12 @@ fn merge_segments( return Ok(()); } - // TODO: this sometimes runs, but shouldn't be possible - // TODO: because we have a mutex when hiding & showing segments and checking compaction strategy... - if payload - .segment_ids - .iter() - .any(|id| levels.hidden_set().is_hidden(*id)) - { - log::warn!("Compaction task contained hidden segments, declining to run it"); + // Fail-safe for buggy compaction strategies + if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { + log::warn!( + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + opts.strategy.get_name(), + ); return Ok(()); } @@ -470,10 +480,19 @@ fn merge_segments( } fn drop_segments( - mut original_levels: RwLockWriteGuard<'_, LevelManifest>, + mut levels: RwLockWriteGuard<'_, LevelManifest>, opts: &Options, segment_ids: &[GlobalSegmentId], ) -> crate::Result<()> { + // Fail-safe for buggy compaction strategies + if levels.should_decline_compaction(segment_ids.iter().map(GlobalSegmentId::segment_id)) { + log::warn!( + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + opts.strategy.get_name(), +); + return Ok(()); + } + let segments_base_folder = opts.config.path.join(SEGMENTS_FOLDER); // IMPORTANT: Write lock memtable, otherwise segments may get deleted while a range read is happening @@ -482,7 +501,7 @@ fn drop_segments( // IMPORTANT: Write the segment with the removed segments first // Otherwise the folder is deleted, but the segment is still referenced! - original_levels.atomic_swap(|recipe| { + levels.atomic_swap(|recipe| { for key in segment_ids { let segment_id = key.segment_id(); log::trace!("Removing segment {segment_id}"); @@ -494,7 +513,7 @@ fn drop_segments( })?; drop(memtable_lock); - drop(original_levels); + drop(levels); // NOTE: If the application were to crash >here< it's fine // The segments are not referenced anymore, and will be diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 2d3f247a..ce9a8505 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -415,6 +415,13 @@ impl LevelManifest { output } + pub(crate) fn should_decline_compaction>( + &self, + ids: T, + ) -> bool { + ids.into_iter().any(|id| self.hidden_set().is_hidden(id)) + } + pub(crate) fn hidden_set(&self) -> &HiddenSet { &self.hidden_set } From 46f2d83199262c6c762bf1f7b71e8c1cab5aea0b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:22:27 +0100 Subject: [PATCH 51/90] fmt --- src/compaction/worker.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index dc2fe5be..570c276c 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -487,9 +487,9 @@ fn drop_segments( // Fail-safe for buggy compaction strategies if levels.should_decline_compaction(segment_ids.iter().map(GlobalSegmentId::segment_id)) { log::warn!( - "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", - opts.strategy.get_name(), -); + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + opts.strategy.get_name(), + ); return Ok(()); } From c219ff2475d51e35053bfe3b50c4a3886ad36008 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:22:55 +0100 Subject: [PATCH 52/90] wip --- src/compaction/worker.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 570c276c..03349c77 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -176,7 +176,7 @@ fn move_segments( // Fail-safe for buggy compaction strategies if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { log::warn!( - "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), ); return Ok(()); @@ -214,7 +214,7 @@ fn merge_segments( // Fail-safe for buggy compaction strategies if levels.should_decline_compaction(payload.segment_ids.iter().copied()) { log::warn!( - "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), ); return Ok(()); @@ -487,7 +487,7 @@ fn drop_segments( // Fail-safe for buggy compaction strategies if levels.should_decline_compaction(segment_ids.iter().map(GlobalSegmentId::segment_id)) { log::warn!( - "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree", + "Compaction task created by {:?} contained hidden segments, declining to run it - please report this at https://github.com/fjall-rs/lsm-tree/issues/new?template=bug_report.md", opts.strategy.get_name(), ); return Ok(()); From 0cd43e6cb0a7d46283f617312b7fee5a34728b0b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:30:21 +0100 Subject: [PATCH 53/90] refactor --- src/compaction/worker.rs | 14 ++++++-------- src/level_manifest/level.rs | 15 ++++++++++++--- src/level_manifest/mod.rs | 14 +++++++------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 03349c77..69905f7c 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -182,18 +182,16 @@ fn move_segments( return Ok(()); } - let segment_map = levels.get_all_segments(); - levels.atomic_swap(|recipe| { for segment_id in payload.segment_ids { - if let Some(segment) = segment_map.get(&segment_id).cloned() { - for level in recipe.iter_mut() { - level.remove(segment_id); - } - + if let Some(segment) = recipe.iter_mut().find_map(|x| x.remove(segment_id)) { + #[allow( + clippy::expect_used, + reason = "destination level should definitely exist" + )] recipe .get_mut(payload.dest_level as usize) - .expect("destination level should exist") + .expect("should exist") .insert(segment); } } diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index c3867608..3281b46f 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -69,9 +69,18 @@ impl Level { self.update_metadata(); } - pub fn remove(&mut self, segment_id: SegmentId) { - self.segments.retain(|x| segment_id != x.metadata.id); - self.update_metadata(); + pub fn remove(&mut self, segment_id: SegmentId) -> Option { + if let Some(idx) = self + .segments + .iter() + .position(|x| x.metadata.id == segment_id) + { + let segment = self.segments.remove(idx); + self.update_metadata(); + Some(segment) + } else { + None + } } pub(crate) fn sort(&mut self) { diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index ce9a8505..4f0223c5 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -405,15 +405,15 @@ impl LevelManifest { LevelManifestIterator::new(self) } - pub(crate) fn get_all_segments(&self) -> HashMap { - let mut output = HashMap::with_hasher(xxhash_rust::xxh3::Xxh3Builder::new()); + // pub(crate) fn get_all_segments(&self) -> HashMap { + // let mut output = HashMap::with_hasher(xxhash_rust::xxh3::Xxh3Builder::new()); - for segment in self.iter().cloned() { - output.insert(segment.metadata.id, segment); - } + // for segment in self.iter().cloned() { + // output.insert(segment.metadata.id, segment); + // } - output - } + // output + // } pub(crate) fn should_decline_compaction>( &self, From 69b40385d37a01d90a3658a3998e24601abf3700 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:32:47 +0100 Subject: [PATCH 54/90] clippy --- src/level_manifest/hidden_set.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs index 8c8c64c8..4dc78c7d 100644 --- a/src/level_manifest/hidden_set.rs +++ b/src/level_manifest/hidden_set.rs @@ -8,7 +8,7 @@ use crate::HashSet; /// /// If a compaction task fails, the segments are shown again (removed from the hidden set). #[derive(Clone)] -pub(crate) struct HiddenSet { +pub struct HiddenSet { pub(crate) set: HashSet, } From d4246ae6f951df553660b155923224d79e81cc55 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:33:26 +0100 Subject: [PATCH 55/90] wip --- src/compaction/worker.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 69905f7c..43bfd2f8 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -185,10 +185,8 @@ fn move_segments( levels.atomic_swap(|recipe| { for segment_id in payload.segment_ids { if let Some(segment) = recipe.iter_mut().find_map(|x| x.remove(segment_id)) { - #[allow( - clippy::expect_used, - reason = "destination level should definitely exist" - )] + // NOTE: Destination level should definitely exist + #[allow(clippy::expect_used)] recipe .get_mut(payload.dest_level as usize) .expect("should exist") From 95ccb2b7728065a06d08d525b1d5e2c5bb224b1a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:38:50 +0100 Subject: [PATCH 56/90] refactor --- src/level_manifest/mod.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 4f0223c5..0b4151f6 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -405,16 +405,6 @@ impl LevelManifest { LevelManifestIterator::new(self) } - // pub(crate) fn get_all_segments(&self) -> HashMap { - // let mut output = HashMap::with_hasher(xxhash_rust::xxh3::Xxh3Builder::new()); - - // for segment in self.iter().cloned() { - // output.insert(segment.metadata.id, segment); - // } - - // output - // } - pub(crate) fn should_decline_compaction>( &self, ids: T, From 1ee227219268b64e4857dc6c2a301f76bc984003 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:40:09 +0100 Subject: [PATCH 57/90] remove unneeded struct --- src/level_manifest/iter.rs | 47 -------------------------------------- src/level_manifest/mod.rs | 4 +--- 2 files changed, 1 insertion(+), 50 deletions(-) delete mode 100644 src/level_manifest/iter.rs diff --git a/src/level_manifest/iter.rs b/src/level_manifest/iter.rs deleted file mode 100644 index a1727186..00000000 --- a/src/level_manifest/iter.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2024-present, fjall-rs -// This source code is licensed under both the Apache 2.0 and MIT License -// (found in the LICENSE-* files in the repository) - -use super::LevelManifest; -use crate::Segment; - -/// Iterates through all levels -pub struct LevelManifestIterator<'a> { - level_manifest: &'a LevelManifest, - current_level: usize, - current_idx: usize, -} - -impl<'a> LevelManifestIterator<'a> { - #[must_use] - pub fn new(level_manifest: &'a LevelManifest) -> Self { - Self { - level_manifest, - current_idx: 0, - current_level: 0, - } - } -} - -impl<'a> Iterator for LevelManifestIterator<'a> { - type Item = &'a Segment; - - fn next(&mut self) -> Option { - loop { - let segment = self - .level_manifest - .levels - .get(self.current_level)? - .segments - .get(self.current_idx); - - if let Some(segment) = segment { - self.current_idx += 1; - return Some(segment); - } - - self.current_level += 1; - self.current_idx = 0; - } - } -} diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 0b4151f6..166740b5 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -3,7 +3,6 @@ // (found in the LICENSE-* files in the repository) pub(crate) mod hidden_set; -pub mod iter; pub(crate) mod level; use crate::{ @@ -15,7 +14,6 @@ use crate::{ }; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use hidden_set::HiddenSet; -use iter::LevelManifestIterator; use level::Level; use std::{ io::{Cursor, Read, Write}, @@ -402,7 +400,7 @@ impl LevelManifest { } pub fn iter(&self) -> impl Iterator + '_ { - LevelManifestIterator::new(self) + self.levels.iter().flat_map(|x| &x.segments) } pub(crate) fn should_decline_compaction>( From dde29cf4cfbc8bfa61cfefa9f4ca6289d196c433 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 01:43:12 +0100 Subject: [PATCH 58/90] wip --- src/level_manifest/level.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 3281b46f..19eae82f 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -49,11 +49,6 @@ impl Default for Level { } impl Level { - // TODO: unit test - fn set_key_range(&mut self) { - todo!() - } - pub fn list_ids(&self) -> HashSet { self.segments.iter().map(|x| x.metadata.id).collect() } @@ -61,7 +56,6 @@ impl Level { pub fn update_metadata(&mut self) { self.set_disjoint_flag(); self.sort(); - // self.set_key_range(); } pub fn insert(&mut self, segment: Segment) { From 5806a1404316e68a3392cde3b71a8cd297fe3666 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 02:03:20 +0100 Subject: [PATCH 59/90] refactor --- src/compaction/fifo.rs | 11 ++++------- src/compaction/leveled.rs | 25 ++++++++----------------- src/compaction/maintenance.rs | 2 +- src/compaction/major.rs | 4 ++-- src/compaction/pulldown.rs | 6 +++--- src/compaction/tiered.rs | 10 +++------- src/compaction/worker.rs | 17 ++++++++--------- src/level_manifest/hidden_set.rs | 4 ++++ src/level_manifest/level.rs | 19 +++++++------------ src/level_manifest/mod.rs | 19 ++++++++----------- src/segment/mod.rs | 26 ++++++++++++++++---------- src/tree/mod.rs | 19 +++++++------------ tests/tree_recover_counter.rs | 4 ++-- 13 files changed, 73 insertions(+), 93 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 4306944f..2db6665a 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -62,11 +62,8 @@ impl CompactionStrategy for Strategy { let lifetime_sec = lifetime_us / 1000 / 1000; if lifetime_sec > ttl_seconds.into() { - log::warn!( - "segment is older than configured TTL: {:?}", - segment.metadata.id, - ); - segment_ids_to_delete.insert(segment.metadata.id); + log::warn!("segment is older than configured TTL: {:?}", segment.id(),); + segment_ids_to_delete.insert(segment.id()); } } } @@ -90,11 +87,11 @@ impl CompactionStrategy for Strategy { bytes_to_delete = bytes_to_delete.saturating_sub(segment.metadata.file_size); - segment_ids_to_delete.insert(segment.metadata.id); + segment_ids_to_delete.insert(segment.id()); log::debug!( "dropping segment to reach configured size limit: {:?}", - segment.metadata.id, + segment.id(), ); } } diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 217ee1ce..ed294e1c 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -58,11 +58,7 @@ fn pick_minimal_compaction( let windows = next_level.windows(size); for window in windows { - if window - .iter() - .map(|x| x.metadata.id) - .any(|x| hidden_set.is_hidden(x)) - { + if hidden_set.is_blocked(window.iter().map(Segment::id)) { // IMPORTANT: Compaction is blocked because of other // on-going compaction continue; @@ -96,11 +92,7 @@ fn pick_minimal_compaction( curr_level.overlapping_segments(&key_range).collect() }; - if curr_level_pull_in - .iter() - .map(|x| x.metadata.id) - .any(|x| hidden_set.is_hidden(x)) - { + if hidden_set.is_blocked(curr_level_pull_in.iter().map(|x| x.id())) { // IMPORTANT: Compaction is blocked because of other // on-going compaction continue; @@ -116,8 +108,8 @@ fn pick_minimal_compaction( if curr_level_size >= overshoot { let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); - let mut segment_ids: HashSet<_> = window.iter().map(|x| x.metadata.id).collect(); - segment_ids.extend(curr_level_pull_in.iter().map(|x| x.metadata.id)); + let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect(); + segment_ids.extend(curr_level_pull_in.iter().map(|x| x.id())); let write_amp = (next_level_size as f32) / (curr_level_size as f32); @@ -135,7 +127,7 @@ fn pick_minimal_compaction( let windows = curr_level.windows(size); for window in windows { - let segment_ids: HashSet = window.iter().map(|x| x.metadata.id).collect(); + let segment_ids: HashSet = window.iter().map(Segment::id).collect(); let key_range = aggregate_key_range(window); @@ -259,7 +251,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.metadata.id)) + .filter(|x| !levels.hidden_set().is_hidden(x.id())) .map(|x| x.metadata.file_size) .sum(); @@ -367,15 +359,14 @@ impl CompactionStrategy for Strategy { return Choice::DoNothing; }; - let mut segment_ids: HashSet = - level.iter().map(|x| x.metadata.id).collect(); + let mut segment_ids: HashSet = level.iter().map(Segment::id).collect(); // Get overlapping segments in next level let key_range = aggregate_key_range(&level); let next_level_overlapping_segment_ids: Vec<_> = next_level .overlapping_segments(&key_range) - .map(|x| x.metadata.id) + .map(Segment::id) .collect(); segment_ids.extend(&next_level_overlapping_segment_ids); diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index 6dd21edc..64708485 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -39,7 +39,7 @@ pub fn choose_least_effort_compaction(segments: &[Segment], n: usize) -> HashSet .min_by_key(|window| window.iter().map(|s| s.metadata.file_size).sum::()) .expect("should have at least one window"); - window.iter().map(|x| x.metadata.id).collect() + window.iter().map(Segment::id).collect() } impl CompactionStrategy for Strategy { diff --git a/src/compaction/major.rs b/src/compaction/major.rs index 61110c43..b27ddcf7 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{config::Config, level_manifest::LevelManifest}; +use crate::{config::Config, level_manifest::LevelManifest, Segment}; /// Major compaction /// @@ -40,7 +40,7 @@ impl CompactionStrategy for Strategy { } fn choose(&self, levels: &LevelManifest, _: &Config) -> Choice { - let segment_ids = levels.iter().map(|x| x.metadata.id).collect(); + let segment_ids = levels.iter().map(Segment::id).collect(); Choice::Merge(CompactionInput { segment_ids, diff --git a/src/compaction/pulldown.rs b/src/compaction/pulldown.rs index af70fc62..9698e275 100644 --- a/src/compaction/pulldown.rs +++ b/src/compaction/pulldown.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input}; -use crate::{level_manifest::LevelManifest, Config, HashSet}; +use crate::{level_manifest::LevelManifest, Config, HashSet, Segment}; /// Pulls down and merges a level into the destination level. /// @@ -27,9 +27,9 @@ impl CompactionStrategy for Strategy { .get(usize::from(self.1)) .expect("next level should exist"); - let mut segment_ids: HashSet<_> = level.segments.iter().map(|x| x.metadata.id).collect(); + let mut segment_ids: HashSet<_> = level.segments.iter().map(Segment::id).collect(); - segment_ids.extend(next_level.segments.iter().map(|x| x.metadata.id)); + segment_ids.extend(next_level.segments.iter().map(Segment::id)); Choice::Merge(Input { segment_ids, diff --git a/src/compaction/tiered.rs b/src/compaction/tiered.rs index cc3f431e..a7a70bc7 100644 --- a/src/compaction/tiered.rs +++ b/src/compaction/tiered.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::{Choice, CompactionStrategy, Input as CompactionInput}; -use crate::{level_manifest::LevelManifest, Config}; +use crate::{level_manifest::LevelManifest, Config, Segment}; fn desired_level_size_in_bytes(level_idx: u8, ratio: u8, base_size: u32) -> usize { (ratio as usize).pow(u32::from(level_idx + 1)) * (base_size as usize) @@ -78,7 +78,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.metadata.id)) + .filter(|x| !levels.hidden_set().is_hidden(x.id())) .map(|x| x.metadata.file_size) .sum(); @@ -102,11 +102,7 @@ impl CompactionStrategy for Strategy { segments_to_compact.push(segment); } - let segment_ids = segments_to_compact - .iter() - .map(|x| &x.metadata.id) - .copied() - .collect(); + let segment_ids = segments_to_compact.iter().map(Segment::id).collect(); return Choice::Merge(CompactionInput { segment_ids, diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 43bfd2f8..64630cb5 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -120,7 +120,7 @@ fn create_compaction_stream<'a>( .segments .iter() .enumerate() - .filter(|(_, segment)| to_compact.contains(&segment.metadata.id)) + .filter(|(_, segment)| to_compact.contains(&segment.id())) .min_by(|(a, _), (b, _)| a.cmp(b)) .map(|(idx, _)| idx) else { @@ -131,7 +131,7 @@ fn create_compaction_stream<'a>( .segments .iter() .enumerate() - .filter(|(_, segment)| to_compact.contains(&segment.metadata.id)) + .filter(|(_, segment)| to_compact.contains(&segment.id())) .max_by(|(a, _), (b, _)| a.cmp(b)) .map(|(idx, _)| idx) else { @@ -148,7 +148,7 @@ fn create_compaction_stream<'a>( found += hi - lo + 1; } else { for &id in to_compact { - if let Some(segment) = level.segments.iter().find(|x| x.metadata.id == id) { + if let Some(segment) = level.segments.iter().find(|x| x.id() == id) { found += 1; readers.push(Box::new( @@ -411,7 +411,7 @@ fn merge_segments( let swap_result = levels.atomic_swap(|recipe| { for segment in created_segments.iter().cloned() { - log::trace!("Persisting segment {}", segment.metadata.id); + log::trace!("Persisting segment {}", segment.id()); recipe .get_mut(payload.dest_level as usize) @@ -435,12 +435,11 @@ fn merge_segments( }; for segment in &created_segments { - let segment_file_path = segments_base_folder.join(segment.metadata.id.to_string()); + let segment_file_path = segments_base_folder.join(segment.id().to_string()); - opts.config.descriptor_table.insert( - &segment_file_path, - (opts.tree_id, segment.metadata.id).into(), - ); + opts.config + .descriptor_table + .insert(&segment_file_path, (opts.tree_id, segment.id()).into()); } // NOTE: Segments are registered, we can unlock the memtable(s) safely diff --git a/src/level_manifest/hidden_set.rs b/src/level_manifest/hidden_set.rs index 4dc78c7d..a37f7df1 100644 --- a/src/level_manifest/hidden_set.rs +++ b/src/level_manifest/hidden_set.rs @@ -31,6 +31,10 @@ impl HiddenSet { } } + pub(crate) fn is_blocked>(&self, ids: T) -> bool { + ids.into_iter().any(|id| self.is_hidden(id)) + } + pub(crate) fn is_hidden(&self, key: SegmentId) -> bool { self.set.contains(&key) } diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 19eae82f..782b23a1 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -23,8 +23,7 @@ pub struct Level { impl std::fmt::Display for Level { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { for segment in self.segments.iter().rev().take(2).rev() { - let id = segment.metadata.id; - write!(f, "[{id}]")?; + write!(f, "[{}]", segment.id())?; } Ok(()) } @@ -50,7 +49,7 @@ impl Default for Level { impl Level { pub fn list_ids(&self) -> HashSet { - self.segments.iter().map(|x| x.metadata.id).collect() + self.segments.iter().map(Segment::id).collect() } pub fn update_metadata(&mut self) { @@ -64,11 +63,7 @@ impl Level { } pub fn remove(&mut self, segment_id: SegmentId) -> Option { - if let Some(idx) = self - .segments - .iter() - .position(|x| x.metadata.id == segment_id) - { + if let Some(idx) = self.segments.iter().position(|x| x.id() == segment_id) { let segment = self.segments.remove(idx); self.update_metadata(); Some(segment) @@ -110,7 +105,7 @@ impl Level { /// Returns an iterator over the level's segment IDs. pub fn ids(&self) -> impl Iterator + '_ { - self.segments.iter().map(|x| x.metadata.id) + self.segments.iter().map(Segment::id) } /// Returns `true` if the level contains no segments. @@ -514,7 +509,7 @@ mod tests { Vec::::new(), level .overlapping_segments(&KeyRange::new((b"a".to_vec().into(), b"b".to_vec().into()))) - .map(|x| x.metadata.id) + .map(Segment::id) .collect::>(), ); @@ -522,7 +517,7 @@ mod tests { vec![1], level .overlapping_segments(&KeyRange::new((b"d".to_vec().into(), b"k".to_vec().into()))) - .map(|x| x.metadata.id) + .map(Segment::id) .collect::>(), ); @@ -530,7 +525,7 @@ mod tests { vec![1, 2], level .overlapping_segments(&KeyRange::new((b"f".to_vec().into(), b"x".to_vec().into()))) - .map(|x| x.metadata.id) + .map(Segment::id) .collect::>(), ); } diff --git a/src/level_manifest/mod.rs b/src/level_manifest/mod.rs index 166740b5..c89e2b0a 100644 --- a/src/level_manifest/mod.rs +++ b/src/level_manifest/mod.rs @@ -59,7 +59,7 @@ impl std::fmt::Display for LevelManifest { } else if level.segments.len() >= 30 { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().take(2) { - let id = segment.metadata.id; + let id = segment.id(); let is_hidden = self.hidden_set.is_hidden(id); write!( @@ -73,7 +73,7 @@ impl std::fmt::Display for LevelManifest { #[allow(clippy::indexing_slicing)] for segment in level.segments.iter().rev().take(2).rev() { - let id = segment.metadata.id; + let id = segment.id(); let is_hidden = self.hidden_set.is_hidden(id); write!( @@ -85,7 +85,7 @@ impl std::fmt::Display for LevelManifest { } } else { for segment in &level.segments { - let id = segment.metadata.id; + let id = segment.id(); let is_hidden = self.hidden_set.is_hidden(id); write!( @@ -221,16 +221,13 @@ impl LevelManifest { pub(crate) fn recover>(path: P, segments: Vec) -> crate::Result { let level_manifest = Self::load_level_manifest(&path)?; - let segments: HashMap<_, _> = segments - .into_iter() - .map(|seg| (seg.metadata.id, seg)) - .collect(); + let segments: HashMap<_, _> = segments.into_iter().map(|seg| (seg.id(), seg)).collect(); let levels = Self::resolve_levels(level_manifest, &segments); let mut manifest = Self { levels, - hidden_set: Default::default(), + hidden_set: HiddenSet::default(), path: path.as_ref().to_path_buf(), is_disjoint: false, }; @@ -388,7 +385,7 @@ impl LevelManifest { for raw_level in &self.levels { let mut level = raw_level.iter().cloned().collect::>(); - level.retain(|x| !self.hidden_set.is_hidden(x.metadata.id)); + level.retain(|x| !self.hidden_set.is_hidden(x.id())); output.push(Level { segments: level, @@ -407,7 +404,7 @@ impl LevelManifest { &self, ids: T, ) -> bool { - ids.into_iter().any(|id| self.hidden_set().is_hidden(id)) + self.hidden_set().is_blocked(ids) } pub(crate) fn hidden_set(&self) -> &HiddenSet { @@ -438,7 +435,7 @@ impl Encode for Vec { writer.write_u32::(level.segments.len() as u32)?; for segment in &level.segments { - writer.write_u64::(segment.metadata.id)?; + writer.write_u64::(segment.id())?; } } diff --git a/src/segment/mod.rs b/src/segment/mod.rs index bd820ba5..b193b57a 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -27,6 +27,7 @@ use crate::{ use block_index::BlockIndexImpl; use id::GlobalSegmentId; use inner::Inner; +use meta::SegmentId; use range::Range; use std::{ops::Bound, path::Path, sync::Arc}; @@ -64,15 +65,20 @@ impl std::ops::Deref for Segment { impl std::fmt::Debug for Segment { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Segment:{}({})", - self.metadata.id, self.metadata.key_range - ) + write!(f, "Segment:{}({})", self.id(), self.metadata.key_range) } } impl Segment { + /// Gets the segment ID. + /// + /// The segment ID is unique for this tree, but not + /// across multiple trees, use the global segment ID for that. + #[must_use] + pub fn id(&self) -> SegmentId { + self.metadata.id + } + pub(crate) fn verify(&self) -> crate::Result { use block::checksum::Checksum; use block_index::IndexBlock; @@ -83,7 +89,7 @@ impl Segment { let guard = self .descriptor_table - .access(&(self.tree_id, self.metadata.id).into())? + .access(&(self.tree_id, self.id()).into())? .expect("should have gotten file"); let mut file = guard.file.lock().expect("lock is poisoned"); @@ -176,7 +182,7 @@ impl Segment { if data_block_count != self.metadata.data_block_count { log::error!( "Not all data blocks were visited during verification of disk segment {:?}", - self.metadata.id + self.id(), ); broken_count += 1; } @@ -323,7 +329,7 @@ impl Segment { let Some(block) = ValueBlock::load_by_block_handle( &self.descriptor_table, &self.block_cache, - GlobalSegmentId::from((self.tree_id, self.metadata.id)), + GlobalSegmentId::from((self.tree_id, self.id())), first_block_handle, CachePolicy::Write, )? @@ -353,7 +359,7 @@ impl Segment { let mut reader = Reader::new( self.offsets.index_block_ptr, self.descriptor_table.clone(), - GlobalSegmentId::from((self.tree_id, self.metadata.id)), + GlobalSegmentId::from((self.tree_id, self.id())), self.block_cache.clone(), first_block_handle, None, @@ -473,7 +479,7 @@ impl Segment { Range::new( self.offsets.index_block_ptr, self.descriptor_table.clone(), - GlobalSegmentId::from((self.tree_id, self.metadata.id)), + GlobalSegmentId::from((self.tree_id, self.id())), self.block_cache.clone(), self.block_index.clone(), range, diff --git a/src/tree/mod.rs b/src/tree/mod.rs index f2dca810..1f4d1e76 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -208,8 +208,8 @@ impl AbstractTree for Tree { // eprintln!("{original_levels}"); for segment in segments { - log::trace!("releasing sealed memtable {}", segment.metadata.id); - sealed_memtables.remove(segment.metadata.id); + log::trace!("releasing sealed memtable {}", segment.id()); + sealed_memtables.remove(segment.id()); } Ok(()) @@ -524,10 +524,9 @@ impl Tree { } .into(); - self.config.descriptor_table.insert( - segment_file_path, - (self.id, created_segment.metadata.id).into(), - ); + self.config + .descriptor_table + .insert(segment_file_path, (self.id, created_segment.id()).into()); log::debug!("Flushed segment to {segment_folder:?}"); @@ -836,11 +835,7 @@ impl Tree { )?; levels.update_metadata(); - let highest_segment_id = levels - .iter() - .map(|x| x.metadata.id) - .max() - .unwrap_or_default(); + let highest_segment_id = levels.iter().map(Segment::id).max().unwrap_or_default(); let inner = TreeInner { id: tree_id, @@ -965,7 +960,7 @@ impl Tree { level_idx == 0 || level_idx == 1, )?; - descriptor_table.insert(&segment_file_path, (tree_id, segment.metadata.id).into()); + descriptor_table.insert(&segment_file_path, (tree_id, segment.id()).into()); segments.push(segment); log::debug!("Recovered segment from {segment_file_path:?}"); diff --git a/tests/tree_recover_counter.rs b/tests/tree_recover_counter.rs index 16f69cc8..88b54984 100644 --- a/tests/tree_recover_counter.rs +++ b/tests/tree_recover_counter.rs @@ -26,7 +26,7 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { { let first_level = &tree.levels.read().expect("lock is poisoned").levels[0]; - assert_eq!(0, first_level.segments[0].metadata.id); + assert_eq!(0, first_level.segments[0].id()); } tree.insert("b", "b", 0); @@ -41,7 +41,7 @@ fn tree_recover_segment_counter() -> lsm_tree::Result<()> { { let first_level = &tree.levels.read().expect("lock is poisoned").levels[0]; - assert_eq!(1, first_level.segments[1].metadata.id); + assert_eq!(1, first_level.segments[1].id()); } } From 85a3ff743749fc3af9b08f219872ece0cbbd8eda Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 5 Dec 2024 02:07:09 +0100 Subject: [PATCH 60/90] refactor --- src/compaction/worker.rs | 2 +- src/segment/mod.rs | 16 +++++++++++----- src/tree/mod.rs | 4 ++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 64630cb5..5bec5d36 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -439,7 +439,7 @@ fn merge_segments( opts.config .descriptor_table - .insert(&segment_file_path, (opts.tree_id, segment.id()).into()); + .insert(&segment_file_path, segment.global_id()); } // NOTE: Segments are registered, we can unlock the memtable(s) safely diff --git a/src/segment/mod.rs b/src/segment/mod.rs index b193b57a..d58c72ce 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -70,10 +70,16 @@ impl std::fmt::Debug for Segment { } impl Segment { + /// Gets the global segment ID. + #[must_use] + pub fn global_id(&self) -> GlobalSegmentId { + (self.tree_id, self.id()).into() + } + /// Gets the segment ID. /// /// The segment ID is unique for this tree, but not - /// across multiple trees, use the global segment ID for that. + /// across multiple trees, use [`Segment::global_id`] for that. #[must_use] pub fn id(&self) -> SegmentId { self.metadata.id @@ -89,7 +95,7 @@ impl Segment { let guard = self .descriptor_table - .access(&(self.tree_id, self.id()).into())? + .access(&self.global_id())? .expect("should have gotten file"); let mut file = guard.file.lock().expect("lock is poisoned"); @@ -329,7 +335,7 @@ impl Segment { let Some(block) = ValueBlock::load_by_block_handle( &self.descriptor_table, &self.block_cache, - GlobalSegmentId::from((self.tree_id, self.id())), + self.global_id(), first_block_handle, CachePolicy::Write, )? @@ -359,7 +365,7 @@ impl Segment { let mut reader = Reader::new( self.offsets.index_block_ptr, self.descriptor_table.clone(), - GlobalSegmentId::from((self.tree_id, self.id())), + self.global_id(), self.block_cache.clone(), first_block_handle, None, @@ -479,7 +485,7 @@ impl Segment { Range::new( self.offsets.index_block_ptr, self.descriptor_table.clone(), - GlobalSegmentId::from((self.tree_id, self.id())), + self.global_id(), self.block_cache.clone(), self.block_index.clone(), range, diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 1f4d1e76..4c34948d 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -526,7 +526,7 @@ impl Tree { self.config .descriptor_table - .insert(segment_file_path, (self.id, created_segment.id()).into()); + .insert(segment_file_path, created_segment.global_id()); log::debug!("Flushed segment to {segment_folder:?}"); @@ -960,7 +960,7 @@ impl Tree { level_idx == 0 || level_idx == 1, )?; - descriptor_table.insert(&segment_file_path, (tree_id, segment.id()).into()); + descriptor_table.insert(&segment_file_path, segment.global_id()); segments.push(segment); log::debug!("Recovered segment from {segment_file_path:?}"); From fb9813880bdc5edbbde337cd8826ab244a87b1c2 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 7 Dec 2024 02:07:27 +0100 Subject: [PATCH 61/90] compaction: remove overshoot checking again otherwise parallel compactions would not occur and overshoots that are too large would prevent compactions at all --- src/compaction/leveled.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index ed294e1c..d4997f09 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -22,7 +22,6 @@ fn pick_minimal_compaction( curr_level: &Level, next_level: &Level, hidden_set: &HiddenSet, - overshoot: u64, ) -> Option<(HashSet, bool)> { // assert!(curr_level.is_disjoint, "Lx is not disjoint"); // assert!(next_level.is_disjoint, "Lx+1 is not disjoint"); @@ -105,7 +104,7 @@ fn pick_minimal_compaction( // NOTE: Only consider compactions where we actually reach the amount // of bytes we need to merge - if curr_level_size >= overshoot { + if curr_level_size >= 1 { let next_level_size = window.iter().map(|x| x.metadata.file_size).sum::(); let mut segment_ids: HashSet<_> = window.iter().map(Segment::id).collect(); @@ -265,7 +264,7 @@ impl CompactionStrategy for Strategy { }; let Some((segment_ids, can_trivial_move)) = - pick_minimal_compaction(level, next_level, levels.hidden_set(), overshoot) + pick_minimal_compaction(level, next_level, levels.hidden_set()) else { break; }; From 4ada35b50c95ebd03387f72d40c9d2dcb096e76f Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 7 Dec 2024 19:59:32 +0100 Subject: [PATCH 62/90] add merge bench --- Cargo.toml | 6 ++++++ benches/merge.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 benches/merge.rs diff --git a/Cargo.toml b/Cargo.toml index 19432559..1971cd60 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,12 @@ harness = false path = "benches/tli.rs" required-features = [] +[[bench]] +name = "merge" +harness = false +path = "benches/merge.rs" +required-features = [] + [[bench]] name = "memtable" harness = false diff --git a/benches/merge.rs b/benches/merge.rs new file mode 100644 index 00000000..8c6b220f --- /dev/null +++ b/benches/merge.rs @@ -0,0 +1,40 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use lsm_tree::merge::{BoxedIterator, Merger}; +use lsm_tree::{InternalValue, Memtable}; +use nanoid::nanoid; + +fn merger(c: &mut Criterion) { + let memtables = (0..30) + .map(|_| { + let table = Memtable::default(); + + for _ in 0..100 { + table.insert(InternalValue::from_components( + nanoid!(), + vec![], + 0, + lsm_tree::ValueType::Value, + )); + } + + table + }) + .collect::>(); + + c.bench_function("Merger", |b| { + b.iter_with_large_drop(|| { + let iters = memtables + .iter() + .map(|x| x.iter().map(Ok)) + .map(|x| Box::new(x) as BoxedIterator<'_>) + .collect(); + + let merger = Merger::new(iters); + + assert_eq!(30 * 100, merger.count()); + }) + }); +} + +criterion_group!(benches, merger); +criterion_main!(benches); From 4c2754cdf151773a8e663ff4b87421ec31ec3fc6 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 7 Dec 2024 20:05:30 +0100 Subject: [PATCH 63/90] adjust bench --- benches/merge.rs | 52 +++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/benches/merge.rs b/benches/merge.rs index 8c6b220f..47f2b813 100644 --- a/benches/merge.rs +++ b/benches/merge.rs @@ -4,36 +4,38 @@ use lsm_tree::{InternalValue, Memtable}; use nanoid::nanoid; fn merger(c: &mut Criterion) { - let memtables = (0..30) - .map(|_| { - let table = Memtable::default(); + for num in [2, 4, 8, 16, 30] { + c.bench_function(&format!("Merge {num}"), |b| { + let memtables = (0..num) + .map(|_| { + let table = Memtable::default(); - for _ in 0..100 { - table.insert(InternalValue::from_components( - nanoid!(), - vec![], - 0, - lsm_tree::ValueType::Value, - )); - } + for _ in 0..100 { + table.insert(InternalValue::from_components( + nanoid!(), + vec![], + 0, + lsm_tree::ValueType::Value, + )); + } - table - }) - .collect::>(); + table + }) + .collect::>(); - c.bench_function("Merger", |b| { - b.iter_with_large_drop(|| { - let iters = memtables - .iter() - .map(|x| x.iter().map(Ok)) - .map(|x| Box::new(x) as BoxedIterator<'_>) - .collect(); + b.iter_with_large_drop(|| { + let iters = memtables + .iter() + .map(|x| x.iter().map(Ok)) + .map(|x| Box::new(x) as BoxedIterator<'_>) + .collect(); - let merger = Merger::new(iters); + let merger = Merger::new(iters); - assert_eq!(30 * 100, merger.count()); - }) - }); + assert_eq!(num * 100, merger.count()); + }) + }); + } } criterion_group!(benches, merger); From 26c6072607bf64bc043b58e1141a557841a172d0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 7 Dec 2024 20:17:12 +0100 Subject: [PATCH 64/90] add mvcc stream bench --- benches/merge.rs | 39 +++++++++++++++++++++++++++++++++++++-- src/lib.rs | 5 ++++- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/benches/merge.rs b/benches/merge.rs index 47f2b813..8072e893 100644 --- a/benches/merge.rs +++ b/benches/merge.rs @@ -1,6 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; use lsm_tree::merge::{BoxedIterator, Merger}; -use lsm_tree::{InternalValue, Memtable}; +use lsm_tree::{mvcc_stream::MvccStream, InternalValue, Memtable}; use nanoid::nanoid; fn merger(c: &mut Criterion) { @@ -38,5 +38,40 @@ fn merger(c: &mut Criterion) { } } -criterion_group!(benches, merger); +fn mvcc_stream(c: &mut Criterion) { + for num in [2, 4, 8, 16, 30] { + c.bench_function(&format!("MVCC stream {num} versions"), |b| { + let memtables = (0..num) + .map(|_| { + let table = Memtable::default(); + + for key in 'a'..='z' { + table.insert(InternalValue::from_components( + key.to_string(), + vec![], + num, + lsm_tree::ValueType::Value, + )); + } + + table + }) + .collect::>(); + + b.iter_with_large_drop(|| { + let iters = memtables + .iter() + .map(|x| x.iter().map(Ok)) + .map(|x| Box::new(x) as BoxedIterator<'_>) + .collect(); + + let merger = MvccStream::new(Merger::new(iters)); + + assert_eq!(26, merger.count()); + }) + }); + } +} + +criterion_group!(benches, merger, mvcc_stream); criterion_main!(benches); diff --git a/src/lib.rs b/src/lib.rs index 73ef002e..efdc8421 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -161,7 +161,10 @@ mod memtable; pub mod merge; mod multi_reader; -mod mvcc_stream; + +#[doc(hidden)] +pub mod mvcc_stream; + mod path; #[doc(hidden)] From f4b745c63cc2fe4135edece452f2fb151cd6e96d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 7 Dec 2024 20:24:21 +0100 Subject: [PATCH 65/90] wip --- src/key_range.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/key_range.rs b/src/key_range.rs index 0c87881c..e8c2e583 100644 --- a/src/key_range.rs +++ b/src/key_range.rs @@ -65,18 +65,21 @@ impl KeyRange { true } + /// Returns `true` if the key falls within this key range. pub fn contains_key>(&self, key: K) -> bool { let key = key.as_ref(); let (start, end) = &self.0; key >= *start && key <= *end } + /// Returns `true` if the `other` is fully contained in this range. pub fn contains_range(&self, other: &Self) -> bool { let (start1, end1) = &self.0; let (start2, end2) = &other.0; start1 <= start2 && end1 >= end2 } + /// Returns `true` if the `other` overlaps at least partially with this range. pub fn overlaps_with_key_range(&self, other: &Self) -> bool { let (start1, end1) = &self.0; let (start2, end2) = &other.0; From 5c0262832eb6b981ec970cb04ba4537c976ad26e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 7 Dec 2024 23:45:57 +0100 Subject: [PATCH 66/90] catch another ? in compaction worker, #87 --- src/compaction/worker.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 5bec5d36..25011f6a 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -243,7 +243,7 @@ fn merge_segments( let start = Instant::now(); - let mut segment_writer = MultiWriter::new( + let Ok(segment_writer) = MultiWriter::new( opts.segment_id_generator.clone(), payload.target_size, crate::segment::writer::Options { @@ -252,8 +252,19 @@ fn merge_segments( data_block_size: opts.config.data_block_size, index_block_size: opts.config.index_block_size, }, - )? - .use_compression(opts.config.compression); + ) else { + log::error!("Compaction failed"); + + // IMPORTANT: Show the segments again, because compaction failed + opts.levels + .write() + .expect("lock is poisoned") + .show_segments(payload.segment_ids.iter().copied()); + + return Ok(()); + }; + + let mut segment_writer = segment_writer.use_compression(opts.config.compression); #[cfg(feature = "bloom")] { From f46b6fe26a1e90113dc2dbb0342db160a295e616 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 9 Dec 2024 22:19:53 +0100 Subject: [PATCH 67/90] fix(MultiWriter): make sure KV versions cannot span segments --- src/segment/multi_writer.rs | 31 ++++++++++++++++++++++++++++++- src/segment/writer/mod.rs | 16 ++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/segment/multi_writer.rs b/src/segment/multi_writer.rs index af53da3a..c2153b33 100644 --- a/src/segment/multi_writer.rs +++ b/src/segment/multi_writer.rs @@ -126,7 +126,7 @@ impl MultiWriter { pub fn write(&mut self, item: InternalValue) -> crate::Result<()> { self.writer.write(item)?; - if *self.writer.meta.file_pos >= self.target_size { + if *self.writer.meta.file_pos >= self.target_size && self.writer.can_rotate() { self.rotate()?; } @@ -144,3 +144,32 @@ impl MultiWriter { Ok(self.results) } } + +#[cfg(test)] +mod tests { + use crate::{AbstractTree, Config}; + use test_log::test; + + // NOTE: Tests that versions of the same key stay + // in the same segment even if it needs to be rotated + // This avoids segments' key ranges overlapping + #[test] + fn segment_multi_writer_same_key_norotate() -> crate::Result<()> { + let folder = tempfile::tempdir()?; + + let tree = Config::new(&folder).open()?; + + tree.insert("a", "a1".repeat(4_000), 0); + tree.insert("a", "a2".repeat(4_000), 1); + tree.insert("a", "a3".repeat(4_000), 2); + tree.insert("a", "a4".repeat(4_000), 3); + tree.insert("a", "a5".repeat(4_000), 4); + tree.flush_active_memtable(0)?; + assert_eq!(1, tree.segment_count()); + + tree.major_compact(1_024, 0)?; + assert_eq!(1, tree.segment_count()); + + Ok(()) + } +} diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index e7181cf3..8109afec 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -55,6 +55,8 @@ pub struct Writer { current_key: Option, + can_rotate: bool, + #[cfg(feature = "bloom")] bloom_policy: BloomConstructionPolicy, @@ -134,6 +136,8 @@ impl Writer { current_key: None, + can_rotate: false, + #[cfg(feature = "bloom")] bloom_policy: BloomConstructionPolicy::default(), @@ -142,6 +146,11 @@ impl Writer { }) } + #[must_use] + pub fn can_rotate(&self) -> bool { + self.can_rotate + } + #[must_use] pub(crate) fn use_compression(mut self, compression: CompressionType) -> Self { self.compression = compression; @@ -224,7 +233,14 @@ impl Writer { self.meta.tombstone_count += 1; } + // NOTE: Check if we visit a new key if Some(&item.key.user_key) != self.current_key.as_ref() { + // IMPORTANT: Check that we are not at the first key + if self.current_key.is_some() { + dbg!("JUHU"); + self.can_rotate = true; + } + self.meta.key_count += 1; self.current_key = Some(item.key.user_key.clone()); From ffed6290f9161ef15b4c7bb0d4f7203ab46d99dc Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 9 Dec 2024 22:19:57 +0100 Subject: [PATCH 68/90] doc --- src/compaction/major.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/compaction/major.rs b/src/compaction/major.rs index b27ddcf7..002404ec 100644 --- a/src/compaction/major.rs +++ b/src/compaction/major.rs @@ -7,17 +7,17 @@ use crate::{config::Config, level_manifest::LevelManifest, Segment}; /// Major compaction /// -/// Compacts all segments into the last level +/// Compacts all segments into the last level. pub struct Strategy { target_size: u64, } impl Strategy { - /// Configures a new `SizeTiered` compaction strategy + /// Configures a new `SizeTiered` compaction strategy. /// /// # Panics /// - /// Panics, if `target_size` is below 1024 bytes + /// Panics, if `target_size` is below 1024 bytes. #[must_use] #[allow(dead_code)] pub fn new(target_size: u64) -> Self { From 5dca9e5375845a100d5d2c92b0fc8cc63011e6e0 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 9 Dec 2024 22:24:13 +0100 Subject: [PATCH 69/90] refactor --- src/compaction/fifo.rs | 5 +---- src/compaction/maintenance.rs | 5 +---- src/compaction/stream.rs | 21 +++++++++--------- src/level_manifest/level.rs | 5 +---- src/mvcc_stream.rs | 29 +++++++++++++------------ src/segment/block_index/block_handle.rs | 1 + src/segment/meta/compression.rs | 7 +++--- src/segment/mod.rs | 17 +++++++++++++++ tests/blob_drop_after_flush.rs | 3 ++- tests/blob_gc.rs | 7 +++--- tests/blob_gc_watermark.rs | 3 ++- tests/blob_tree_flush.rs | 3 ++- tests/multi_trees.rs | 1 + tests/snapshot_compact.rs | 3 ++- tests/snapshot_len.rs | 3 ++- tests/tree_different_block_size.rs | 3 ++- tests/tree_disjoint_iter.rs | 3 ++- tests/tree_flush_eviction.rs | 3 +-- tests/tree_iter_lifetime.rs | 3 ++- tests/tree_mvcc_simple.rs | 3 ++- tests/tree_recover_counter.rs | 1 + tests/tree_seqno.rs | 3 ++- 22 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/compaction/fifo.rs b/src/compaction/fifo.rs index 2db6665a..2683f010 100644 --- a/src/compaction/fifo.rs +++ b/src/compaction/fifo.rs @@ -137,9 +137,6 @@ mod tests { use std::sync::Arc; use test_log::test; - #[cfg(feature = "bloom")] - use crate::bloom::BloomFilter; - #[allow(clippy::expect_used)] #[allow(clippy::cast_possible_truncation)] fn fixture_segment(id: SegmentId, created_at: u128) -> Segment { @@ -184,7 +181,7 @@ mod tests { block_cache, #[cfg(feature = "bloom")] - bloom_filter: Some(BloomFilter::with_fp_rate(1, 0.1)), + bloom_filter: Some(crate::bloom::BloomFilter::with_fp_rate(1, 0.1)), } .into() } diff --git a/src/compaction/maintenance.rs b/src/compaction/maintenance.rs index 64708485..8f8deb49 100644 --- a/src/compaction/maintenance.rs +++ b/src/compaction/maintenance.rs @@ -100,9 +100,6 @@ mod tests { use std::sync::Arc; use test_log::test; - #[cfg(feature = "bloom")] - use crate::bloom::BloomFilter; - #[allow(clippy::expect_used)] fn fixture_segment(id: SegmentId, created_at: u128) -> Segment { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); @@ -146,7 +143,7 @@ mod tests { block_cache, #[cfg(feature = "bloom")] - bloom_filter: Some(BloomFilter::with_fp_rate(1, 0.1)), + bloom_filter: Some(crate::bloom::BloomFilter::with_fp_rate(1, 0.1)), } .into() } diff --git a/src/compaction/stream.rs b/src/compaction/stream.rs index d7f86db6..4314009d 100644 --- a/src/compaction/stream.rs +++ b/src/compaction/stream.rs @@ -102,6 +102,7 @@ impl>> Iterator for CompactionSt mod tests { use super::*; use crate::value::{InternalValue, ValueType}; + use test_log::test; macro_rules! stream { ($($key:expr, $sub_key:expr, $value_type:expr),* $(,)?) => {{ @@ -136,7 +137,7 @@ mod tests { }; } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_queue_weak_tombstones() { #[rustfmt::skip] @@ -156,7 +157,7 @@ mod tests { } /// GC should not evict tombstones, unless they are covered up - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_tombstone_no_gc() -> crate::Result<()> { #[rustfmt::skip] @@ -186,7 +187,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_old_tombstone() -> crate::Result<()> { #[rustfmt::skip] @@ -231,7 +232,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_tombstone_overwrite_gc() -> crate::Result<()> { #[rustfmt::skip] @@ -252,7 +253,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_weak_tombstone_simple() -> crate::Result<()> { #[rustfmt::skip] @@ -277,7 +278,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_weak_tombstone_no_gc() -> crate::Result<()> { #[rustfmt::skip] @@ -302,7 +303,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_weak_tombstone_evict() { #[rustfmt::skip] @@ -319,7 +320,7 @@ mod tests { iter_closed!(iter); } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_weak_tombstone_evict_next_value() -> crate::Result<()> { #[rustfmt::skip] @@ -349,7 +350,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_no_evict_simple() -> crate::Result<()> { #[rustfmt::skip] @@ -379,7 +380,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn compaction_stream_no_evict_simple_multi_keys() -> crate::Result<()> { #[rustfmt::skip] diff --git a/src/level_manifest/level.rs b/src/level_manifest/level.rs index 782b23a1..481c4485 100644 --- a/src/level_manifest/level.rs +++ b/src/level_manifest/level.rs @@ -258,9 +258,6 @@ mod tests { use std::sync::Arc; use test_log::test; - #[cfg(feature = "bloom")] - use crate::bloom::BloomFilter; - #[allow(clippy::expect_used)] fn fixture_segment(id: SegmentId, key_range: KeyRange) -> Segment { let block_cache = Arc::new(BlockCache::with_capacity_bytes(10 * 1_024 * 1_024)); @@ -304,7 +301,7 @@ mod tests { block_cache, #[cfg(feature = "bloom")] - bloom_filter: Some(BloomFilter::with_fp_rate(1, 0.1)), + bloom_filter: Some(crate::bloom::BloomFilter::with_fp_rate(1, 0.1)), } .into() } diff --git a/src/mvcc_stream.rs b/src/mvcc_stream.rs index 27eac255..258d3225 100644 --- a/src/mvcc_stream.rs +++ b/src/mvcc_stream.rs @@ -97,6 +97,7 @@ impl>> DoubleEndedIte mod tests { use super::*; use crate::value::{InternalValue, ValueType}; + use test_log::test; macro_rules! stream { ($($key:expr, $sub_key:expr, $value_type:expr),* $(,)?) => {{ @@ -147,7 +148,7 @@ mod tests { }; } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_queue_reverse_almost_gone() -> crate::Result<()> { let vec = [ @@ -193,7 +194,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_queue_almost_gone_2() -> crate::Result<()> { let vec = [ @@ -235,7 +236,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_queue() -> crate::Result<()> { let vec = [ @@ -278,7 +279,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_queue_weak_almost_gone() -> crate::Result<()> { let vec = [ @@ -324,7 +325,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_queue_weak_almost_gone_2() -> crate::Result<()> { let vec = [ @@ -366,7 +367,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_queue_weak_reverse() -> crate::Result<()> { let vec = [ @@ -409,7 +410,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_simple() -> crate::Result<()> { #[rustfmt::skip] @@ -433,7 +434,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_simple_multi_keys() -> crate::Result<()> { #[rustfmt::skip] @@ -470,7 +471,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_tombstone() -> crate::Result<()> { #[rustfmt::skip] @@ -494,7 +495,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_tombstone_multi_keys() -> crate::Result<()> { #[rustfmt::skip] @@ -531,7 +532,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_weak_tombstone_simple() -> crate::Result<()> { #[rustfmt::skip] @@ -555,7 +556,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_weak_tombstone_resurrection() -> crate::Result<()> { #[rustfmt::skip] @@ -580,7 +581,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_weak_tombstone_priority() -> crate::Result<()> { #[rustfmt::skip] @@ -606,7 +607,7 @@ mod tests { Ok(()) } - #[test_log::test] + #[test] #[allow(clippy::unwrap_used)] fn mvcc_stream_weak_tombstone_multi_keys() -> crate::Result<()> { #[rustfmt::skip] diff --git a/src/segment/block_index/block_handle.rs b/src/segment/block_index/block_handle.rs index 6845c231..e1be126f 100644 --- a/src/segment/block_index/block_handle.rs +++ b/src/segment/block_index/block_handle.rs @@ -97,6 +97,7 @@ impl Decode for KeyedBlockHandle { #[cfg(test)] mod tests { use super::*; + use test_log::test; #[test] fn index_block_size() { diff --git a/src/segment/meta/compression.rs b/src/segment/meta/compression.rs index 00413ceb..e1a08b52 100644 --- a/src/segment/meta/compression.rs +++ b/src/segment/meta/compression.rs @@ -113,8 +113,9 @@ impl std::fmt::Display for CompressionType { #[cfg(test)] mod tests { use super::*; + use test_log::test; - #[test_log::test] + #[test] fn compression_serialize_none() -> crate::Result<()> { let serialized = CompressionType::None.encode_into_vec(); assert_eq!(2, serialized.len()); @@ -125,7 +126,7 @@ mod tests { mod lz4 { use super::*; - #[test_log::test] + #[test] fn compression_serialize_none() -> crate::Result<()> { let serialized = CompressionType::Lz4.encode_into_vec(); assert_eq!(2, serialized.len()); @@ -137,7 +138,7 @@ mod tests { mod miniz { use super::*; - #[test_log::test] + #[test] fn compression_serialize_none() -> crate::Result<()> { for lvl in 0..10 { let serialized = CompressionType::Miniz(lvl).encode_into_vec(); diff --git a/src/segment/mod.rs b/src/segment/mod.rs index d58c72ce..64119753 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -21,6 +21,7 @@ use crate::{ block_cache::BlockCache, descriptor_table::FileDescriptorTable, segment::reader::Reader, + time::unix_timestamp, tree::inner::TreeId, value::{InternalValue, SeqNo, UserKey}, }; @@ -70,6 +71,22 @@ impl std::fmt::Debug for Segment { } impl Segment { + // TODO: in Leveled compaction, compact segments that live very long and have + // many versions (possibly unnecessary space usage of old, stale versions) + /// Calculates how many versions per key there are on average. + #[must_use] + pub fn version_factor(&self) -> f32 { + self.metadata.item_count as f32 / self.metadata.key_count as f32 + } + + /// Gets the segment age in nanoseconds. + #[must_use] + pub fn age(&self) -> u128 { + let now = unix_timestamp().as_nanos(); + let created_at = self.metadata.created_at * 1_000; + now.saturating_sub(created_at) + } + /// Gets the global segment ID. #[must_use] pub fn global_id(&self) -> GlobalSegmentId { diff --git a/tests/blob_drop_after_flush.rs b/tests/blob_drop_after_flush.rs index 31e820a4..8b0b74c8 100644 --- a/tests/blob_drop_after_flush.rs +++ b/tests/blob_drop_after_flush.rs @@ -1,7 +1,8 @@ use lsm_tree::{AbstractTree, Config}; use std::time::Duration; +use test_log::test; -#[test_log::test] +#[test] fn blob_drop_after_flush() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/blob_gc.rs b/tests/blob_gc.rs index 8436f9c7..2366da18 100644 --- a/tests/blob_gc.rs +++ b/tests/blob_gc.rs @@ -1,6 +1,7 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use test_log::test; -#[test_log::test] +#[test] fn blob_gc_1() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -43,7 +44,7 @@ fn blob_gc_1() -> lsm_tree::Result<()> { Ok(()) } -#[test_log::test] +#[test] fn blob_gc_2() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; @@ -92,7 +93,7 @@ fn blob_gc_2() -> lsm_tree::Result<()> { Ok(()) } -#[test_log::test] +#[test] fn blob_gc_3() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/blob_gc_watermark.rs b/tests/blob_gc_watermark.rs index 721d10e4..8089e8ea 100644 --- a/tests/blob_gc_watermark.rs +++ b/tests/blob_gc_watermark.rs @@ -1,6 +1,7 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use test_log::test; -#[test_log::test] +#[test] fn blob_gc_seqno_watermark() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/blob_tree_flush.rs b/tests/blob_tree_flush.rs index 1041b106..ff43396c 100644 --- a/tests/blob_tree_flush.rs +++ b/tests/blob_tree_flush.rs @@ -1,6 +1,7 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use test_log::test; -#[test_log::test] +#[test] fn blob_gc_flush_tombstone() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/multi_trees.rs b/tests/multi_trees.rs index 67f22e9a..ae4ddd8b 100644 --- a/tests/multi_trees.rs +++ b/tests/multi_trees.rs @@ -1,4 +1,5 @@ use lsm_tree::{AbstractTree, Config}; +use test_log::test; #[test] fn tree_multi_segment_ids() -> lsm_tree::Result<()> { diff --git a/tests/snapshot_compact.rs b/tests/snapshot_compact.rs index 90f6e0d3..fae1878d 100644 --- a/tests/snapshot_compact.rs +++ b/tests/snapshot_compact.rs @@ -1,8 +1,9 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use test_log::test; const ITEM_COUNT: usize = 100; -#[test_log::test] +#[test] fn snapshot_after_compaction() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/snapshot_len.rs b/tests/snapshot_len.rs index e01c35e7..4f576220 100644 --- a/tests/snapshot_len.rs +++ b/tests/snapshot_len.rs @@ -1,8 +1,9 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use test_log::test; const ITEM_COUNT: usize = 100; -#[test_log::test] +#[test] fn snapshot_basic() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/tree_different_block_size.rs b/tests/tree_different_block_size.rs index f0073133..387cee7e 100644 --- a/tests/tree_different_block_size.rs +++ b/tests/tree_different_block_size.rs @@ -1,8 +1,9 @@ use lsm_tree::{AbstractTree, Config, SequenceNumberCounter}; +use test_log::test; const ITEM_COUNT: usize = 1_000; -#[test_log::test] +#[test] fn tree_block_size_after_recovery() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?; diff --git a/tests/tree_disjoint_iter.rs b/tests/tree_disjoint_iter.rs index 4bc9d42e..407552c1 100644 --- a/tests/tree_disjoint_iter.rs +++ b/tests/tree_disjoint_iter.rs @@ -1,4 +1,5 @@ use lsm_tree::{AbstractTree, Config, Slice}; +use test_log::test; macro_rules! iter_closed { ($iter:expr) => { @@ -10,7 +11,7 @@ macro_rules! iter_closed { }; } -#[test_log::test] +#[test] fn tree_disjoint_iter() -> lsm_tree::Result<()> { let tempdir = tempfile::tempdir()?; let tree = crate::Config::new(&tempdir).open()?; diff --git a/tests/tree_flush_eviction.rs b/tests/tree_flush_eviction.rs index d756dcc0..2e78e5f0 100644 --- a/tests/tree_flush_eviction.rs +++ b/tests/tree_flush_eviction.rs @@ -1,6 +1,5 @@ -use std::sync::Arc; - use lsm_tree::AbstractTree; +use std::sync::Arc; use test_log::test; #[test] diff --git a/tests/tree_iter_lifetime.rs b/tests/tree_iter_lifetime.rs index 9fc86139..8c8212e7 100644 --- a/tests/tree_iter_lifetime.rs +++ b/tests/tree_iter_lifetime.rs @@ -1,5 +1,6 @@ use lsm_tree::{AbstractTree, KvPair}; use std::path::Path; +use test_log::test; fn iterrr( path: &Path, @@ -14,7 +15,7 @@ fn iterrr( Ok(tree.iter()) } -#[test_log::test] +#[test] fn tree_iter_lifetime() -> lsm_tree::Result<()> { let folder = tempfile::tempdir().unwrap(); assert_eq!(100, iterrr(folder.path())?.count()); diff --git a/tests/tree_mvcc_simple.rs b/tests/tree_mvcc_simple.rs index 674d7233..1d3e2456 100644 --- a/tests/tree_mvcc_simple.rs +++ b/tests/tree_mvcc_simple.rs @@ -1,6 +1,7 @@ use lsm_tree::{AbstractTree, Config}; +use test_log::test; -#[test_log::test] +#[test] fn tree_read_mvcc() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.into_path(); diff --git a/tests/tree_recover_counter.rs b/tests/tree_recover_counter.rs index 88b54984..464fcaf3 100644 --- a/tests/tree_recover_counter.rs +++ b/tests/tree_recover_counter.rs @@ -1,4 +1,5 @@ use lsm_tree::{AbstractTree, Config}; +use test_log::test; #[test] fn tree_recover_segment_counter() -> lsm_tree::Result<()> { diff --git a/tests/tree_seqno.rs b/tests/tree_seqno.rs index a45a9ef4..6cf46217 100644 --- a/tests/tree_seqno.rs +++ b/tests/tree_seqno.rs @@ -1,6 +1,7 @@ use lsm_tree::{AbstractTree, Config}; +use test_log::test; -#[test_log::test] +#[test] fn tree_highest_seqno() -> lsm_tree::Result<()> { let folder = tempfile::tempdir()?.into_path(); From 615ce00a6db3506064603897bf206a8f049ede3b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 9 Dec 2024 22:29:39 +0100 Subject: [PATCH 70/90] fix --- src/segment/meta/compression.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/segment/meta/compression.rs b/src/segment/meta/compression.rs index e1a08b52..eac0358a 100644 --- a/src/segment/meta/compression.rs +++ b/src/segment/meta/compression.rs @@ -116,35 +116,34 @@ mod tests { use test_log::test; #[test] - fn compression_serialize_none() -> crate::Result<()> { + fn compression_serialize_none() { let serialized = CompressionType::None.encode_into_vec(); assert_eq!(2, serialized.len()); - Ok(()) } #[cfg(feature = "lz4")] mod lz4 { use super::*; + use test_log::test; #[test] - fn compression_serialize_none() -> crate::Result<()> { + fn compression_serialize_none() { let serialized = CompressionType::Lz4.encode_into_vec(); assert_eq!(2, serialized.len()); - Ok(()) } } #[cfg(feature = "miniz")] mod miniz { use super::*; + use test_log::test; #[test] - fn compression_serialize_none() -> crate::Result<()> { + fn compression_serialize_none() { for lvl in 0..10 { let serialized = CompressionType::Miniz(lvl).encode_into_vec(); assert_eq!(2, serialized.len()); } - Ok(()) } } } From 96ead4780f07b887d4ffcb3f38929c59e4c2fe4c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 10 Dec 2024 20:54:23 +0100 Subject: [PATCH 71/90] remove dbg log --- src/segment/writer/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 8109afec..3d503adb 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -237,7 +237,6 @@ impl Writer { if Some(&item.key.user_key) != self.current_key.as_ref() { // IMPORTANT: Check that we are not at the first key if self.current_key.is_some() { - dbg!("JUHU"); self.can_rotate = true; } From a9c883ea8a1a4a9c29c2a37e7d5da1ab07f14ad8 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Wed, 11 Dec 2024 20:18:44 +0100 Subject: [PATCH 72/90] doc: update internal docs --- src/descriptor_table/mod.rs | 3 ++ src/segment/block_index/full_index.rs | 4 +- src/segment/block_index/mod.rs | 58 ++++++++++++++-------- src/segment/block_index/top_level.rs | 19 +------ src/segment/block_index/two_level_index.rs | 11 ++-- 5 files changed, 52 insertions(+), 43 deletions(-) diff --git a/src/descriptor_table/mod.rs b/src/descriptor_table/mod.rs index 7715b22b..1f1c6a10 100644 --- a/src/descriptor_table/mod.rs +++ b/src/descriptor_table/mod.rs @@ -53,6 +53,9 @@ pub struct FileDescriptorTableInner { size: AtomicUsize, } +/// The descriptor table caches file descriptors to avoid `fopen()` calls +/// +/// See `TableCache` in `RocksDB`. #[doc(alias("table cache"))] #[allow(clippy::module_name_repetitions)] pub struct FileDescriptorTable { diff --git a/src/segment/block_index/full_index.rs b/src/segment/block_index/full_index.rs index 6b9f69cb..1268465d 100644 --- a/src/segment/block_index/full_index.rs +++ b/src/segment/block_index/full_index.rs @@ -5,9 +5,11 @@ use crate::segment::{ }; use std::{fs::File, io::Seek, path::Path}; -/// Index that translates item keys to block handles +/// Index that translates item keys to data block handles /// /// The index is fully loaded into memory. +/// +/// Currently, a full block index is used for L0 & L1 segments. pub struct FullBlockIndex(Box<[KeyedBlockHandle]>); impl std::ops::Deref for FullBlockIndex { diff --git a/src/segment/block_index/mod.rs b/src/segment/block_index/mod.rs index 67fcea27..a539fcc5 100644 --- a/src/segment/block_index/mod.rs +++ b/src/segment/block_index/mod.rs @@ -18,6 +18,26 @@ use two_level_index::TwoLevelBlockIndex; pub type IndexBlock = Block; +#[allow(clippy::module_name_repetitions)] +pub trait KeyedBlockIndex { + /// Gets the lowest block handle that may contain the given item + fn get_lowest_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result>; + + /// Gets the last block handle that may contain the given item + fn get_last_block_containing_key( + &self, + key: &[u8], + cache_policy: CachePolicy, + ) -> crate::Result>; + + /// Returns a handle to the last block + fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result<&KeyedBlockHandle>; +} + impl KeyedBlockIndex for [KeyedBlockHandle] { fn get_lowest_block_containing_key( &self, @@ -80,26 +100,24 @@ pub trait BlockIndex { fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result; } -#[allow(clippy::module_name_repetitions)] -pub trait KeyedBlockIndex { - /// Gets the lowest block handle that may contain the given item - fn get_lowest_block_containing_key( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result>; - - /// Gets the last block handle that may contain the given item - fn get_last_block_containing_key( - &self, - key: &[u8], - cache_policy: CachePolicy, - ) -> crate::Result>; - - /// Returns a handle to the last block - fn get_last_block_handle(&self, cache_policy: CachePolicy) -> crate::Result<&KeyedBlockHandle>; -} - +/// The block index stores references to the positions of blocks on a file and their size +/// +/// __________________ +/// | | +/// | BLOCK0 | +/// |________________| <- 'G': 0x0 +/// | | +/// | BLOCK1 | +/// |________________| <- 'M': 0x... +/// | | +/// | BLOCK2 | +/// |________________| <- 'Z': 0x... +/// +/// The block information can be accessed by key. +/// Because the blocks are sorted, any entries not covered by the index (it is sparse) can be +/// found by finding the highest block that has a lower or equal end key than the searched key (by performing in-memory binary search). +/// In the diagram above, searching for 'J' yields the block starting with 'G'. +/// 'J' must be in that block, because the next block starts with 'M'). #[enum_dispatch::enum_dispatch(BlockIndex)] #[allow(clippy::module_name_repetitions)] pub enum BlockIndexImpl { diff --git a/src/segment/block_index/top_level.rs b/src/segment/block_index/top_level.rs index c37bbc3c..4395e8f7 100644 --- a/src/segment/block_index/top_level.rs +++ b/src/segment/block_index/top_level.rs @@ -9,24 +9,9 @@ use crate::segment::{ }; use std::{fs::File, path::Path}; -/// The block index stores references to the positions of blocks on a file and their size +/// The top-level index (TLI) is the level-0 index in a partitioned (two-level) block index /// -/// __________________ -/// | | -/// | BLOCK0 | -/// |________________| <- 'G': 0x0 -/// | | -/// | BLOCK1 | -/// |________________| <- 'M': 0x... -/// | | -/// | BLOCK2 | -/// |________________| <- 'Z': 0x... -/// -/// The block information can be accessed by key. -/// Because the blocks are sorted, any entries not covered by the index (it is sparse) can be -/// found by finding the highest block that has a lower or equal end key than the searched key (by performing in-memory binary search). -/// In the diagram above, searching for 'J' yields the block starting with 'G'. -/// 'J' must be in that block, because the next block starts with 'M'). +/// See `top_level_index.rs` for more info. #[allow(clippy::module_name_repetitions)] #[derive(Debug)] pub struct TopLevelIndex(Box<[KeyedBlockHandle]>); diff --git a/src/segment/block_index/two_level_index.rs b/src/segment/block_index/two_level_index.rs index eaa0bc1e..1d454548 100644 --- a/src/segment/block_index/two_level_index.rs +++ b/src/segment/block_index/two_level_index.rs @@ -29,24 +29,25 @@ impl IndexBlockFetcher { } } -/// Index that translates item keys to block handles +/// Index that translates item keys to data block handles /// /// The index is only partially loaded into memory. /// /// See #[allow(clippy::module_name_repetitions)] pub struct TwoLevelBlockIndex { - descriptor_table: Arc, - - /// Segment ID segment_id: GlobalSegmentId, + descriptor_table: Arc, + /// Level-0 index. Is read-only and always fully loaded. /// /// This index points to index blocks inside the level-1 index. pub(crate) top_level_index: TopLevelIndex, - /// Level-1 index. This index is only partially loaded into memory, decreasing memory usage, compared to a fully loaded one. + /// Level-1 index. + /// + /// This index is only partially loaded into memory, decreasing memory usage, compared to a fully loaded one. /// /// However to find a disk block, one layer of indirection is required: /// From 8332b1a34b19936025594b28813c3e063619610a Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Thu, 12 Dec 2024 01:21:56 +0100 Subject: [PATCH 73/90] refactor: segment writer --- src/segment/writer/mod.rs | 57 +++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/src/segment/writer/mod.rs b/src/segment/writer/mod.rs index 3d503adb..eb69a1ce 100644 --- a/src/segment/writer/mod.rs +++ b/src/segment/writer/mod.rs @@ -256,7 +256,7 @@ impl Writer { let seqno = item.key.seqno; if self.meta.first_key.is_none() { - self.meta.first_key = Some(item.key.clone().user_key); + self.meta.first_key = Some(item.key.user_key.clone()); } self.chunk_size += item.size(); @@ -266,13 +266,8 @@ impl Writer { self.spill_block()?; } - if self.meta.lowest_seqno > seqno { - self.meta.lowest_seqno = seqno; - } - - if self.meta.highest_seqno < seqno { - self.meta.highest_seqno = seqno; - } + self.meta.lowest_seqno = self.meta.lowest_seqno.min(seqno); + self.meta.highest_seqno = self.meta.highest_seqno.max(seqno); Ok(()) } @@ -391,6 +386,52 @@ mod tests { use std::sync::Arc; use test_log::test; + #[test] + fn segment_writer_seqnos() -> crate::Result<()> { + let folder = tempfile::tempdir()?.into_path(); + + let segment_id = 532; + + let mut writer = Writer::new(Options { + folder, + data_block_size: 4_096, + index_block_size: 4_096, + segment_id, + })?; + + writer.write(InternalValue::from_components( + "a", + nanoid::nanoid!().as_bytes(), + 7, + ValueType::Value, + ))?; + writer.write(InternalValue::from_components( + "b", + nanoid::nanoid!().as_bytes(), + 5, + ValueType::Value, + ))?; + writer.write(InternalValue::from_components( + "c", + nanoid::nanoid!().as_bytes(), + 8, + ValueType::Value, + ))?; + writer.write(InternalValue::from_components( + "d", + nanoid::nanoid!().as_bytes(), + 10, + ValueType::Value, + ))?; + + let trailer = writer.finish()?.expect("should exist"); + + assert_eq!(5, trailer.metadata.seqnos.0); + assert_eq!(10, trailer.metadata.seqnos.1); + + Ok(()) + } + #[test] #[cfg(feature = "bloom")] fn segment_writer_zero_bpk() -> crate::Result<()> { From 88575ef29017cba26f4854d78ac3b84efed0896d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 14 Dec 2024 14:42:51 +0100 Subject: [PATCH 74/90] move module --- src/compaction/worker.rs | 2 +- src/{segment => }/level_reader.rs | 7 +++++-- src/lib.rs | 2 ++ src/range.rs | 3 ++- src/segment/mod.rs | 1 - 5 files changed, 10 insertions(+), 5 deletions(-) rename src/{segment => }/level_reader.rs (98%) diff --git a/src/compaction/worker.rs b/src/compaction/worker.rs index 25011f6a..a45aedf7 100644 --- a/src/compaction/worker.rs +++ b/src/compaction/worker.rs @@ -7,13 +7,13 @@ use crate::{ compaction::{stream::CompactionStream, Choice}, file::SEGMENTS_FOLDER, level_manifest::LevelManifest, + level_reader::LevelReader, merge::{BoxedIterator, Merger}, segment::{ block_index::{ full_index::FullBlockIndex, two_level_index::TwoLevelBlockIndex, BlockIndexImpl, }, id::GlobalSegmentId, - level_reader::LevelReader, multi_writer::MultiWriter, Segment, SegmentInner, }, diff --git a/src/segment/level_reader.rs b/src/level_reader.rs similarity index 98% rename from src/segment/level_reader.rs rename to src/level_reader.rs index 792ca4f1..334c6256 100644 --- a/src/segment/level_reader.rs +++ b/src/level_reader.rs @@ -2,8 +2,11 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use super::{range::Range, value_block::CachePolicy}; -use crate::{level_manifest::level::Level, InternalValue, UserKey}; +use crate::{ + level_manifest::level::Level, + segment::{range::Range, value_block::CachePolicy}, + InternalValue, UserKey, +}; use std::{ops::Bound, sync::Arc}; /// Reads through a disjoint level diff --git a/src/lib.rs b/src/lib.rs index efdc8421..69b2b63c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -154,6 +154,8 @@ mod key_range; #[doc(hidden)] pub mod level_manifest; +mod level_reader; + mod manifest; mod memtable; diff --git a/src/range.rs b/src/range.rs index 829bc7b1..404550b5 100644 --- a/src/range.rs +++ b/src/range.rs @@ -5,11 +5,12 @@ use crate::{ key::InternalKey, level_manifest::LevelManifest, + level_reader::LevelReader, memtable::Memtable, merge::{BoxedIterator, Merger}, multi_reader::MultiReader, mvcc_stream::MvccStream, - segment::{level_reader::LevelReader, value_block::CachePolicy}, + segment::value_block::CachePolicy, tree::inner::SealedMemtables, value::{SeqNo, UserKey}, InternalValue, diff --git a/src/segment/mod.rs b/src/segment/mod.rs index 64119753..b8b2c06f 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -7,7 +7,6 @@ pub mod block_index; pub mod file_offsets; pub mod id; pub mod inner; -pub mod level_reader; pub mod meta; pub mod multi_writer; pub mod range; From b9c1bcd1968f7d9078bfe92efec2b759546bd4ba Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 14 Dec 2024 14:54:35 +0100 Subject: [PATCH 75/90] perf: upgrade crossbeam-skiplist to skip heap allocation in get() --- Cargo.toml | 2 +- src/key.rs | 52 +++++++++++++++++++++++++++++++++++++++++++++ src/memtable/mod.rs | 4 ++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1971cd60..d7d8958c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ bytes = ["value-log/bytes"] [dependencies] byteorder = "1.5.0" -crossbeam-skiplist = "0.1.3" +crossbeam-skiplist = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "45425b032b75d40c8f79be2133eb7d33aaa1d4e4", package = "crossbeam-skiplist" } double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" guardian = "1.1.0" diff --git a/src/key.rs b/src/key.rs index 6693be0a..b52729da 100644 --- a/src/key.rs +++ b/src/key.rs @@ -7,6 +7,7 @@ use crate::{ SeqNo, UserKey, ValueType, }; use byteorder::{ReadBytesExt, WriteBytesExt}; +use crossbeam_skiplist::equivalent::{Comparable, Equivalent}; use std::{ cmp::Reverse, io::{Read, Write}, @@ -104,3 +105,54 @@ impl Ord for InternalKey { (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) } } + +impl Equivalent> for InternalKey { + fn equivalent(&self, other: &InternalKeyRef<'_>) -> bool { + self.user_key == other.user_key && self.seqno == other.seqno + } +} + +impl Comparable> for InternalKey { + fn compare(&self, other: &InternalKeyRef<'_>) -> std::cmp::Ordering { + (&*self.user_key, Reverse(self.seqno)).cmp(&(other.user_key, Reverse(other.seqno))) + } +} + +// REF + +// Temporary internal key without heap allocation +#[derive(Debug, Eq)] +pub struct InternalKeyRef<'a> { + pub user_key: &'a [u8], + pub seqno: SeqNo, + pub value_type: ValueType, +} + +impl<'a> InternalKeyRef<'a> { + // Constructor for InternalKeyRef + pub fn new(user_key: &'a [u8], seqno: u64, value_type: ValueType) -> Self { + InternalKeyRef { + user_key, + seqno, + value_type, + } + } +} + +impl<'a> PartialEq for InternalKeyRef<'a> { + fn eq(&self, other: &Self) -> bool { + self.user_key == other.user_key && self.seqno == other.seqno + } +} + +impl<'a> PartialOrd for InternalKeyRef<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> Ord for InternalKeyRef<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) + } +} diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index a5b390f1..b96ea0a9 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::key::InternalKey; +use crate::key::{InternalKey, InternalKeyRef}; use crate::segment::block::ItemSize; use crate::value::{InternalValue, SeqNo, UserValue, ValueType}; use crossbeam_skiplist::SkipMap; @@ -84,7 +84,7 @@ impl Memtable { // abcdef -> 6 // abcdef -> 5 // - let lower_bound = InternalKey::new( + let lower_bound = InternalKeyRef::new( key, match seqno { Some(seqno) => seqno - 1, From 2d29e5545b43f48c4d1a3ac66f1de8e18923166d Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 14 Dec 2024 15:01:15 +0100 Subject: [PATCH 76/90] perf: simplify segment point read fast path --- src/segment/mod.rs | 12 ++---------- tests/segment_remove_weak.rs | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 tests/segment_remove_weak.rs diff --git a/src/segment/mod.rs b/src/segment/mod.rs index b8b2c06f..da2ab2d9 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -334,7 +334,7 @@ impl Segment { key: K, seqno: Option, ) -> crate::Result> { - use crate::{mvcc_stream::MvccStream, ValueType}; + use crate::mvcc_stream::MvccStream; use block_index::BlockIndex; use value_block::{CachePolicy, ValueBlock}; use value_block_consumer::ValueBlockConsumer; @@ -365,15 +365,7 @@ impl Segment { // (see explanation for that below) // This only really works because sequence numbers are sorted // in descending order - let Some(latest) = block.get_latest(key.as_ref()) else { - return Ok(None); - }; - - if latest.key.value_type == ValueType::WeakTombstone { - // NOTE: Continue in slow path - } else { - return Ok(Some(latest.clone())); - } + return Ok(block.get_latest(key.as_ref()).cloned()); } // TODO: it would be nice to have the possibility of using a lifetime'd diff --git a/tests/segment_remove_weak.rs b/tests/segment_remove_weak.rs new file mode 100644 index 00000000..614de031 --- /dev/null +++ b/tests/segment_remove_weak.rs @@ -0,0 +1,22 @@ +use lsm_tree::{AbstractTree, Config}; +use test_log::test; + +#[test] +fn segment_remove_weak_simple() -> lsm_tree::Result<()> { + let folder = tempfile::tempdir()?.into_path(); + + let tree = Config::new(folder) + .data_block_size(1_024) + .index_block_size(1_024) + .open()?; + + tree.insert("a", "a", 0); + tree.insert("a", "b", 1); + tree.remove_weak("a", 2); + + tree.flush_active_memtable(0)?; + + assert!(tree.get("a")?.is_none()); + + Ok(()) +} From 164adb5a459572a064b177cce10c74469d062606 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 14 Dec 2024 15:01:53 +0100 Subject: [PATCH 77/90] revert version change --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d7d8958c..11398bd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "lsm-tree" description = "A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs)" license = "MIT OR Apache-2.0" -version = "2.5.0" +version = "2.4.0" edition = "2021" rust-version = "1.74.0" readme = "README.md" From 26847b3a7b5cc7a7138c9c47f9ba5edf2929e546 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 14 Dec 2024 15:13:03 +0100 Subject: [PATCH 78/90] 2.5.0-pre.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d5641c7e..b29604aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "lsm-tree" description = "A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs)" license = "MIT OR Apache-2.0" -version = "2.4.0" +version = "2.5.0-pre.0" edition = "2021" rust-version = "1.74.0" readme = "README.md" From ab2843416d86f6c0d770de6d153606fa1246cb81 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sat, 14 Dec 2024 23:10:30 +0100 Subject: [PATCH 79/90] perf: remove heap allocation in snapshot point read path + also not using MvccStream because we just need to terminate on first matching item --- src/segment/mod.rs | 11 +++----- src/segment/reader.rs | 6 ++++- src/segment/value_block_consumer.rs | 41 +++++++++++------------------ 3 files changed, 23 insertions(+), 35 deletions(-) diff --git a/src/segment/mod.rs b/src/segment/mod.rs index da2ab2d9..c9243850 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -334,7 +334,6 @@ impl Segment { key: K, seqno: Option, ) -> crate::Result> { - use crate::mvcc_stream::MvccStream; use block_index::BlockIndex; use value_block::{CachePolicy, ValueBlock}; use value_block_consumer::ValueBlockConsumer; @@ -379,11 +378,7 @@ impl Segment { None, ); reader.lo_block_size = block.header.data_length.into(); - reader.lo_block_items = Some(ValueBlockConsumer::with_bounds( - block, - &Some(key.into()), // TODO: this may cause a heap alloc - &None, - )); + reader.lo_block_items = Some(ValueBlockConsumer::with_bounds(block, Some(key), None)); reader.lo_initialized = true; // NOTE: For finding a specific seqno, @@ -403,7 +398,7 @@ impl Segment { // unfortunately is in the next block // // Also because of weak tombstones, we may have to look further than the first item we encounter - let reader = reader.filter(|x| { + let mut reader = reader.filter(|x| { match x { Ok(entry) => { // Check for seqno if needed @@ -417,7 +412,7 @@ impl Segment { } }); - let Some(entry) = MvccStream::new(reader).next().transpose()? else { + let Some(entry) = reader.next().transpose()? else { return Ok(None); }; diff --git a/src/segment/reader.rs b/src/segment/reader.rs index f26bd687..058110b0 100644 --- a/src/segment/reader.rs +++ b/src/segment/reader.rs @@ -103,7 +103,11 @@ impl Reader { Ok(Some(( block.header.data_length.into(), block.header.previous_block_offset, - ValueBlockConsumer::with_bounds(block, &self.start_key, &self.end_key), + ValueBlockConsumer::with_bounds( + block, + self.start_key.as_deref(), + self.end_key.as_deref(), + ), ))) }) } diff --git a/src/segment/value_block_consumer.rs b/src/segment/value_block_consumer.rs index 5e70efd5..0e399d28 100644 --- a/src/segment/value_block_consumer.rs +++ b/src/segment/value_block_consumer.rs @@ -3,7 +3,7 @@ // (found in the LICENSE-* files in the repository) use super::value_block::ValueBlock; -use crate::{value::InternalValue, UserKey}; +use crate::value::InternalValue; use std::sync::Arc; pub struct ValueBlockConsumer { @@ -15,14 +15,14 @@ pub struct ValueBlockConsumer { impl ValueBlockConsumer { #[must_use] pub fn new(inner: Arc) -> Self { - Self::with_bounds(inner, &None, &None) + Self::with_bounds(inner, None, None) } #[must_use] pub fn with_bounds( inner: Arc, - start_key: &Option, - end_key: &Option, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, ) -> Self { let mut lo = start_key.as_ref().map_or(0, |key| { inner.items.partition_point(|x| &*x.key.user_key < *key) @@ -90,12 +90,9 @@ impl DoubleEndedIterator for ValueBlockConsumer { #[allow(clippy::expect_used)] mod tests { use super::*; - use crate::{ - segment::{ - block::{checksum::Checksum, header::Header}, - value_block::BlockOffset, - }, - Slice, + use crate::segment::{ + block::{checksum::Checksum, header::Header}, + value_block::BlockOffset, }; use test_log::test; @@ -227,15 +224,13 @@ mod tests { InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), ]); - let mut iter = - ValueBlockConsumer::with_bounds(block.clone().into(), &Some(Slice::from(*b"c")), &None); + let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), Some(b"c"), None); assert_eq!(*b"c", &*iter.next().expect("should exist").key.user_key); assert_eq!(*b"d", &*iter.next().expect("should exist").key.user_key); assert_eq!(*b"e", &*iter.next().expect("should exist").key.user_key); iter_closed!(iter); - let mut iter = - ValueBlockConsumer::with_bounds(block.into(), &Some(Slice::from(*b"c")), &None); + let mut iter = ValueBlockConsumer::with_bounds(block.into(), Some(b"c"), None); assert_eq!( *b"e", &*iter.next_back().expect("should exist").key.user_key @@ -261,15 +256,13 @@ mod tests { InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), ]); - let mut iter = - ValueBlockConsumer::with_bounds(block.clone().into(), &None, &Some(Slice::from(*b"c"))); + let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), None, Some(b"c")); assert_eq!(*b"a", &*iter.next().expect("should exist").key.user_key); assert_eq!(*b"b", &*iter.next().expect("should exist").key.user_key); assert_eq!(*b"c", &*iter.next().expect("should exist").key.user_key); iter_closed!(iter); - let mut iter = - ValueBlockConsumer::with_bounds(block.into(), &None, &Some(Slice::from(*b"c"))); + let mut iter = ValueBlockConsumer::with_bounds(block.into(), None, Some(b"c")); assert_eq!( *b"c", &*iter.next_back().expect("should exist").key.user_key @@ -294,12 +287,10 @@ mod tests { InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), ]); - let mut iter = - ValueBlockConsumer::with_bounds(block.clone().into(), &None, &Some(Slice::from(*b"a"))); + let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), None, Some(b"a")); iter_closed!(iter); - let mut iter = - ValueBlockConsumer::with_bounds(block.into(), &None, &Some(Slice::from(*b"a"))).rev(); + let mut iter = ValueBlockConsumer::with_bounds(block.into(), None, Some(b"a")).rev(); iter_closed!(iter); } @@ -313,12 +304,10 @@ mod tests { InternalValue::from_components(*b"e", vec![], 0, crate::ValueType::Value), ]); - let mut iter = - ValueBlockConsumer::with_bounds(block.clone().into(), &Some(Slice::from(*b"f")), &None); + let mut iter = ValueBlockConsumer::with_bounds(block.clone().into(), Some(b"f"), None); iter_closed!(iter); - let mut iter = - ValueBlockConsumer::with_bounds(block.into(), &Some(Slice::from(*b"f")), &None).rev(); + let mut iter = ValueBlockConsumer::with_bounds(block.into(), Some(b"f"), None).rev(); iter_closed!(iter); } } From 9113cb463a4c00bdbb47921163b97875f63f530c Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 17 Dec 2024 18:20:46 +0100 Subject: [PATCH 80/90] perf: specialize Segment reader for snapshot point reads ~15-20ns shaved off --- src/segment/forward_reader.rs | 132 ++++++++++++++++++++++++++++++++++ src/segment/mod.rs | 12 ++-- src/segment/reader.rs | 3 +- 3 files changed, 139 insertions(+), 8 deletions(-) create mode 100644 src/segment/forward_reader.rs diff --git a/src/segment/forward_reader.rs b/src/segment/forward_reader.rs new file mode 100644 index 00000000..ba203c7c --- /dev/null +++ b/src/segment/forward_reader.rs @@ -0,0 +1,132 @@ +// Copyright (c) 2024-present, fjall-rs +// This source code is licensed under both the Apache 2.0 and MIT License +// (found in the LICENSE-* files in the repository) + +use super::{ + value_block::{BlockOffset, CachePolicy, ValueBlock}, + value_block_consumer::ValueBlockConsumer, +}; +use crate::{ + descriptor_table::FileDescriptorTable, segment::block::header::Header, value::InternalValue, + BlockCache, GlobalSegmentId, +}; + +/// Segment forward reader specialized for point reads +pub struct ForwardReader<'a> { + segment_id: GlobalSegmentId, + + descriptor_table: &'a FileDescriptorTable, + block_cache: &'a BlockCache, + + data_block_boundary: BlockOffset, + + pub lo_block_offset: BlockOffset, + pub(crate) lo_block_size: u64, + pub(crate) lo_block_items: Option, + pub(crate) lo_initialized: bool, + + cache_policy: CachePolicy, +} + +impl<'a> ForwardReader<'a> { + #[must_use] + pub fn new( + data_block_boundary: BlockOffset, + descriptor_table: &'a FileDescriptorTable, + segment_id: GlobalSegmentId, + block_cache: &'a BlockCache, + lo_block_offset: BlockOffset, + ) -> Self { + Self { + descriptor_table, + segment_id, + block_cache, + + data_block_boundary, + + lo_block_offset, + lo_block_size: 0, + lo_block_items: None, + lo_initialized: false, + + cache_policy: CachePolicy::Write, + } + } + + fn load_data_block( + &self, + offset: BlockOffset, + ) -> crate::Result> { + let block = ValueBlock::load_by_block_handle( + self.descriptor_table, + self.block_cache, + self.segment_id, + offset, + self.cache_policy, + )?; + + // Truncate as many items as possible + block.map_or(Ok(None), |block| { + Ok(Some(( + block.header.data_length.into(), + block.header.previous_block_offset, + ValueBlockConsumer::with_bounds(block, None, None), + ))) + }) + } + + fn initialize_lo(&mut self) -> crate::Result<()> { + if let Some((size, _, items)) = self.load_data_block(self.lo_block_offset)? { + self.lo_block_items = Some(items); + self.lo_block_size = size; + } + + self.lo_initialized = true; + + Ok(()) + } +} + +impl<'a> Iterator for ForwardReader<'a> { + type Item = crate::Result; + + fn next(&mut self) -> Option { + if !self.lo_initialized { + fail_iter!(self.initialize_lo()); + } + + if let Some(head) = self.lo_block_items.as_mut()?.next() { + // Just consume item + return Some(Ok(head)); + } + + // Load next block + let next_block_offset = BlockOffset( + *self.lo_block_offset + Header::serialized_len() as u64 + self.lo_block_size, + ); + + if next_block_offset >= self.data_block_boundary { + // We are done + return None; + } + + assert_ne!( + self.lo_block_offset, next_block_offset, + "invalid next block offset", + ); + + match fail_iter!(self.load_data_block(next_block_offset)) { + Some((size, _, items)) => { + self.lo_block_items = Some(items); + self.lo_block_size = size; + self.lo_block_offset = next_block_offset; + + // We just loaded the block + self.lo_block_items.as_mut()?.next().map(Ok) + } + None => { + panic!("searched for invalid data block"); + } + } + } +} diff --git a/src/segment/mod.rs b/src/segment/mod.rs index c9243850..229fe12a 100644 --- a/src/segment/mod.rs +++ b/src/segment/mod.rs @@ -5,6 +5,7 @@ pub mod block; pub mod block_index; pub mod file_offsets; +mod forward_reader; pub mod id; pub mod inner; pub mod meta; @@ -19,12 +20,12 @@ pub mod writer; use crate::{ block_cache::BlockCache, descriptor_table::FileDescriptorTable, - segment::reader::Reader, time::unix_timestamp, tree::inner::TreeId, value::{InternalValue, SeqNo, UserKey}, }; use block_index::BlockIndexImpl; +use forward_reader::ForwardReader; use id::GlobalSegmentId; use inner::Inner; use meta::SegmentId; @@ -367,15 +368,12 @@ impl Segment { return Ok(block.get_latest(key.as_ref()).cloned()); } - // TODO: it would be nice to have the possibility of using a lifetime'd - // reader, so we don't need to Arc::clone descriptor_table, and block_cache - let mut reader = Reader::new( + let mut reader = ForwardReader::new( self.offsets.index_block_ptr, - self.descriptor_table.clone(), + &self.descriptor_table, self.global_id(), - self.block_cache.clone(), + &self.block_cache, first_block_handle, - None, ); reader.lo_block_size = block.header.data_length.into(); reader.lo_block_items = Some(ValueBlockConsumer::with_bounds(block, Some(key), None)); diff --git a/src/segment/reader.rs b/src/segment/reader.rs index 058110b0..932fabb6 100644 --- a/src/segment/reader.rs +++ b/src/segment/reader.rs @@ -13,8 +13,9 @@ use crate::{ use std::sync::Arc; pub struct Reader { - descriptor_table: Arc, segment_id: GlobalSegmentId, + + descriptor_table: Arc, block_cache: Arc, data_block_boundary: BlockOffset, From 8175914c711d43aadcde6fee9b78e9c608904de5 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 20 Dec 2024 14:44:24 +0100 Subject: [PATCH 81/90] revert crossbeam-skiplist for now --- Cargo.toml | 2 +- src/key.rs | 98 ++++++++++++++++++++++----------------------- src/memtable/mod.rs | 4 +- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b29604aa..57cda897 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ bytes = ["value-log/bytes"] [dependencies] byteorder = "1.5.0" -crossbeam-skiplist = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "45425b032b75d40c8f79be2133eb7d33aaa1d4e4", package = "crossbeam-skiplist" } +crossbeam-skiplist = "0.1.3" double-ended-peekable = "0.1.0" enum_dispatch = "0.3.13" guardian = "1.1.0" diff --git a/src/key.rs b/src/key.rs index b52729da..45ed596f 100644 --- a/src/key.rs +++ b/src/key.rs @@ -7,7 +7,6 @@ use crate::{ SeqNo, UserKey, ValueType, }; use byteorder::{ReadBytesExt, WriteBytesExt}; -use crossbeam_skiplist::equivalent::{Comparable, Equivalent}; use std::{ cmp::Reverse, io::{Read, Write}, @@ -106,53 +105,54 @@ impl Ord for InternalKey { } } -impl Equivalent> for InternalKey { - fn equivalent(&self, other: &InternalKeyRef<'_>) -> bool { - self.user_key == other.user_key && self.seqno == other.seqno - } -} - -impl Comparable> for InternalKey { - fn compare(&self, other: &InternalKeyRef<'_>) -> std::cmp::Ordering { - (&*self.user_key, Reverse(self.seqno)).cmp(&(other.user_key, Reverse(other.seqno))) - } -} - -// REF +// TODO: wait for new crossbeam-skiplist +// TODO: https://github.com/crossbeam-rs/crossbeam/pull/1162 +// +// impl Equivalent> for InternalKey { +// fn equivalent(&self, other: &InternalKeyRef<'_>) -> bool { +// self.user_key == other.user_key && self.seqno == other.seqno +// } +// } + +// impl Comparable> for InternalKey { +// fn compare(&self, other: &InternalKeyRef<'_>) -> std::cmp::Ordering { +// (&*self.user_key, Reverse(self.seqno)).cmp(&(other.user_key, Reverse(other.seqno))) +// } +// } // Temporary internal key without heap allocation -#[derive(Debug, Eq)] -pub struct InternalKeyRef<'a> { - pub user_key: &'a [u8], - pub seqno: SeqNo, - pub value_type: ValueType, -} - -impl<'a> InternalKeyRef<'a> { - // Constructor for InternalKeyRef - pub fn new(user_key: &'a [u8], seqno: u64, value_type: ValueType) -> Self { - InternalKeyRef { - user_key, - seqno, - value_type, - } - } -} - -impl<'a> PartialEq for InternalKeyRef<'a> { - fn eq(&self, other: &Self) -> bool { - self.user_key == other.user_key && self.seqno == other.seqno - } -} - -impl<'a> PartialOrd for InternalKeyRef<'a> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'a> Ord for InternalKeyRef<'a> { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) - } -} +// #[derive(Debug, Eq)] +// pub struct InternalKeyRef<'a> { +// pub user_key: &'a [u8], +// pub seqno: SeqNo, +// pub value_type: ValueType, +// } + +// impl<'a> InternalKeyRef<'a> { +// // Constructor for InternalKeyRef +// pub fn new(user_key: &'a [u8], seqno: u64, value_type: ValueType) -> Self { +// InternalKeyRef { +// user_key, +// seqno, +// value_type, +// } +// } +// } + +// impl<'a> PartialEq for InternalKeyRef<'a> { +// fn eq(&self, other: &Self) -> bool { +// self.user_key == other.user_key && self.seqno == other.seqno +// } +// } + +// impl<'a> PartialOrd for InternalKeyRef<'a> { +// fn partial_cmp(&self, other: &Self) -> Option { +// Some(self.cmp(other)) +// } +// } + +// impl<'a> Ord for InternalKeyRef<'a> { +// fn cmp(&self, other: &Self) -> std::cmp::Ordering { +// (&self.user_key, Reverse(self.seqno)).cmp(&(&other.user_key, Reverse(other.seqno))) +// } +// } diff --git a/src/memtable/mod.rs b/src/memtable/mod.rs index b96ea0a9..a5b390f1 100644 --- a/src/memtable/mod.rs +++ b/src/memtable/mod.rs @@ -2,7 +2,7 @@ // This source code is licensed under both the Apache 2.0 and MIT License // (found in the LICENSE-* files in the repository) -use crate::key::{InternalKey, InternalKeyRef}; +use crate::key::InternalKey; use crate::segment::block::ItemSize; use crate::value::{InternalValue, SeqNo, UserValue, ValueType}; use crossbeam_skiplist::SkipMap; @@ -84,7 +84,7 @@ impl Memtable { // abcdef -> 6 // abcdef -> 5 // - let lower_bound = InternalKeyRef::new( + let lower_bound = InternalKey::new( key, match seqno { Some(seqno) => seqno - 1, From 72f69fdd7c3329bdc11f42da65635008e1db80ef Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 20 Dec 2024 17:12:59 +0100 Subject: [PATCH 82/90] refactor: simplify get_internal_entry --- src/blob_tree/gc/reader.rs | 2 +- src/tree/mod.rs | 43 +++++++++--------------------------- tests/major_compaction.rs | 6 ++--- tests/segment_point_reads.rs | 4 ++-- tests/snapshot_point_read.rs | 2 +- tests/tree_write_read.rs | 12 +++++----- 6 files changed, 23 insertions(+), 46 deletions(-) diff --git a/src/blob_tree/gc/reader.rs b/src/blob_tree/gc/reader.rs index 2ab94957..6cef7e2a 100644 --- a/src/blob_tree/gc/reader.rs +++ b/src/blob_tree/gc/reader.rs @@ -20,7 +20,7 @@ impl<'a> GcReader<'a> { fn get_internal(&self, key: &[u8]) -> crate::Result> { let Some(item) = self .tree - .get_internal_entry_with_lock(self.memtable, key, true, None)? + .get_internal_entry_with_lock(self.memtable, key, None)? .map(|x| x.value) else { return Ok(None); diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 4e609734..7fcab188 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -355,13 +355,11 @@ impl AbstractTree for Tree { key: K, seqno: SeqNo, ) -> crate::Result> { - Ok(self - .get_internal_entry(key, true, Some(seqno))? - .map(|x| x.value)) + Ok(self.get_internal_entry(key, Some(seqno))?.map(|x| x.value)) } fn get>(&self, key: K) -> crate::Result> { - Ok(self.get_internal_entry(key, true, None)?.map(|x| x.value)) + Ok(self.get_internal_entry(key, None)?.map(|x| x.value)) } fn iter_with_seqno( @@ -578,25 +576,18 @@ impl Tree { &self, memtable_lock: &RwLockWriteGuard<'_, Memtable>, key: K, - evict_tombstone: bool, seqno: Option, ) -> crate::Result> { if let Some(entry) = memtable_lock.get(&key, seqno) { - if evict_tombstone { - return Ok(ignore_tombstone_value(entry)); - } - return Ok(Some(entry)); + return Ok(ignore_tombstone_value(entry)); }; // Now look in sealed memtables if let Some(entry) = self.get_internal_entry_from_sealed_memtables(&key, seqno) { - if evict_tombstone { - return Ok(ignore_tombstone_value(entry)); - } - return Ok(Some(entry)); + return Ok(ignore_tombstone_value(entry)); } - self.get_internal_entry_from_segments(key, evict_tombstone, seqno) + self.get_internal_entry_from_segments(key, seqno) } fn get_internal_entry_from_sealed_memtables>( @@ -618,7 +609,6 @@ impl Tree { fn get_internal_entry_from_segments>( &self, key: K, - evict_tombstone: bool, // TODO: remove?, just always true seqno: Option, ) -> crate::Result> { // NOTE: Create key hash for hash sharing @@ -648,10 +638,7 @@ impl Tree { let maybe_item = segment.get_with_hash(&key, seqno, key_hash)?; if let Some(item) = maybe_item { - if evict_tombstone { - return Ok(ignore_tombstone_value(item)); - } - return Ok(Some(item)); + return Ok(ignore_tombstone_value(item)); } } @@ -668,10 +655,7 @@ impl Tree { let maybe_item = segment.get_with_hash(&key, seqno, key_hash)?; if let Some(item) = maybe_item { - if evict_tombstone { - return Ok(ignore_tombstone_value(item)); - } - return Ok(Some(item)); + return Ok(ignore_tombstone_value(item)); } } } @@ -683,7 +667,6 @@ impl Tree { pub fn get_internal_entry>( &self, key: K, - evict_tombstone: bool, // TODO: remove?, just always true seqno: Option, ) -> crate::Result> { // TODO: consolidate memtable & sealed behind single RwLock @@ -691,24 +674,18 @@ impl Tree { let memtable_lock = self.active_memtable.read().expect("lock is poisoned"); if let Some(entry) = memtable_lock.get(&key, seqno) { - if evict_tombstone { - return Ok(ignore_tombstone_value(entry)); - } - return Ok(Some(entry)); + return Ok(ignore_tombstone_value(entry)); }; drop(memtable_lock); // Now look in sealed memtables if let Some(entry) = self.get_internal_entry_from_sealed_memtables(&key, seqno) { - if evict_tombstone { - return Ok(ignore_tombstone_value(entry)); - } - return Ok(Some(entry)); + return Ok(ignore_tombstone_value(entry)); } // Now look in segments... this may involve disk I/O - self.get_internal_entry_from_segments(key, evict_tombstone, seqno) + self.get_internal_entry_from_segments(key, seqno) } #[doc(hidden)] diff --git a/tests/major_compaction.rs b/tests/major_compaction.rs index 9976002f..f5f92bf5 100644 --- a/tests/major_compaction.rs +++ b/tests/major_compaction.rs @@ -20,17 +20,17 @@ fn tree_major_compaction() -> lsm_tree::Result<()> { tree.major_compact(u64::MAX, 1_000 /* NOTE: Simulate some time passing */)?; assert_eq!(1, tree.segment_count()); - let item = tree.get_internal_entry("a", true, None)?.unwrap(); + let item = tree.get_internal_entry("a", None)?.unwrap(); assert_eq!(&*item.key.user_key, "a".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 0); - let item = tree.get_internal_entry("b", true, None)?.unwrap(); + let item = tree.get_internal_entry("b", None)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 1); - let item = tree.get_internal_entry("c", true, None)?.unwrap(); + let item = tree.get_internal_entry("c", None)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 2); diff --git a/tests/segment_point_reads.rs b/tests/segment_point_reads.rs index 5dd39b03..107e5a95 100644 --- a/tests/segment_point_reads.rs +++ b/tests/segment_point_reads.rs @@ -47,7 +47,7 @@ fn segment_point_reads_mvcc() -> lsm_tree::Result<()> { for x in 0..ITEM_COUNT as u64 { let key = x.to_be_bytes(); - let item = tree.get_internal_entry(key, true, None)?.unwrap(); + let item = tree.get_internal_entry(key, None)?.unwrap(); assert_eq!(item.key.seqno, 2); assert_eq!(&*item.value, b"2"); @@ -89,7 +89,7 @@ fn segment_point_reads_mvcc_slab() -> lsm_tree::Result<()> { tree.flush_active_memtable(0)?; for key in &keys { - let item = tree.get_internal_entry(key, true, None)?.unwrap(); + let item = tree.get_internal_entry(key, None)?.unwrap(); assert_eq!(item.key.seqno, ITEM_COUNT as u64 - 1); } diff --git a/tests/snapshot_point_read.rs b/tests/snapshot_point_read.rs index f459c2a9..87f1cf32 100644 --- a/tests/snapshot_point_read.rs +++ b/tests/snapshot_point_read.rs @@ -60,7 +60,7 @@ fn snapshot_lots_of_versions() -> lsm_tree::Result<()> { for seqno in 1..version_count { let item = tree - .get_internal_entry(key, true, Some(seqno))? + .get_internal_entry(key, Some(seqno))? .expect("should exist"); assert_eq!(format!("abc{}", version_count).as_bytes(), &*item.value); diff --git a/tests/tree_write_read.rs b/tests/tree_write_read.rs index 45ef1bce..8f6e856b 100644 --- a/tests/tree_write_read.rs +++ b/tests/tree_write_read.rs @@ -11,17 +11,17 @@ fn tree_write_and_read() -> lsm_tree::Result<()> { tree.insert("b".as_bytes(), nanoid::nanoid!().as_bytes(), 1); tree.insert("c".as_bytes(), nanoid::nanoid!().as_bytes(), 2); - let item = tree.get_internal_entry("a", true, None)?.unwrap(); + let item = tree.get_internal_entry("a", None)?.unwrap(); assert_eq!(&*item.key.user_key, "a".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 0); - let item = tree.get_internal_entry("b", true, None)?.unwrap(); + let item = tree.get_internal_entry("b", None)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 1); - let item = tree.get_internal_entry("c", true, None)?.unwrap(); + let item = tree.get_internal_entry("c", None)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 2); @@ -30,17 +30,17 @@ fn tree_write_and_read() -> lsm_tree::Result<()> { let tree = Config::new(folder).open()?; - let item = tree.get_internal_entry("a", true, None)?.unwrap(); + let item = tree.get_internal_entry("a", None)?.unwrap(); assert_eq!(&*item.key.user_key, "a".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 0); - let item = tree.get_internal_entry("b", true, None)?.unwrap(); + let item = tree.get_internal_entry("b", None)?.unwrap(); assert_eq!(&*item.key.user_key, "b".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 1); - let item = tree.get_internal_entry("c", true, None)?.unwrap(); + let item = tree.get_internal_entry("c", None)?.unwrap(); assert_eq!(&*item.key.user_key, "c".as_bytes()); assert!(!item.is_tombstone()); assert_eq!(item.key.seqno, 2); From 13d725205e8a12fd52ad71c53ab7cb6692633cd9 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 20 Dec 2024 17:15:30 +0100 Subject: [PATCH 83/90] leveled compaction: only consider best trivial move --- src/compaction/leveled.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index d4997f09..669c1bbf 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -121,8 +121,8 @@ fn pick_minimal_compaction( } } - // NOTE: Find trivial moves - for size in (1..=curr_level.len()).rev() { + // NOTE: Find largest trivial move (if it exists) + 'trivial_move_search: for size in (1..=curr_level.len()).rev() { let windows = curr_level.windows(size); for window in windows { @@ -136,6 +136,7 @@ fn pick_minimal_compaction( segment_ids, can_trivial_move: true, }); + break 'trivial_move_search; } } } From 540013d0a650b69aa4411cf6ce868dd06e72b183 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 22 Dec 2024 22:54:19 +0100 Subject: [PATCH 84/90] fix(leveled compaction): trivial moves --- src/compaction/leveled.rs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 669c1bbf..20656e9c 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -43,10 +43,7 @@ fn pick_minimal_compaction( // NOTE: Keep compactions with 25 or less segments // to make compactions not too large - // - // TODO: ideally, if a level has a lot of compaction debt - // compactions could be parallelized as long as they don't overlap in key range - valid_choice &= choice.segment_ids.len() <= 25; + valid_choice &= choice.can_trivial_move || choice.segment_ids.len() <= 25; if valid_choice { choices.push(choice); @@ -126,14 +123,12 @@ fn pick_minimal_compaction( let windows = curr_level.windows(size); for window in windows { - let segment_ids: HashSet = window.iter().map(Segment::id).collect(); - let key_range = aggregate_key_range(window); if next_level.overlapping_segments(&key_range).next().is_none() { add_choice(Choice { write_amp: 0.0, - segment_ids, + segment_ids: window.iter().map(Segment::id).collect(), can_trivial_move: true, }); break 'trivial_move_search; @@ -251,7 +246,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - .filter(|x| !levels.hidden_set().is_hidden(x.id())) + // .filter(|x| !levels.hidden_set().is_hidden(x.id())) .map(|x| x.metadata.file_size) .sum(); From a89bf4cee39ba566e03699b0db2d4cf719b8ea2b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Sun, 22 Dec 2024 22:59:52 +0100 Subject: [PATCH 85/90] reenable size filter --- src/compaction/leveled.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compaction/leveled.rs b/src/compaction/leveled.rs index 20656e9c..0a000899 100644 --- a/src/compaction/leveled.rs +++ b/src/compaction/leveled.rs @@ -246,7 +246,7 @@ impl CompactionStrategy for Strategy { .iter() // NOTE: Take bytes that are already being compacted into account, // otherwise we may be overcompensating - // .filter(|x| !levels.hidden_set().is_hidden(x.id())) + .filter(|x| !levels.hidden_set().is_hidden(x.id())) .map(|x| x.metadata.file_size) .sum(); From 000fa085676d2b674373e6bd9c8da62219cb687e Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 23 Dec 2024 21:13:39 +0100 Subject: [PATCH 86/90] refactor --- src/abstract.rs | 10 ---------- src/blob_tree/mod.rs | 21 --------------------- src/tree/mod.rs | 12 ------------ 3 files changed, 43 deletions(-) diff --git a/src/abstract.rs b/src/abstract.rs index 736a0da0..63de101b 100644 --- a/src/abstract.rs +++ b/src/abstract.rs @@ -566,16 +566,6 @@ pub trait AbstractTree { seqno: SeqNo, ) -> (u32, u32); - /// Inserts a key-value pair. - fn raw_insert_with_lock, V: Into>( - &self, - lock: &RwLockWriteGuard<'_, Memtable>, - key: K, - value: V, - seqno: SeqNo, - r#type: ValueType, - ) -> (u32, u32); - /// Removes an item from the tree. /// /// Returns the added item's size and new size of the memtable. diff --git a/src/blob_tree/mod.rs b/src/blob_tree/mod.rs index 5084c09e..b38b1ccb 100644 --- a/src/blob_tree/mod.rs +++ b/src/blob_tree/mod.rs @@ -596,27 +596,6 @@ impl AbstractTree for BlobTree { ) } - fn raw_insert_with_lock, V: Into>( - &self, - lock: &RwLockWriteGuard<'_, Memtable>, - key: K, - value: V, - seqno: SeqNo, - r#type: ValueType, - ) -> (u32, u32) { - use value::MaybeInlineValue; - - // NOTE: Initially, we always write an inline value - // On memtable flush, depending on the values' sizes, they will be separated - // into inline or indirect values - let item = MaybeInlineValue::Inline(value.into()); - - let value = item.encode_into_vec(); - - let value = InternalValue::from_components(key, value, seqno, r#type); - lock.insert(value) - } - fn insert, V: Into>( &self, key: K, diff --git a/src/tree/mod.rs b/src/tree/mod.rs index 7fcab188..f5ad98f9 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -412,18 +412,6 @@ impl AbstractTree for Tree { self.append_entry(value) } - fn raw_insert_with_lock, V: Into>( - &self, - lock: &RwLockWriteGuard<'_, Memtable>, - key: K, - value: V, - seqno: SeqNo, - r#type: ValueType, - ) -> (u32, u32) { - let value = InternalValue::from_components(key, value, seqno, r#type); - lock.insert(value) - } - fn remove>(&self, key: K, seqno: SeqNo) -> (u32, u32) { let value = InternalValue::new_tombstone(key, seqno); self.append_entry(value) From f4e5e58f6a6dab8fa4627bd3f4c9a86f7bef9fde Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Fri, 27 Dec 2024 21:13:48 +0100 Subject: [PATCH 87/90] change logging --- src/tree/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/tree/mod.rs b/src/tree/mod.rs index f5ad98f9..8916bb43 100644 --- a/src/tree/mod.rs +++ b/src/tree/mod.rs @@ -866,9 +866,8 @@ impl Tree { let tree_path = tree_path.as_ref(); - log::info!("Recovering LSM-tree at {tree_path:?}"); - let level_manifest_path = tree_path.join(LEVELS_MANIFEST_FILE); + log::info!("Recovering manifest at {level_manifest_path:?}"); let segment_id_map = LevelManifest::recover_ids(&level_manifest_path)?; let cnt = segment_id_map.len(); From 0198adaa2d5647b75c44b6a58816f37b098ac72b Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Mon, 6 Jan 2025 13:22:45 +0100 Subject: [PATCH 88/90] refactor --- src/bloom/bit_array.rs | 12 ------------ src/bloom/mod.rs | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/bloom/bit_array.rs b/src/bloom/bit_array.rs index 779943dc..e4f0535c 100644 --- a/src/bloom/bit_array.rs +++ b/src/bloom/bit_array.rs @@ -39,18 +39,6 @@ impl BitArray { Self(bytes) } - /// Size in bytes - #[must_use] - pub fn len(&self) -> usize { - self.0.len() - } - - #[allow(unused)] - #[must_use] - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - #[must_use] pub fn bytes(&self) -> &[u8] { &self.0 diff --git a/src/bloom/mod.rs b/src/bloom/mod.rs index 8ce117ac..cc45b203 100644 --- a/src/bloom/mod.rs +++ b/src/bloom/mod.rs @@ -88,7 +88,7 @@ impl BloomFilter { /// Size of bloom filter in bytes. #[must_use] pub fn len(&self) -> usize { - self.inner.len() + self.inner.bytes().len() } fn from_raw(m: usize, k: usize, bytes: Box<[u8]>) -> Self { From e3e264d09d5b80d04a46e573ac8950ae624e9278 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Jan 2025 19:57:12 +0100 Subject: [PATCH 89/90] update value log --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 57cda897..6a826deb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ quick_cache = { version = "0.6.5", default-features = false, features = [] } rustc-hash = "2.0.0" self_cell = "1.0.4" tempfile = "3.12.0" -value-log = "1.4.0" +value-log = "1.4.1" varint-rs = "2.2.0" xxhash-rust = { version = "0.8.12", features = ["xxh3"] } From abddb10e41ccd6ad69453d5c42f4fdbdfc2f0f77 Mon Sep 17 00:00:00 2001 From: marvin-j97 Date: Tue, 7 Jan 2025 20:01:46 +0100 Subject: [PATCH 90/90] 2.5.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6a826deb..a7028792 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "lsm-tree" description = "A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs)" license = "MIT OR Apache-2.0" -version = "2.5.0-pre.0" +version = "2.5.0" edition = "2021" rust-version = "1.74.0" readme = "README.md"