Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add associated functions/methods for exposing/loading filter data in BloomFilter + CuckooFilter #125

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions src/filters/bloomfilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,20 @@ where
Self::with_params_and_hash(m, k, bh)
}

/// Restore a BloomFilter from its internal parameters and a previously exported bitmap.
///
/// - `m` is the number of bits used to store state
/// - `k` is the number of hash functions
/// - `bitmap` is the bitmap from an existing bloom filter
///
/// The `BuildHasher` is a default-constructed `BuildHasherDefault<DefaultHasher>`.
pub fn with_existing_filter<I: IntoIterator<Item = u32>>(
    m: usize,
    k: usize,
    bitmap: I,
) -> Self {
    Self::with_existing_filter_and_hash(
        m,
        k,
        bitmap,
        BuildHasherDefault::<DefaultHasher>::default(),
    )
}

/// Create new, empty BloomFilter with given properties.
///
/// - `n` number of unique elements the BloomFilter is expected to hold, must be `> 0`
Expand Down Expand Up @@ -207,6 +221,21 @@ where
}
}

/// Same as `with_existing_filter` but with specific `BuildHasher`.
///
/// - `m` is the number of bits used to store state
/// - `k` is the number of hash functions
/// - `bitmap` yields the `u32` blocks of an existing bloom filter's bitmap
/// - `buildhasher` is handed to the `HashIterBuilder` together with `m` and `k`
///
/// NOTE(review): nothing in this constructor checks that `bitmap` yields the
/// number of blocks implied by `m`. How `FixedBitSet::with_capacity_and_blocks`
/// treats a too-short or too-long iterator is not visible here -- confirm
/// against the fixedbitset documentation.
///
/// NOTE(review): presumably `m`, `k` and the hasher must match the filter the
/// bitmap was exported from for queries to be meaningful -- verify.
pub fn with_existing_filter_and_hash<I: IntoIterator<Item = u32>>(
m: usize,
k: usize,
bitmap: I,
buildhasher: B,
) -> Self {
Self {
// The bit set is built directly from the supplied blocks with capacity `m`.
bs: FixedBitSet::with_capacity_and_blocks(m, bitmap),
k,
// The hash iterator builder receives the same `m` and `k` as the bit set.
builder: HashIterBuilder::new(m, k, buildhasher),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this check that the iterator has the correct number of elements for the given parameters?

phantom: PhantomData,
}
}

/// Same as `with_properties` but with specific `BuildHasher`.
pub fn with_properties_and_hash(n: usize, p: f64, buildhasher: B) -> Self {
assert!(n > 0, "n must be greater than 0");
Expand Down Expand Up @@ -237,6 +266,11 @@ where
pub fn buildhasher(&self) -> &B {
self.builder.buildhasher()
}

/// Get bitmap data.
pub fn bitmap(&self) -> &[u32] {
self.bs.as_slice()
}
}

impl<T, B> Filter<T> for BloomFilter<T, B>
Expand Down Expand Up @@ -529,4 +563,22 @@ mod tests {
let bf = BloomFilter::<NotSend>::with_params(100, 2);
assert_send(&bf);
}

#[test]
fn bitmap_save_load() {
    // Elements stored before the export; each must still be reported after the reload.
    let members = [1, 7, 52];

    let mut original = BloomFilter::with_params(100, 2);
    for member in &members {
        assert!(original.insert(member).unwrap());
    }

    // Rebuild a filter from the exported blocks using the same `m` and `k`.
    let exported = original.bitmap().to_vec();
    let restored = BloomFilter::with_existing_filter(100, 2, exported);

    for member in &members {
        assert!(restored.query(member));
    }
    // An element that was never inserted must not be reported.
    assert!(!restored.query(&15));
}
}
121 changes: 120 additions & 1 deletion src/filters/cuckoofilter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash, Hasher};
use std::marker::PhantomData;

use rand::Rng;
use succinct::{IntVec, IntVecMut, IntVector};
use succinct::{BitVec, BitVecMut, IntVec, IntVecMut, IntVector};

use crate::filters::Filter;
use crate::helpers::all_zero_intvector;
Expand Down Expand Up @@ -186,6 +186,36 @@ where
Self::with_params_and_hash(rng, bucketsize, n_buckets, l_fingerprint, bh)
}

/// Create CuckooFilter from previously exported filter table data:
///
/// - `rng`: random number generator used for certain random actions
/// - `bucketsize`: number of elements per bucket, must be at least 2
/// - `n_buckets`: number of buckets, must be a power of 2 and at least 2
/// - `l_fingerprint`: size of the fingerprint in bits
/// - `n_elements`: number of elements in existing filter
/// - `table_succinct_blocks`: filter table block data
///
/// The BuildHasher is set to the `DefaultHasher`.
pub fn with_existing_filter<I: IntoIterator<Item = u64>>(
    rng: R,
    bucketsize: usize,
    n_buckets: usize,
    l_fingerprint: usize,
    n_elements: usize,
    table_succinct_blocks: I,
) -> Self {
    // Delegate to the hasher-aware constructor with a default-constructed hasher.
    Self::with_existing_filter_and_hash(
        rng,
        bucketsize,
        n_buckets,
        l_fingerprint,
        n_elements,
        table_succinct_blocks,
        BuildHasherDefault::<DefaultHasher>::default(),
    )
}

/// Construct new `bucketsize=4`-cuckoofilter with properties:
///
/// - `false_positive_rate`: false positive lookup rate
Expand Down Expand Up @@ -260,6 +290,28 @@ where
}
}

/// Same as `with_existing_filter` but with specific `BuildHasher`.
///
/// Builds a filter via `with_params_and_hash`, overwrites its table block by
/// block with the values yielded by `table_succinct_blocks`, and finally sets
/// the stored element count to `n_elements`.
///
/// An iterator yielding fewer blocks than the table holds is accepted; the
/// remaining blocks keep whatever `with_params_and_hash` initialised them to.
///
/// # Panics
///
/// Panics if `table_succinct_blocks` yields more blocks than
/// `filter.table.block_len()`.
///
/// NOTE(review): `n_elements` is stored as given and is not cross-checked
/// against the loaded table contents -- confirm callers pass a consistent value.
pub fn with_existing_filter_and_hash<I: IntoIterator<Item = u64>>(
rng: R,
bucketsize: usize,
n_buckets: usize,
l_fingerprint: usize,
n_elements: usize,
table_succinct_blocks: I,
bh: B,
) -> Self {
// Start from a filter built with the requested geometry and hasher.
let mut filter = Self::with_params_and_hash(rng, bucketsize, n_buckets, l_fingerprint, bh);
for (i, block) in table_succinct_blocks.into_iter().enumerate() {
// Reject input that is longer than the table before writing the block.
assert!(
i < filter.table.block_len(),
"existing input table block length must not exceed filter table block length"
);
filter.table.set_block(i, block);
}
// The element count is taken from the caller, not derived from the table.
filter.n_elements = n_elements;
Comment on lines +304 to +311
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this just use filter.load_table(...) instead of a duplicate code block?

filter
}

/// Construct new `bucketsize=4`-cuckoofilter with properties:
///
/// - `false_positive_rate`: false positive lookup rate
Expand Down Expand Up @@ -481,6 +533,35 @@ where
self.table.set(pos as u64, data);
}
}

/// Clear and load filter table with individual filter table elements
/// and existing element count.
///
/// - `table`: yields the table elements in index order, starting at index 0
/// - `n_elements`: element count stored on the filter after loading
///
/// The filter is cleared first, so when `table` yields fewer elements than
/// the table holds, the remaining slots keep the state `clear()` left them in.
///
/// # Panics
///
/// Panics if `table` yields more elements than `self.table.len()`.
///
/// NOTE(review): `n_elements` is stored as given and is not validated against
/// the loaded data -- confirm callers pass a consistent value.
pub fn load_table<I: IntoIterator<Item = u64>>(&mut self, table: I, n_elements: usize) {
self.clear();
for (i, value) in table.into_iter().enumerate() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably validate that the iterator has the correct length because this depends on the filter parameters.

// The table is indexed with `u64`, so convert the enumeration index.
let i = i as u64;
assert!(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be a should_panic test that triggers this assertion.

i < self.table.len(),
"input table length must not exceed filter table length"
);
self.table.set(i, value);
}
// The element count is taken from the caller, not derived from the table.
self.n_elements = n_elements;
}

/// Copy every individual filter table element into a newly allocated vector.
///
/// The result can be fed back into `load_table` to restore the table.
pub fn table(&self) -> Vec<u64> {
    let mut elements = Vec::new();
    elements.extend(self.table.iter());
    elements
}

/// Copy the filter table's succinct block data into a newly allocated vector.
///
/// Blocks appear in index order, from block `0` up to `block_len() - 1`.
pub fn table_succinct_blocks(&self) -> Vec<u64> {
    (0..self.table.block_len())
        .map(|block_idx| self.table.get_block(block_idx))
        .collect()
}
}

impl<T, R, B> Filter<T> for CuckooFilter<T, R, B>
Expand Down Expand Up @@ -949,4 +1030,42 @@ mod tests {
let cf = CuckooFilter::<NotSend, _>::with_params(ChaChaRng::from_seed([0; 32]), 2, 16, 8);
assert_send(&cf);
}

#[test]
fn succinct_table_save_load() {
    // Elements stored before the export; each must still be reported after the reload.
    let members = [10, 51];

    let mut source = CuckooFilter::with_params(ChaChaRng::from_seed([0; 32]), 2, 16, 8);
    for member in &members {
        assert!(source.insert(member).unwrap());
    }
    assert_eq!(source.len(), 2);

    // Rebuild from the exported block data using the same filter geometry.
    let stored_count = source.len();
    let exported_blocks = source.table_succinct_blocks();
    let restored = CuckooFilter::with_existing_filter(
        ChaChaRng::from_seed([0; 32]),
        2,
        16,
        8,
        stored_count,
        exported_blocks,
    );

    for member in &members {
        assert!(restored.query(member));
    }
    // An element that was never inserted must not be reported.
    assert!(!restored.query(&33));
    assert_eq!(restored.len(), 2);
}

#[test]
fn table_save_load() {
    // Elements stored before the export; each must still be reported after the reload.
    let members = [10, 51];

    let mut source = CuckooFilter::with_params(ChaChaRng::from_seed([0; 32]), 2, 16, 8);
    for member in &members {
        assert!(source.insert(member).unwrap());
    }
    assert_eq!(source.len(), 2);

    // Load the exported elements into a second filter with the same geometry.
    let mut restored = CuckooFilter::with_params(ChaChaRng::from_seed([0; 32]), 2, 16, 8);
    let exported_elements = source.table();
    restored.load_table(exported_elements, source.len());

    for member in &members {
        assert!(restored.query(member));
    }
    // An element that was never inserted must not be reported.
    assert!(!restored.query(&33));
    assert_eq!(restored.len(), 2);
}
}