Skip to content

Commit

Permalink
Merged LevelSizeChooser with SimpleLevelSizeChooser
Browse files Browse the repository at this point in the history
  • Loading branch information
beling committed Sep 28, 2024
1 parent a13c485 commit b3b8ce8
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 59 deletions.
5 changes: 3 additions & 2 deletions csf/src/fp/cmap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,9 @@ impl<C: Coding, S: BuildSeededHasher> CMap<C, S> {
let mut level_nr = 0u32;
while input_size != 0 {
let level_size_segments = conf.level_size_chooser.size_segments(
&value_coding,
&values[0..input_size], &value_rev_indices[0..input_size]);
|| values[0..input_size].iter().zip(value_rev_indices[0..input_size].iter()).map(|(c, ri)| value_coding.rev_fragment_of(*c, *ri) as u64),
input_size,
value_coding.bits_per_fragment());
let level_size = level_size_segments * 64;
stats.level(input_size, level_size);
let mut collision_solver = conf.collision_solver.new(level_size_segments, value_coding.bits_per_fragment());
Expand Down
7 changes: 5 additions & 2 deletions csf/src/fp/gocmap/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,11 @@ impl<C: Coding, GS: GroupSize, SS: SeedSize, S: BuildSeededHasher> GOCMap<C, GS,
let in_keys = &keys[0..input_size];
let in_values = &values[0..input_size];
let in_value_rev_indices = &value_rev_indices[0..input_size];

let suggested_level_size_segments = conf.level_size_chooser.size_segments(&value_coding, in_values, in_value_rev_indices);
let suggested_level_size_segments = conf.level_size_chooser.size_segments(
|| in_values.iter().zip(in_value_rev_indices.iter()).map(|(c, ri)| value_coding.rev_fragment_of(*c, *ri) as u64),
input_size,
value_coding.bits_per_fragment());

let (level_size_groups, level_size_segments) = conf.goconf.bits_per_group.level_size_groups_segments(suggested_level_size_segments * 64);
//let seed = level_nr;
stats.level(input_size, level_size_segments * 64);
Expand Down
17 changes: 17 additions & 0 deletions csf/src/fp/kvset.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::collections::{BTreeMap, HashMap};

use crate::{bits_to_store_any_of, bits_to_store_any_of_ref};

/// A trait for accessing and managing sets of key (of the type `K`) and value pairs
/// during construction of [`fp::Map`](super::Map) or [`fp::GOMap`](super::GOMap).
pub trait KVSet<K> {
Expand All @@ -11,6 +13,9 @@ pub trait KVSet<K> {
/// If `self` doesn't remember which keys are retained it uses `retained_hint` to check this.
fn for_each_key_value<F, P>(&self, f: F, retained_hint: P) where F: FnMut(&K, u8), P: FnMut(&K) -> bool;

/// Returns minimal number of bits that can store any value.
fn bits_per_value(&self) -> u8;

/// Calls `map` for each key-value pair in the set, and returns outputs of these calls. Uses single thread.
///
/// If `self` doesn't remember which keys are retained it uses `retained_hint` to check this.
Expand Down Expand Up @@ -63,6 +68,10 @@ impl<K, S> KVSet<K> for HashMap<K, u8, S> {
for (k, v) in self { f(k, *v) }
}

/// Returns the bit width that `bits_to_store_any_of_ref` reports for the values held by this map.
fn bits_per_value(&self) -> u8 {
    let stored_values = self.values();
    bits_to_store_any_of_ref(stored_values)
}

fn retain_keys<F, P, R>(&mut self, mut filter: F, _retained_earlier: P, _remove_count: R)
where F: FnMut(&K) -> bool, P: FnMut(&K) -> bool, R: FnMut() -> usize
{
Expand All @@ -77,6 +86,10 @@ impl<K: Ord> KVSet<K> for BTreeMap<K, u8> {
for (k, v) in self { f(k, *v) }
}

/// Returns the bit width that `bits_to_store_any_of_ref` reports for the values held by this map.
fn bits_per_value(&self) -> u8 {
    let stored_values = self.values();
    bits_to_store_any_of_ref(stored_values)
}

fn retain_keys<F, P, R>(&mut self, mut filter: F, _retained_earlier: P, _remove_count: R)
where F: FnMut(&K) -> bool, P: FnMut(&K) -> bool, R: FnMut() -> usize
{
Expand Down Expand Up @@ -114,6 +127,10 @@ impl<'k, K: Sync> KVSet<K> for SlicesMutSource<'k, K> {
}
}

/// Returns the bit width that `bits_to_store_any_of_ref` reports for the `values` slice of this source.
fn bits_per_value(&self) -> u8 {
    let stored_values = self.values.iter();
    bits_to_store_any_of_ref(stored_values)
}

#[inline(always)] fn map_each_key_value<R, M, P>(&self, mut map: M, _retained_hint: P) -> Vec<R>
where M: FnMut(&K, u8) -> R, P: FnMut(&K) -> bool
{
Expand Down
74 changes: 27 additions & 47 deletions csf/src/fp/level_size_chooser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,15 @@ use std::mem::MaybeUninit;
use fsum::FSum;
use std::fmt;
use std::fmt::Formatter;
use crate::coding::Coding;

/// Chooses the size of level for the given level input.
/// Chooses the size of level for the given sequence of retained values.
pub trait LevelSizeChooser {

/// Returns number of 64-bit segments to use for given level input.
fn size_segments<C: Coding>(&self, _coding: &C, values: &[C::Codeword], _value_rev_indices: &[u8]) -> usize {
self.max_size_segments(values.len())
}

/// Returns the maximal number of segments that can be returned by `size_segments` for a level of size `max_level_size` or less.
fn max_size_segments(&self, max_level_size: usize) -> usize;
}

pub trait SimpleLevelSizeChooser {

/// Returns number of 64-bit segments to use for given level input.
fn size_segments(&self, values: &[u8], _bits_per_value: u8) -> usize {
self.max_size_segments(values.len())
/// Returns number of 64-bit segments to use for given sequence of retained `values`.
fn size_segments<VIt, F>(&self, _values: F, values_len: usize, _bits_per_value: u8) -> usize
where VIt: IntoIterator<Item = u64>, F: FnMut() -> VIt
{
self.max_size_segments(values_len)
}

/// Returns the maximal number of segments that can be returned by `size_segments` for a level of size `max_level_size` or less.
Expand All @@ -48,12 +38,6 @@ impl LevelSizeChooser for ProportionalLevelSize {
}
}

impl SimpleLevelSizeChooser for ProportionalLevelSize {
fn max_size_segments(&self, max_level_size: usize) -> usize {
ceiling_div(max_level_size*self.percent as usize, 64*100)
}
}

impl fmt::Display for ProportionalLevelSize {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "{}percent", self.percent)
Expand Down Expand Up @@ -141,7 +125,7 @@ impl OptimalLevelSize {
// poisson(number of fragments to store, input size / array size, number of entries)
}

impl LevelSizeChooser for OptimalLevelSize {
/*impl LevelSizeChooser for OptimalLevelSize {
fn size_segments<C: Coding>(&self, coding: &C, values: &[C::Codeword], value_rev_indices: &[u8]) -> usize {
let mut counts = [0u32; 256];
for (c, ri) in values.iter().zip(value_rev_indices.iter()) {
Expand All @@ -157,15 +141,17 @@ impl LevelSizeChooser for OptimalLevelSize {
fn max_size_segments(&self, max_level_size: usize) -> usize {
ceiling_div(max_level_size, 64)
}
}
}*/

impl SimpleLevelSizeChooser for OptimalLevelSize {
fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize {
let mut counts = [0u32; 256];
for v in values { counts[*v as usize] += 1; }
impl LevelSizeChooser for OptimalLevelSize {
fn size_segments<VIt, F>(&self, mut values: F, values_len: usize, bits_per_value: u8) -> usize
where VIt: IntoIterator<Item = u64>, F: FnMut() -> VIt
{
let mut counts = [0u32; 256]; // TODO support bits_per_value > 8
for v in values() { counts[v as usize] += 1; }
Self::size_segments_for_dist(
&mut counts[0..(1usize<<bits_per_value)],
values.len(),
values_len,
bits_per_value
)
}
Expand Down Expand Up @@ -199,16 +185,18 @@ impl OptimalGroupedLevelSize {
}
}

impl SimpleLevelSizeChooser for OptimalGroupedLevelSize {
fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize {
impl LevelSizeChooser for OptimalGroupedLevelSize {
fn size_segments<VIt, F>(&self, mut values: F, values_len: usize, bits_per_value: u8) -> usize
where VIt: IntoIterator<Item = u64>, F: FnMut() -> VIt
{
let divider = self.divider as usize;
let max_value = (1usize<<bits_per_value) - 1;
(0..divider).map(|delta| {
let mut counts = [0u32; 256];
for v in values { counts[(*v as usize + delta) / divider] += 1; }
let mut counts = [0u32; 256]; // TODO support for bits_per_value > 8
for v in values() { counts[(v as usize + delta) / divider] += 1; }
OptimalLevelSize::size_segments_for_dist(
&mut counts[0 ..= (max_value + delta) / divider],
values.len(),
values_len,
bits_per_value // this must be unchanged as it is used to calculate memory used by a value
)
}).min().unwrap()
Expand Down Expand Up @@ -243,21 +231,13 @@ impl<LSC> ResizedLevel<LSC> {
}

impl<LSC: LevelSizeChooser> LevelSizeChooser for ResizedLevel<LSC> {
fn size_segments<C: Coding>(&self, coding: &C, values: &[C::Codeword], value_rev_indices: &[u8]) -> usize {
self.resized(self.level_size_chooser.size_segments(coding, values, value_rev_indices))
}

fn max_size_segments(&self, max_level_size: usize) -> usize {
self.resized(self.level_size_chooser.max_size_segments(max_level_size))
}
}

impl<LSC: SimpleLevelSizeChooser> SimpleLevelSizeChooser for ResizedLevel<LSC> {
#[inline(always)] fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize {
self.resized(self.level_size_chooser.size_segments(values, bits_per_value))
/// Returns the number of 64-bit segments chosen by the wrapped chooser for the given
/// `values`, passed through `self.resized`.
///
/// `values`, `values_len` and `bits_per_value` are forwarded unchanged to the wrapped chooser.
#[inline] fn size_segments<VIt, F>(&self, values: F, values_len: usize, bits_per_value: u8) -> usize
where VIt: IntoIterator<Item = u64>, F: FnMut() -> VIt
{
    // First ask the wrapped chooser, then rescale its answer.
    let inner_segments = self.level_size_chooser.size_segments(values, values_len, bits_per_value);
    self.resized(inner_segments)
}

#[inline(always)] fn max_size_segments(&self, max_level_size: usize) -> usize {
/// Returns the maximal number of segments that `size_segments` can return for a level of
/// size `max_level_size` or less.
///
/// `size_segments` resizes the wrapped chooser's result, so the bound is the wrapped
/// chooser's own bound passed through the same `self.resized`.
#[inline] fn max_size_segments(&self, max_level_size: usize) -> usize {
    // Resize the wrapped chooser's segment bound, not the raw number of entries:
    // `max_level_size` is a level size, while `resized` is applied to segment counts above.
    self.resized(self.level_size_chooser.max_size_segments(max_level_size))
}
}
96 changes: 90 additions & 6 deletions csf/src/fp/map/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ pub use conf::MapConf;
use std::hash::Hash;
use bitm::{BitAccess, Rank};

pub use super::level_size_chooser::SimpleLevelSizeChooser;
use super::kvset::KVSet;
pub use super::level_size_chooser::LevelSizeChooser;
use ph::{BuildDefaultSeededHasher, BuildSeededHasher, utils, stats, utils::{ArrayWithRank, read_bits}};
use std::collections::HashMap;
use std::io;
Expand Down Expand Up @@ -63,6 +64,89 @@ impl<S: BuildSeededHasher> Map<S> {
self.get_stats(k, &mut ())
}

/// Constructs [`Map`] for given key-value pairs `kv`, using the build configuration `conf` and reporting statistics with `stats`.
///
/// TODO Panics if the construction fails.
/// Then it is almost certain that the input contains either duplicate keys
/// or keys indistinguishable by any hash function from the family used.
/*fn with_conf_stats<K, LSC, CSB, BS>(
kv: impl KVSet<K>,
mut conf: MapConf<LSC, CSB, S>,
stats: &mut BS
) -> Self
where K: Hash,
LSC: SimpleLevelSizeChooser,
CSB: CollisionSolverBuilder,
BS: stats::BuildStatsCollector
{
if conf.bits_per_value == 0 {
conf.bits_per_value = kv.bits_per_value();
}
let mut level_sizes = Vec::<u64>::new();
let mut arrays = Vec::<Box<[u64]>>::new();
let mut values = Vec::<Box<[u64]>>::new();
let mut input_size = kv.kv_len();
let mut level_nr = 0u32;
while input_size != 0 {
let level_size_segments = conf.level_size_chooser.size_segments(
&values[0..input_size], conf.bits_per_value);
let level_size = level_size_segments * 64;
stats.level(input_size, level_size);
let mut collision_solver = conf.collision_solver.new(level_size_segments, conf.bits_per_value);
for i in 0..input_size {
let a_index = utils::map64_to_64(conf.hash.hash_one(&keys[i], level_nr), level_size as u64) as usize;
if collision_solver.is_under_collision(a_index) { continue }
collision_solver.process_fragment(a_index, values[i], conf.bits_per_value);
}
let current_array = collision_solver.to_collision_array();
let mut i = 0usize;
while i < input_size {
let a_index = utils::map64_to_64(conf.hash.hash_one(&keys[i], level_nr), level_size as u64) as usize;
if current_array.get_bit(a_index) { // no collision
// remove i-th element by replacing it with the last one
input_size -= 1;
keys.swap(i, input_size);
//values.swap_fragments(i, input_size, bits_per_value);
values.swap(i, input_size);
} else { // collision, has to be processed again, at the next level
i += 1;
}
}
arrays.push(current_array);
level_sizes.push(level_size_segments as u64);
level_nr += 1;
}
let (array, out_fragments_num) = ArrayWithRank::build(arrays.concat().into_boxed_slice());
let mut output_value_fragments = CSB::CollisionSolver::construct_value_array(out_fragments_num as usize, conf.bits_per_value);
for input_index in 0..keys.len() {
//let mut result_decoder = self.value_coding.decoder();
let mut array_begin_index = 0usize;
let mut level = 0u32;
loop {
let level_size = (level_sizes[level as usize] as usize) << 6usize;
let i = array_begin_index + utils::map64_to_64(conf.hash.hash_one(&keys[input_index], level), level_size as u64) as usize;
if array.content.get_bit(i) {
CSB::CollisionSolver::set_value(&mut output_value_fragments, array.rank(i), values[input_index], conf.bits_per_value);
// stats.value_on_level(level); // TODO do we need this? we can get average levels from lookups
break;
}
array_begin_index += level_size;
level += 1;
}
}
stats.end(0);
Self {
array,
values: output_value_fragments,
bits_per_value: conf.bits_per_value,
level_sizes: level_sizes.into_boxed_slice(),
hash_builder: conf.hash
}
}*/


/// Build `Map` for given keys -> values map, where:
/// - keys are given directly,
/// - TODO values are given as bit vector with bit_per_value.
Expand All @@ -73,7 +157,7 @@ impl<S: BuildSeededHasher> Map<S> {
stats: &mut BS
) -> Self
where K: Hash,
LSC: SimpleLevelSizeChooser,
LSC: LevelSizeChooser,
CSB: CollisionSolverBuilder,
BS: stats::BuildStatsCollector

Expand All @@ -87,7 +171,7 @@ impl<S: BuildSeededHasher> Map<S> {
let mut level_nr = 0u32;
while input_size != 0 {
let level_size_segments = conf.level_size_chooser.size_segments(
&values[0..input_size], conf.bits_per_value);
|| values[0..input_size].iter().map(|v| *v as u64), input_size, conf.bits_per_value);
let level_size = level_size_segments * 64;
stats.level(input_size, level_size);
let mut collision_solver = conf.collision_solver.new(level_size_segments, conf.bits_per_value);
Expand Down Expand Up @@ -145,7 +229,7 @@ impl<S: BuildSeededHasher> Map<S> {
}

#[inline]
pub fn with_slices_conf<K: Hash, LSC: SimpleLevelSizeChooser, CSB: CollisionSolverBuilder>(
pub fn with_slices_conf<K: Hash, LSC: LevelSizeChooser, CSB: CollisionSolverBuilder>(
keys: &mut [K], values: &mut [u8], /*&mut [u64],*/ conf: MapConf<LSC, CSB, S>) -> Self
{
Self::with_slices_conf_stats(keys, values, conf, &mut ())
Expand Down Expand Up @@ -195,7 +279,7 @@ impl Map {

impl<S: BuildSeededHasher> Map<S> {

pub fn with_map_conf<K: Hash + Clone, H, LSC: SimpleLevelSizeChooser, CSB: CollisionSolverBuilder, BS: stats::BuildStatsCollector>(
pub fn with_map_conf<K: Hash + Clone, H, LSC: LevelSizeChooser, CSB: CollisionSolverBuilder, BS: stats::BuildStatsCollector>(
map: &HashMap<K, u8, H>,
conf: MapConf<LSC, CSB, S>,
stats: &mut BS
Expand Down Expand Up @@ -268,7 +352,7 @@ mod tests {
test_4pairs(MapConf::default());
}

fn test_8pairs<LSC: SimpleLevelSizeChooser>(conf: MapConf<LSC>) {
fn test_8pairs<LSC: LevelSizeChooser>(conf: MapConf<LSC>) {
let fpmap = Map::with_map_conf(&hashmap!(
'a' => 1, 'b' => 2, 'c' => 1, 'd' => 3,
'e' => 4, 'f' => 1, 'g' => 5, 'h' => 6), conf, &mut ());
Expand Down
2 changes: 1 addition & 1 deletion csf/src/fp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ pub use gocmap::{GOCMap, GOCMapConf};
pub use ph::fmph::{GroupSize, SeedSize, TwoToPowerBits, TwoToPowerBitsStatic, Bits, Bits8, GOConf};

pub mod level_size_chooser;
pub use level_size_chooser::{LevelSizeChooser, SimpleLevelSizeChooser, ProportionalLevelSize, OptimalLevelSize, ResizedLevel};
pub use level_size_chooser::{LevelSizeChooser, ProportionalLevelSize, OptimalLevelSize, ResizedLevel};

pub mod collision_solver;
pub use collision_solver::{CollisionSolver, CollisionSolverBuilder, IsLossless, LoMemAcceptEquals};
Expand Down
2 changes: 1 addition & 1 deletion csf_benchmark/src/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ impl PrintParams for ProportionalLevelSize {
}

impl<LSC, CSB, S> CSFBuilder for fp::MapConf<LSC, CSB, S>
where LSC: fp::LevelSizeChooser+fp::SimpleLevelSizeChooser, CSB: fp::CollisionSolverBuilder, S: BuildSeededHasher
where LSC: fp::LevelSizeChooser, CSB: fp::CollisionSolverBuilder, S: BuildSeededHasher
{
type CSF = fp::Map<S>;

Expand Down

0 comments on commit b3b8ce8

Please sign in to comment.