Skip to content

Commit e87fedc

Browse files
authored
improvement: ALP-RD encoder uses fxhash instead of foldhash (#2604)
This change is based on the improvements we saw in dict encoding. The hashmap here is small, but at a minimum we're now consistent across our repo.
1 parent 2b32499 commit e87fedc

File tree

3 files changed

+5
-2
lines changed

3 files changed

+5
-2
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

encodings/alp/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ workspace = true
1919
[dependencies]
2020
itertools = { workspace = true }
2121
num-traits = { workspace = true }
22+
rustc-hash = { workspace = true }
2223
serde = { workspace = true, features = ["derive"] }
2324
vortex-array = { workspace = true }
2425
vortex-buffer = { workspace = true }

encodings/alp/src/alp_rd/mod.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ use std::ops::{Shl, Shr};
1414

1515
use itertools::Itertools;
1616
use num_traits::{Float, One, PrimInt};
17+
use rustc_hash::FxBuildHasher;
1718
use vortex_array::aliases::hash_map::HashMap;
1819
use vortex_array::arrays::PrimitiveArray;
1920
use vortex_array::{Array, IntoArray, ToCanonical};
@@ -350,7 +351,7 @@ fn build_left_parts_dictionary<T: ALPRDFloat>(
350351
sorted_bit_counts.sort_by_key(|(_, count)| count.wrapping_neg());
351352

352353
// Assign the most-frequently occurring left-bits as dictionary codes, up to `dict_size`...
353-
let mut dictionary = HashMap::with_capacity(max_dict_size as _);
354+
let mut dictionary = HashMap::with_capacity_and_hasher(max_dict_size as _, FxBuildHasher);
354355
let mut code = 0u16;
355356
while code < (max_dict_size as _) && (code as usize) < sorted_bit_counts.len() {
356357
let (bits, _) = sorted_bit_counts[code as usize];
@@ -397,7 +398,7 @@ fn estimate_compression_size(
397398
#[derive(Debug, Default)]
398399
struct ALPRDDictionary {
399400
/// Items in the dictionary are bit patterns, along with their 16-bit encoding.
400-
dictionary: HashMap<u16, u16>,
401+
dictionary: HashMap<u16, u16, FxBuildHasher>,
401402
/// The (compressed) left bit width. This is after bit-packing the dictionary codes.
402403
left_bit_width: u8,
403404
/// The right bit width. This is the bit-packed width of each of the "real double" values.

0 commit comments

Comments
 (0)