Skip to content

Commit 207e855

Browse files
authored
refactor: change some hashbrown RawTable uses to HashTable (#13514)
* feat: add `HashTableAllocExt`

  This is similar to `RawTableAllocExt` and will help #13256.

* refactor: convert `ArrowBytesMap` to `HashTable`

  For #13256.

* refactor: convert `ArrowBytesViewMap` to `HashTable`

  For #13256.
1 parent 2f150f6 commit 207e855

File tree

4 files changed

+84
-11
lines changed

4 files changed

+84
-11
lines changed

datafusion/common/src/utils/proxy.rs

+72-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717

1818
//! [`VecAllocExt`], [`RawTableAllocExt`] and [`HashTableAllocExt`] to help track memory allocations
1919
20-
use hashbrown::raw::{Bucket, RawTable};
20+
use hashbrown::{
21+
hash_table::HashTable,
22+
raw::{Bucket, RawTable},
23+
};
2124
use std::mem::size_of;
2225

2326
/// Extension trait for [`Vec`] to account for allocations.
@@ -173,3 +176,71 @@ impl<T> RawTableAllocExt for RawTable<T> {
173176
}
174177
}
175178
}
179+
180+
/// Extension trait for hashbrown's [`HashTable`] to account for allocations.
181+
pub trait HashTableAllocExt {
182+
/// Item type.
183+
type T;
184+
185+
/// Insert new element into table and increase
186+
/// `accounting` by any newly allocated bytes.
187+
///
188+
/// If an equal element is already in the table, nothing is inserted and `accounting` is unchanged.
189+
/// Note that allocation counts capacity, not size.
190+
///
191+
/// # Example:
192+
/// ```
193+
/// # use datafusion_common::utils::proxy::HashTableAllocExt;
194+
/// # use hashbrown::hash_table::HashTable;
195+
/// let mut table = HashTable::new();
196+
/// let mut allocated = 0;
197+
/// let hash_fn = |x: &u32| (*x as u64) % 1000;
198+
/// // the first insert reserves room for 16 elements: 16 * size_of::<u32>() = 64 bytes
199+
/// table.insert_accounted(1, hash_fn, &mut allocated);
200+
/// assert_eq!(allocated, 64);
201+
///
202+
/// // insert more values
203+
/// for i in 0..100 { table.insert_accounted(i, hash_fn, &mut allocated); }
204+
/// assert_eq!(allocated, 400);
205+
/// ```
206+
fn insert_accounted(
207+
&mut self,
208+
x: Self::T,
209+
hasher: impl Fn(&Self::T) -> u64,
210+
accounting: &mut usize,
211+
);
212+
}
213+
214+
impl<T> HashTableAllocExt for HashTable<T>
215+
where
216+
T: Eq,
217+
{
218+
type T = T;
219+
220+
fn insert_accounted(
221+
&mut self,
222+
x: Self::T,
223+
hasher: impl Fn(&Self::T) -> u64,
224+
accounting: &mut usize,
225+
) {
226+
let hash = hasher(&x);
227+
228+
// NOTE: `find_entry` does NOT grow!
229+
match self.find_entry(hash, |y| y == &x) {
230+
Ok(_occupied) => {}
231+
Err(_absent) => {
232+
if self.len() == self.capacity() {
233+
// need to request more memory
234+
let bump_elements = self.capacity().max(16);
235+
let bump_size = bump_elements * size_of::<T>();
236+
*accounting = (*accounting).checked_add(bump_size).expect("overflow");
237+
238+
self.reserve(bump_elements, &hasher);
239+
}
240+
241+
// still need to insert the element since first try failed
242+
self.entry(hash, |y| y == &x, hasher).insert(x);
243+
}
244+
}
245+
}
246+
}

datafusion/execution/src/memory_pool/mod.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ use std::{cmp::Ordering, sync::Arc};
2323

2424
mod pool;
2525
pub mod proxy {
26-
pub use datafusion_common::utils::proxy::{RawTableAllocExt, VecAllocExt};
26+
pub use datafusion_common::utils::proxy::{
27+
HashTableAllocExt, RawTableAllocExt, VecAllocExt,
28+
};
2729
}
2830

2931
pub use pool::*;

datafusion/physical-expr-common/src/binary_map.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use arrow::array::{
2828
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
2929
use arrow::datatypes::DataType;
3030
use datafusion_common::hash_utils::create_hashes;
31-
use datafusion_common::utils::proxy::{RawTableAllocExt, VecAllocExt};
31+
use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
3232
use std::any::type_name;
3333
use std::fmt::Debug;
3434
use std::mem::{size_of, swap};
@@ -215,7 +215,7 @@ where
215215
/// Should the output be String or Binary?
216216
output_type: OutputType,
217217
/// Underlying hash set for each distinct value
218-
map: hashbrown::raw::RawTable<Entry<O, V>>,
218+
map: hashbrown::hash_table::HashTable<Entry<O, V>>,
219219
/// Total size of the map in bytes
220220
map_size: usize,
221221
/// In progress arrow `Buffer` containing all values
@@ -246,7 +246,7 @@ where
246246
pub fn new(output_type: OutputType) -> Self {
247247
Self {
248248
output_type,
249-
map: hashbrown::raw::RawTable::with_capacity(INITIAL_MAP_CAPACITY),
249+
map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY),
250250
map_size: 0,
251251
buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY),
252252
offsets: vec![O::default()], // first offset is always 0
@@ -387,7 +387,7 @@ where
387387
let inline = value.iter().fold(0usize, |acc, &x| acc << 8 | x as usize);
388388

389389
// is value already present in the set?
390-
let entry = self.map.get_mut(hash, |header| {
390+
let entry = self.map.find_mut(hash, |header| {
391391
// compare value if hashes match
392392
if header.len != value_len {
393393
return false;
@@ -425,7 +425,7 @@ where
425425
// value is not "small"
426426
else {
427427
// Check if the value is already present in the set
428-
let entry = self.map.get_mut(hash, |header| {
428+
let entry = self.map.find_mut(hash, |header| {
429429
// compare value if hashes match
430430
if header.len != value_len {
431431
return false;

datafusion/physical-expr-common/src/binary_view_map.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use arrow::array::cast::AsArray;
2424
use arrow::array::{Array, ArrayBuilder, ArrayRef, GenericByteViewBuilder};
2525
use arrow::datatypes::{BinaryViewType, ByteViewType, DataType, StringViewType};
2626
use datafusion_common::hash_utils::create_hashes;
27-
use datafusion_common::utils::proxy::{RawTableAllocExt, VecAllocExt};
27+
use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
2828
use std::fmt::Debug;
2929
use std::sync::Arc;
3030

@@ -122,7 +122,7 @@ where
122122
/// Should the output be StringView or BinaryView?
123123
output_type: OutputType,
124124
/// Underlying hash set for each distinct value
125-
map: hashbrown::raw::RawTable<Entry<V>>,
125+
map: hashbrown::hash_table::HashTable<Entry<V>>,
126126
/// Total size of the map in bytes
127127
map_size: usize,
128128

@@ -148,7 +148,7 @@ where
148148
pub fn new(output_type: OutputType) -> Self {
149149
Self {
150150
output_type,
151-
map: hashbrown::raw::RawTable::with_capacity(INITIAL_MAP_CAPACITY),
151+
map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY),
152152
map_size: 0,
153153
builder: GenericByteViewBuilder::new(),
154154
random_state: RandomState::new(),
@@ -274,7 +274,7 @@ where
274274
// get the value as bytes
275275
let value: &[u8] = value.as_ref();
276276

277-
let entry = self.map.get_mut(hash, |header| {
277+
let entry = self.map.find_mut(hash, |header| {
278278
let v = self.builder.get_value(header.view_idx);
279279

280280
if v.len() != value.len() {

0 commit comments

Comments
 (0)