Skip to content

Commit 7a1ed90

Browse files
committed
define VectorizedOperationBuffers to hold buffers used in vectorized operations to make code clearer.
1 parent 406acb4 commit 7a1ed90

File tree

1 file changed

+109
-50
lines changed
  • datafusion/physical-plan/src/aggregates/group_values

1 file changed

+109
-50
lines changed

datafusion/physical-plan/src/aggregates/group_values/column.rs

Lines changed: 109 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -128,21 +128,8 @@ pub struct VectorizedGroupValuesColumn {
128128
/// a specific list in `group_index_lists`.
129129
emit_group_index_list_buffer: Vec<usize>,
130130

131-
/// Similar as `current_indices`, but `remaining_indices`
132-
/// is used to store the rows will be processed in next round.
133-
scalarized_indices: Vec<usize>,
134-
135-
/// The `vectorized_equal_tod` row indices buffer
136-
vectorized_equal_to_row_indices: Vec<usize>,
137-
138-
/// The `vectorized_equal_tod` group indices buffer
139-
vectorized_equal_to_group_indices: Vec<usize>,
140-
141-
/// The `vectorized_equal_tod` result buffer
142-
vectorized_equal_to_results: Vec<bool>,
143-
144-
/// The `vectorized append` row indices buffer
145-
vectorized_append_row_indices: Vec<usize>,
131+
/// Buffers for `vectorized_append` and `vectorized_equal_to`
132+
vectorized_operation_buffers: VectorizedOperationBuffers,
146133

147134
/// The actual group by values, stored column-wise. Compare from
148135
/// the left to right, each column is stored as [`GroupColumn`].
@@ -161,6 +148,38 @@ pub struct VectorizedGroupValuesColumn {
161148
random_state: RandomState,
162149
}
163150

151+
/// Buffers to store intermediate results in `vectorized_append`
152+
/// and `vectorized_equal_to`, for reducing memory allocation
153+
#[derive(Default)]
154+
struct VectorizedOperationBuffers {
155+
/// The `vectorized_append` row indices buffer
156+
append_row_indices: Vec<usize>,
157+
158+
/// The `vectorized_equal_to` row indices buffer
159+
equal_to_row_indices: Vec<usize>,
160+
161+
/// The `vectorized_equal_to` group indices buffer
162+
equal_to_group_indices: Vec<usize>,
163+
164+
/// The `vectorized_equal_to` result buffer
165+
equal_to_results: Vec<bool>,
166+
167+
/// The buffer for storing row indices found not equal to
168+
/// existing groups in `group_values` in `vectorized_equal_to`.
169+
/// We will perform `scalarized_intern` for such rows.
170+
remaining_row_indices: Vec<usize>,
171+
}
172+
173+
impl VectorizedOperationBuffers {
174+
fn clear(&mut self) {
175+
self.append_row_indices.clear();
176+
self.equal_to_row_indices.clear();
177+
self.equal_to_group_indices.clear();
178+
self.equal_to_results.clear();
179+
self.remaining_row_indices.clear();
180+
}
181+
}
182+
164183
impl VectorizedGroupValuesColumn {
165184
/// Create a new instance of `VectorizedGroupValuesColumn` if supported for the specified schema
166185
pub fn try_new(schema: SchemaRef) -> Result<Self> {
@@ -170,15 +189,11 @@ impl VectorizedGroupValuesColumn {
170189
map,
171190
group_index_lists: Vec::new(),
172191
emit_group_index_list_buffer: Vec::new(),
192+
vectorized_operation_buffers: VectorizedOperationBuffers::default(),
173193
map_size: 0,
174194
group_values: vec![],
175195
hashes_buffer: Default::default(),
176196
random_state: Default::default(),
177-
scalarized_indices: Default::default(),
178-
vectorized_equal_to_row_indices: Default::default(),
179-
vectorized_equal_to_group_indices: Default::default(),
180-
vectorized_equal_to_results: Default::default(),
181-
vectorized_append_row_indices: Default::default(),
182197
})
183198
}
184199

@@ -201,9 +216,13 @@ impl VectorizedGroupValuesColumn {
201216
batch_hashes: &[u64],
202217
groups: &mut [usize],
203218
) {
204-
self.vectorized_append_row_indices.clear();
205-
self.vectorized_equal_to_row_indices.clear();
206-
self.vectorized_equal_to_group_indices.clear();
219+
self.vectorized_operation_buffers.append_row_indices.clear();
220+
self.vectorized_operation_buffers
221+
.equal_to_row_indices
222+
.clear();
223+
self.vectorized_operation_buffers
224+
.equal_to_group_indices
225+
.clear();
207226

208227
let mut group_values_len = self.group_values[0].len();
209228
for (row, &target_hash) in batch_hashes.iter().enumerate() {
@@ -227,7 +246,9 @@ impl VectorizedGroupValuesColumn {
227246
);
228247

229248
// Add row index to `append_row_indices`
230-
self.vectorized_append_row_indices.push(row);
249+
self.vectorized_operation_buffers
250+
.append_row_indices
251+
.push(row);
231252

232253
// Set group index to row in `groups`
233254
groups[row] = current_group_idx;
@@ -245,26 +266,41 @@ impl VectorizedGroupValuesColumn {
245266
let list_offset = group_index_view.value() as usize;
246267
let group_index_list = &self.group_index_lists[list_offset];
247268
for &group_index in group_index_list {
248-
self.vectorized_equal_to_row_indices.push(row);
249-
self.vectorized_equal_to_group_indices.push(group_index);
269+
self.vectorized_operation_buffers
270+
.equal_to_row_indices
271+
.push(row);
272+
self.vectorized_operation_buffers
273+
.equal_to_group_indices
274+
.push(group_index);
250275
}
251276
} else {
252277
let group_index = group_index_view.value() as usize;
253-
self.vectorized_equal_to_row_indices.push(row);
254-
self.vectorized_equal_to_group_indices.push(group_index);
278+
self.vectorized_operation_buffers
279+
.equal_to_row_indices
280+
.push(row);
281+
self.vectorized_operation_buffers
282+
.equal_to_group_indices
283+
.push(group_index);
255284
}
256285
}
257286
}
258287

259288
/// Perform `vectorized_append` for `rows` in `append_row_indices`
260289
fn vectorized_append(&mut self, cols: &[ArrayRef]) {
261-
if self.vectorized_append_row_indices.is_empty() {
290+
if self
291+
.vectorized_operation_buffers
292+
.append_row_indices
293+
.is_empty()
294+
{
262295
return;
263296
}
264297

265298
let iter = self.group_values.iter_mut().zip(cols.iter());
266299
for (group_column, col) in iter {
267-
group_column.vectorized_append(col, &self.vectorized_append_row_indices);
300+
group_column.vectorized_append(
301+
col,
302+
&self.vectorized_operation_buffers.append_row_indices,
303+
);
268304
}
269305
}
270306

@@ -283,63 +319,86 @@ impl VectorizedGroupValuesColumn {
283319
/// are very few.
284320
fn vectorized_equal_to(&mut self, cols: &[ArrayRef], groups: &mut [usize]) {
285321
assert_eq!(
286-
self.vectorized_equal_to_group_indices.len(),
287-
self.vectorized_equal_to_row_indices.len()
322+
self.vectorized_operation_buffers
323+
.equal_to_group_indices
324+
.len(),
325+
self.vectorized_operation_buffers.equal_to_row_indices.len()
288326
);
289327

290-
self.scalarized_indices.clear();
328+
self.vectorized_operation_buffers
329+
.remaining_row_indices
330+
.clear();
291331

292-
if self.vectorized_equal_to_group_indices.is_empty() {
332+
if self
333+
.vectorized_operation_buffers
334+
.equal_to_group_indices
335+
.is_empty()
336+
{
293337
return;
294338
}
295339

296340
// 1. Perform `vectorized_equal_to` for `rows` in `equal_to_row_indices`
297341
// and `group_indices` in `equal_to_group_indices`
298-
let mut equal_to_results = mem::take(&mut self.vectorized_equal_to_results);
342+
let mut equal_to_results =
343+
mem::take(&mut self.vectorized_operation_buffers.equal_to_results);
299344
equal_to_results.clear();
300-
equal_to_results.resize(self.vectorized_equal_to_group_indices.len(), true);
345+
equal_to_results.resize(
346+
self.vectorized_operation_buffers
347+
.equal_to_group_indices
348+
.len(),
349+
true,
350+
);
301351

302352
for (col_idx, group_col) in self.group_values.iter().enumerate() {
303353
group_col.vectorized_equal_to(
304-
&self.vectorized_equal_to_group_indices,
354+
&self.vectorized_operation_buffers.equal_to_group_indices,
305355
&cols[col_idx],
306-
&self.vectorized_equal_to_row_indices,
356+
&self.vectorized_operation_buffers.equal_to_row_indices,
307357
&mut equal_to_results,
308358
);
309359
}
310360

311361
// 2. Check `equal_to_results`; if a `row` is not equal to any candidate group, add it
312362
// to `remaining_row_indices`, and perform `scalarized_intern` for it afterwards.
313363
let mut current_row_equal_to_result = false;
314-
for (idx, &row) in self.vectorized_equal_to_row_indices.iter().enumerate() {
364+
for (idx, &row) in self
365+
.vectorized_operation_buffers
366+
.equal_to_row_indices
367+
.iter()
368+
.enumerate()
369+
{
315370
let equal_to_result = equal_to_results[idx];
316371

317372
// Equal to case, set the `group_indices` to `rows` in `groups`
318373
if equal_to_result {
319-
groups[row] = self.vectorized_equal_to_group_indices[idx];
374+
groups[row] =
375+
self.vectorized_operation_buffers.equal_to_group_indices[idx];
320376
}
321377
current_row_equal_to_result |= equal_to_result;
322378

323379
// Look ahead to the next row to check whether all results
324380
// of the current row have been checked
325381
let next_row = self
326-
.vectorized_equal_to_row_indices
382+
.vectorized_operation_buffers
383+
.equal_to_row_indices
327384
.get(idx + 1)
328385
.unwrap_or(&usize::MAX);
329386

330387
// Have checked all results of current row, check the total result
331388
if row != *next_row {
332389
// Not equal to case, add `row` to `remaining_row_indices`
333390
if !current_row_equal_to_result {
334-
self.scalarized_indices.push(row);
391+
self.vectorized_operation_buffers
392+
.remaining_row_indices
393+
.push(row);
335394
}
336395

337396
// Init the total result for checking next row
338397
current_row_equal_to_result = false;
339398
}
340399
}
341400

342-
self.vectorized_equal_to_results = equal_to_results;
401+
self.vectorized_operation_buffers.equal_to_results = equal_to_results;
343402
}
344403

345404
/// It is possible that some `input rows` have the same
@@ -384,13 +443,17 @@ impl VectorizedGroupValuesColumn {
384443
batch_hashes: &[u64],
385444
groups: &mut [usize],
386445
) {
387-
if self.scalarized_indices.is_empty() {
446+
if self
447+
.vectorized_operation_buffers
448+
.remaining_row_indices
449+
.is_empty()
450+
{
388451
return;
389452
}
390453

391454
let mut map = mem::take(&mut self.map);
392455

393-
for &row in &self.scalarized_indices {
456+
for &row in &self.vectorized_operation_buffers.remaining_row_indices {
394457
let target_hash = batch_hashes[row];
395458
let entry = map.get_mut(target_hash, |(exist_hash, _)| {
396459
// Somewhat surprisingly, this closure can be called even if the
@@ -781,11 +844,7 @@ impl GroupValues for VectorizedGroupValuesColumn {
781844
self.hashes_buffer.shrink_to(count);
782845
self.group_index_lists.clear();
783846
self.emit_group_index_list_buffer.clear();
784-
self.scalarized_indices.clear();
785-
self.vectorized_append_row_indices.clear();
786-
self.vectorized_equal_to_row_indices.clear();
787-
self.vectorized_equal_to_group_indices.clear();
788-
self.vectorized_equal_to_results.clear();
847+
self.vectorized_operation_buffers.clear();
789848
}
790849
}
791850

0 commit comments

Comments
 (0)