15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- use crate :: aggregates:: group_values:: GroupValues ;
18
+ use std:: mem;
19
+
20
+ use crate :: aggregates:: group_values:: { GroupBlock , GroupIdx , GroupValues } ;
19
21
use ahash:: RandomState ;
20
22
use arrow:: compute:: cast;
21
23
use arrow:: record_batch:: RecordBatch ;
@@ -44,7 +46,7 @@ pub struct GroupValuesRows {
44
46
///
45
47
/// keys: u64 hashes of the GroupValue
46
48
/// values: (hash, group_index)
47
- map : RawTable < ( u64 , usize ) > ,
49
+ map : RawTable < ( u64 , GroupIdx ) > ,
48
50
49
51
/// The size of `map` in bytes
50
52
map_size : usize ,
@@ -57,7 +59,7 @@ pub struct GroupValuesRows {
57
59
/// important for multi-column group keys.
58
60
///
59
61
/// [`Row`]: arrow::row::Row
60
- group_values : Option < Rows > ,
62
+ group_values_blocks : Vec < Rows > ,
61
63
62
64
/// reused buffer to store hashes
63
65
hashes_buffer : Vec < u64 > ,
@@ -67,10 +69,14 @@ pub struct GroupValuesRows {
67
69
68
70
/// Random state for creating hashes
69
71
random_state : RandomState ,
72
+
73
+ max_block_size : usize ,
74
+
75
+ cur_block_id : u16 ,
70
76
}
71
77
72
78
impl GroupValuesRows {
73
- pub fn try_new ( schema : SchemaRef ) -> Result < Self > {
79
+ pub fn try_new ( schema : SchemaRef , page_size : usize ) -> Result < Self > {
74
80
let row_converter = RowConverter :: new (
75
81
schema
76
82
. fields ( )
@@ -90,27 +96,31 @@ impl GroupValuesRows {
90
96
row_converter,
91
97
map,
92
98
map_size : 0 ,
93
- group_values : None ,
99
+ group_values_blocks : Vec :: new ( ) ,
94
100
hashes_buffer : Default :: default ( ) ,
95
101
rows_buffer,
96
102
random_state : Default :: default ( ) ,
103
+ max_block_size : page_size,
97
104
} )
98
105
}
99
106
}
100
107
101
108
impl GroupValues for GroupValuesRows {
102
- fn intern ( & mut self , cols : & [ ArrayRef ] , groups : & mut Vec < usize > ) -> Result < ( ) > {
109
+ fn intern ( & mut self , cols : & [ ArrayRef ] , groups : & mut Vec < GroupIdx > ) -> Result < ( ) > {
103
110
// Convert the group keys into the row format
104
111
let group_rows = & mut self . rows_buffer ;
105
112
group_rows. clear ( ) ;
106
113
self . row_converter . append ( group_rows, cols) ?;
107
114
let n_rows = group_rows. num_rows ( ) ;
108
115
109
- let mut group_values = match self . group_values . take ( ) {
110
- Some ( group_values) => group_values,
111
- None => self . row_converter . empty_rows ( 0 , 0 ) ,
116
+ if self . group_values_blocks . is_empty ( ) {
117
+ // TODO: compute the required capacity and use it to initialize the block.
118
+ let block = self . row_converter . empty_rows ( 0 , 0 ) ;
119
+ self . group_values_blocks . push ( block) ;
112
120
} ;
113
121
122
+ let mut group_values_blocks = mem:: take ( & mut self . group_values_blocks ) ;
123
+
114
124
// tracks to which group each of the input rows belongs
115
125
groups. clear ( ) ;
116
126
@@ -126,21 +136,38 @@ impl GroupValues for GroupValuesRows {
126
136
// hash doesn't match, so check the hash first with an integer
127
137
// comparison to avoid the more expensive comparison with
128
138
// group value. https://github.com/apache/datafusion/pull/11718
129
- target_hash == * exist_hash
130
- // verify that the group that we are inserting with hash is
131
- // actually the same key value as the group in
132
- // existing_idx (aka group_values @ row)
133
- && group_rows. row ( row) == group_values. row ( * group_idx)
139
+ if target_hash != * exist_hash {
140
+ return false ;
141
+ }
142
+
143
+ // verify that the group that we are inserting with hash is
144
+ // actually the same key value as the group in
145
+ // `group_idx` (aka group_values_blocks[block_id] @ block_offset)
146
+ let block_id = group_idx. block_id ( ) ;
147
+ let block_offset = group_idx. block_offset ( ) ;
148
+ let group_value = group_values_blocks[ block_id] . row ( block_offset) ;
149
+ group_rows. row ( row) == group_value
134
150
} ) ;
135
151
136
152
let group_idx = match entry {
137
153
// Existing group_index for this group value
138
154
Some ( ( _hash, group_idx) ) => * group_idx,
139
155
// 1.2 Need to create new entry for the group
140
156
None => {
157
+ // If the current block has reached the size limit, switch to a new block.
158
+ let block_size = group_values_blocks. last ( ) . unwrap ( ) . num_rows ( ) ;
159
+ if block_size == self . max_block_size {
160
+ self . cur_block_id += 1 ;
161
+ // TODO: compute the required capacity and use it to initialize the block.
162
+ let block = self . row_converter . empty_rows ( 0 , 0 ) ;
163
+ self . group_values_blocks . push ( block) ;
164
+ }
165
+
141
166
// Add new entry to aggr_state and save newly created index
142
- let group_idx = group_values. num_rows ( ) ;
143
- group_values. push ( group_rows. row ( row) ) ;
167
+ let cur_group_values = self . group_values_blocks . last_mut ( ) . unwrap ( ) ;
168
+ let block_offset = group_values. num_rows ( ) ;
169
+ let group_idx = GroupIdx :: new ( self . cur_block_id , block_offset) ;
170
+ cur_group_values. push ( group_rows. row ( row) ) ;
144
171
145
172
// for hasher function, use precomputed hash value
146
173
self . map . insert_accounted (
@@ -154,13 +181,13 @@ impl GroupValues for GroupValuesRows {
154
181
groups. push ( group_idx) ;
155
182
}
156
183
157
- self . group_values = Some ( group_values ) ;
184
+ self . group_values_blocks = group_values_blocks ;
158
185
159
186
Ok ( ( ) )
160
187
}
161
188
162
189
fn size ( & self ) -> usize {
163
- let group_values_size = self . group_values . as_ref ( ) . map ( |v| v. size ( ) ) . unwrap_or ( 0 ) ;
190
+ let group_values_size = self . group_values_blocks . as_ref ( ) . map ( |v| v. size ( ) ) . unwrap_or ( 0 ) ;
164
191
self . row_converter . size ( )
165
192
+ group_values_size
166
193
+ self . map_size
@@ -173,15 +200,15 @@ impl GroupValues for GroupValuesRows {
173
200
}
174
201
175
202
fn len ( & self ) -> usize {
176
- self . group_values
203
+ self . group_values_blocks
177
204
. as_ref ( )
178
205
. map ( |group_values| group_values. num_rows ( ) )
179
206
. unwrap_or ( 0 )
180
207
}
181
208
182
- fn emit ( & mut self , emit_to : EmitTo ) -> Result < Vec < ArrayRef > > {
209
+ fn emit ( & mut self , emit_to : EmitTo ) -> Result < Vec < GroupBlock > > {
183
210
let mut group_values = self
184
- . group_values
211
+ . group_values_blocks
185
212
. take ( )
186
213
. expect ( "Can not emit from empty rows" ) ;
187
214
@@ -232,13 +259,13 @@ impl GroupValues for GroupValuesRows {
232
259
}
233
260
}
234
261
235
- self . group_values = Some ( group_values) ;
262
+ self . group_values_blocks = Some ( group_values) ;
236
263
Ok ( output)
237
264
}
238
265
239
266
fn clear_shrink ( & mut self , batch : & RecordBatch ) {
240
267
let count = batch. num_rows ( ) ;
241
- self . group_values = self . group_values . take ( ) . map ( |mut rows| {
268
+ self . group_values_blocks = self . group_values_blocks . take ( ) . map ( |mut rows| {
242
269
rows. clear ( ) ;
243
270
rows
244
271
} ) ;
0 commit comments