@@ -458,7 +458,7 @@ impl Codec {
458
458
let nulls = converter. convert_columns ( & [ null_array] ) ?;
459
459
460
460
let owned = OwnedRow {
461
- data : nulls. buffer ,
461
+ data : nulls. buffer . into ( ) ,
462
462
config : nulls. config ,
463
463
} ;
464
464
Ok ( Self :: DictionaryValues ( converter, owned) )
@@ -496,7 +496,7 @@ impl Codec {
496
496
497
497
let nulls = converter. convert_columns ( & nulls) ?;
498
498
let owned = OwnedRow {
499
- data : nulls. buffer ,
499
+ data : nulls. buffer . into ( ) ,
500
500
config : nulls. config ,
501
501
} ;
502
502
@@ -715,7 +715,13 @@ impl RowConverter {
715
715
columns. iter ( ) . zip ( self . fields . iter ( ) ) . zip ( encoders)
716
716
{
717
717
// We encode a column at a time to minimise dispatch overheads
718
- encode_column ( & mut rows, column. as_ref ( ) , field. options , & encoder)
718
+ encode_column (
719
+ & mut rows. buffer ,
720
+ & mut rows. offsets ,
721
+ column. as_ref ( ) ,
722
+ field. options ,
723
+ & encoder,
724
+ )
719
725
}
720
726
721
727
if cfg ! ( debug_assertions) {
@@ -756,6 +762,48 @@ impl RowConverter {
756
762
unsafe { self . convert_raw ( & mut rows, validate_utf8) }
757
763
}
758
764
765
+ /// Returns an empty [`Rows`] with capacity for `row_capacity` rows with
766
+ /// a total length of `data_capacity`
767
+ ///
768
+ /// This can be used to buffer a selection of [`Row`]
769
+ ///
770
+ /// ```
771
+ /// # use std::sync::Arc;
772
+ /// # use std::collections::HashSet;
773
+ /// # use arrow_array::cast::AsArray;
774
+ /// # use arrow_array::StringArray;
775
+ /// # use arrow_row::{Row, RowConverter, SortField};
776
+ /// # use arrow_schema::DataType;
777
+ /// #
778
+ /// let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
779
+ /// let array = StringArray::from(vec!["hello", "world", "a", "a", "hello"]);
780
+ ///
781
+ /// // Convert to row format and deduplicate
782
+ /// let converted = converter.convert_columns(&[Arc::new(array)]).unwrap();
783
+ /// let mut distinct_rows = converter.empty_rows(3, 100);
784
+ /// let mut dedup: HashSet<Row> = HashSet::with_capacity(3);
785
+ /// converted.iter().filter(|row| dedup.insert(*row)).for_each(|row| distinct_rows.push(row));
786
+ ///
787
+ /// // Note: we could skip buffering and feed the filtered iterator directly
788
+ /// // into convert_rows, this is done for demonstration purposes only
789
+ /// let distinct = converter.convert_rows(&distinct_rows).unwrap();
790
+ /// let values: Vec<_> = distinct[0].as_string::<i32>().iter().map(Option::unwrap).collect();
791
+ /// assert_eq!(&values, &["hello", "world", "a"]);
792
+ /// ```
793
+ pub fn empty_rows ( & self , row_capacity : usize , data_capacity : usize ) -> Rows {
794
+ let mut offsets = Vec :: with_capacity ( row_capacity. saturating_add ( 1 ) ) ;
795
+ offsets. push ( 0 ) ;
796
+
797
+ Rows {
798
+ offsets,
799
+ buffer : Vec :: with_capacity ( data_capacity) ,
800
+ config : RowConfig {
801
+ fields : self . fields . clone ( ) ,
802
+ validate_utf8 : false ,
803
+ } ,
804
+ }
805
+ }
806
+
759
807
/// Convert raw bytes into [`ArrayRef`]
760
808
///
761
809
/// # Safety
@@ -832,14 +880,25 @@ struct RowConfig {
832
880
#[ derive( Debug ) ]
833
881
pub struct Rows {
834
882
/// Underlying row bytes
835
- buffer : Box < [ u8 ] > ,
883
+ buffer : Vec < u8 > ,
836
884
/// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]`
837
- offsets : Box < [ usize ] > ,
885
+ offsets : Vec < usize > ,
838
886
/// The config for these rows
839
887
config : RowConfig ,
840
888
}
841
889
842
890
impl Rows {
891
+ /// Append a [`Row`] to this [`Rows`]
892
+ pub fn push ( & mut self , row : Row < ' _ > ) {
893
+ assert ! (
894
+ Arc :: ptr_eq( & row. config. fields, & self . config. fields) ,
895
+ "row was not produced by this RowConverter"
896
+ ) ;
897
+ self . config . validate_utf8 |= row. config . validate_utf8 ;
898
+ self . buffer . extend_from_slice ( row. data ) ;
899
+ self . offsets . push ( self . buffer . len ( ) )
900
+ }
901
+
843
902
pub fn row ( & self , row : usize ) -> Row < ' _ > {
844
903
let end = self . offsets [ row + 1 ] ;
845
904
let start = self . offsets [ row] ;
@@ -1171,66 +1230,67 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) ->
1171
1230
let buffer = vec ! [ 0_u8 ; cur_offset] ;
1172
1231
1173
1232
Rows {
1174
- buffer : buffer . into ( ) ,
1175
- offsets : offsets . into ( ) ,
1233
+ buffer,
1234
+ offsets,
1176
1235
config,
1177
1236
}
1178
1237
}
1179
1238
1180
1239
/// Encodes a column into the provided row `data` buffer, advancing the per-row `offsets` as it progresses
1181
1240
fn encode_column (
1182
- out : & mut Rows ,
1241
+ data : & mut [ u8 ] ,
1242
+ offsets : & mut [ usize ] ,
1183
1243
column : & dyn Array ,
1184
1244
opts : SortOptions ,
1185
1245
encoder : & Encoder < ' _ > ,
1186
1246
) {
1187
1247
match encoder {
1188
1248
Encoder :: Stateless => {
1189
1249
downcast_primitive_array ! {
1190
- column => fixed:: encode( out , column, opts) ,
1250
+ column => fixed:: encode( data , offsets , column, opts) ,
1191
1251
DataType :: Null => { }
1192
- DataType :: Boolean => fixed:: encode( out , column. as_boolean( ) , opts) ,
1252
+ DataType :: Boolean => fixed:: encode( data , offsets , column. as_boolean( ) , opts) ,
1193
1253
DataType :: Binary => {
1194
- variable:: encode( out , as_generic_binary_array:: <i32 >( column) . iter( ) , opts)
1254
+ variable:: encode( data , offsets , as_generic_binary_array:: <i32 >( column) . iter( ) , opts)
1195
1255
}
1196
1256
DataType :: LargeBinary => {
1197
- variable:: encode( out , as_generic_binary_array:: <i64 >( column) . iter( ) , opts)
1257
+ variable:: encode( data , offsets , as_generic_binary_array:: <i64 >( column) . iter( ) , opts)
1198
1258
}
1199
1259
DataType :: Utf8 => variable:: encode(
1200
- out ,
1260
+ data , offsets ,
1201
1261
column. as_string:: <i32 >( ) . iter( ) . map( |x| x. map( |x| x. as_bytes( ) ) ) ,
1202
1262
opts,
1203
1263
) ,
1204
1264
DataType :: LargeUtf8 => variable:: encode(
1205
- out ,
1265
+ data , offsets ,
1206
1266
column. as_string:: <i64 >( )
1207
1267
. iter( )
1208
1268
. map( |x| x. map( |x| x. as_bytes( ) ) ) ,
1209
1269
opts,
1210
1270
) ,
1211
1271
DataType :: FixedSizeBinary ( _) => {
1212
1272
let array = column. as_any( ) . downcast_ref( ) . unwrap( ) ;
1213
- fixed:: encode_fixed_size_binary( out , array, opts)
1273
+ fixed:: encode_fixed_size_binary( data , offsets , array, opts)
1214
1274
}
1215
1275
_ => unreachable!( ) ,
1216
1276
}
1217
1277
}
1218
1278
Encoder :: Dictionary ( dict) => {
1219
1279
downcast_dictionary_array ! {
1220
- column => encode_dictionary( out , column, dict, opts) ,
1280
+ column => encode_dictionary( data , offsets , column, dict, opts) ,
1221
1281
_ => unreachable!( )
1222
1282
}
1223
1283
}
1224
1284
Encoder :: DictionaryValues ( values, nulls) => {
1225
1285
downcast_dictionary_array ! {
1226
- column => encode_dictionary_values( out , column, values, nulls) ,
1286
+ column => encode_dictionary_values( data , offsets , column, values, nulls) ,
1227
1287
_ => unreachable!( )
1228
1288
}
1229
1289
}
1230
1290
Encoder :: Struct ( rows, null) => {
1231
1291
let array = as_struct_array ( column) ;
1232
1292
let null_sentinel = null_sentinel ( opts) ;
1233
- out . offsets
1293
+ offsets
1234
1294
. iter_mut ( )
1235
1295
. skip ( 1 )
1236
1296
. enumerate ( )
@@ -1240,15 +1300,17 @@ fn encode_column(
1240
1300
false => ( * null, null_sentinel) ,
1241
1301
} ;
1242
1302
let end_offset = * offset + 1 + row. as_ref ( ) . len ( ) ;
1243
- out . buffer [ * offset] = sentinel;
1244
- out . buffer [ * offset + 1 ..end_offset] . copy_from_slice ( row. as_ref ( ) ) ;
1303
+ data [ * offset] = sentinel;
1304
+ data [ * offset + 1 ..end_offset] . copy_from_slice ( row. as_ref ( ) ) ;
1245
1305
* offset = end_offset;
1246
1306
} )
1247
1307
}
1248
1308
Encoder :: List ( rows) => match column. data_type ( ) {
1249
- DataType :: List ( _) => list:: encode ( out, rows, opts, as_list_array ( column) ) ,
1309
+ DataType :: List ( _) => {
1310
+ list:: encode ( data, offsets, rows, opts, as_list_array ( column) )
1311
+ }
1250
1312
DataType :: LargeList ( _) => {
1251
- list:: encode ( out , rows, opts, as_large_list_array ( column) )
1313
+ list:: encode ( data , offsets , rows, opts, as_large_list_array ( column) )
1252
1314
}
1253
1315
_ => unreachable ! ( ) ,
1254
1316
} ,
@@ -1384,9 +1446,9 @@ mod tests {
1384
1446
. unwrap ( ) ;
1385
1447
let rows = converter. convert_columns ( & cols) . unwrap ( ) ;
1386
1448
1387
- assert_eq ! ( rows. offsets. as_ref ( ) , & [ 0 , 8 , 16 , 24 , 32 , 40 , 48 , 56 ] ) ;
1449
+ assert_eq ! ( rows. offsets, & [ 0 , 8 , 16 , 24 , 32 , 40 , 48 , 56 ] ) ;
1388
1450
assert_eq ! (
1389
- rows. buffer. as_ref ( ) ,
1451
+ rows. buffer,
1390
1452
& [
1391
1453
1 , 128 , 1 , //
1392
1454
1 , 191 , 166 , 102 , 102 , //
0 commit comments