Skip to content

Commit 80c2d78

Browse files
committed
Run End Encoding DataType
1 parent ddba53b commit 80c2d78

File tree

10 files changed

+59
-31
lines changed

10 files changed

+59
-31
lines changed

arrow-array/src/array/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,7 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
737737
new_null_sized_decimal(data_type, length, std::mem::size_of::<i128>())
738738
}
739739
DataType::Decimal256(_, _) => new_null_sized_decimal(data_type, length, 32),
740+
DataType::RunEndEncodedType(_, _) => todo!(),
740741
}
741742
}
742743

arrow-data/src/data.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,9 +198,9 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
198198
],
199199
_ => unreachable!(),
200200
},
201-
DataType::FixedSizeList(_, _) | DataType::Struct(_) => {
202-
[empty_buffer, MutableBuffer::new(0)]
203-
}
201+
DataType::FixedSizeList(_, _)
202+
| DataType::Struct(_)
203+
| DataType::RunEndEncodedType(_, _) => [empty_buffer, MutableBuffer::new(0)],
204204
DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [
205205
MutableBuffer::new(capacity * mem::size_of::<u8>()),
206206
empty_buffer,
@@ -654,6 +654,12 @@ impl ArrayData {
654654
DataType::Dictionary(_, data_type) => {
655655
vec![Self::new_empty(data_type)]
656656
}
657+
DataType::RunEndEncodedType(run_ends, values) => {
658+
vec![
659+
Self::new_empty(run_ends.data_type()),
660+
Self::new_empty(values.data_type()),
661+
]
662+
}
657663
};
658664

659665
// Data was constructed correctly above
@@ -1508,6 +1514,7 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
15081514
// same as ListType
15091515
DataTypeLayout::new_fixed_width(size_of::<i32>())
15101516
}
1517+
DataType::RunEndEncodedType(_, _) => DataTypeLayout::new_empty(), // all in child data,
15111518
}
15121519
}
15131520

arrow-data/src/equal/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ fn equal_values(
137137
},
138138
DataType::Float16 => primitive_equal::<f16>(lhs, rhs, lhs_start, rhs_start, len),
139139
DataType::Map(_, _) => list_equal::<i32>(lhs, rhs, lhs_start, rhs_start, len),
140+
DataType::RunEndEncodedType(_, _) => todo!(),
140141
}
141142
}
142143

arrow-data/src/transform/mod.rs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ fn build_extend(array: &ArrayData) -> Extend {
230230
UnionMode::Sparse => union::build_extend_sparse(array),
231231
UnionMode::Dense => union::build_extend_dense(array),
232232
},
233+
DataType::RunEndEncodedType(_, _) => todo!(),
233234
}
234235
}
235236

@@ -281,6 +282,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
281282
UnionMode::Sparse => union::extend_nulls_sparse,
282283
UnionMode::Dense => union::extend_nulls_dense,
283284
},
285+
DataType::RunEndEncodedType(_, _) => todo!(),
284286
})
285287
}
286288

@@ -473,6 +475,20 @@ impl<'a> MutableArrayData<'a> {
473475
})
474476
.collect::<Vec<_>>(),
475477
},
478+
DataType::RunEndEncodedType(_, _) => {
479+
let run_ends_child = arrays
480+
.iter()
481+
.map(|array| &array.child_data()[0])
482+
.collect::<Vec<_>>();
483+
let value_child = arrays
484+
.iter()
485+
.map(|array| &array.child_data()[1])
486+
.collect::<Vec<_>>();
487+
vec![
488+
MutableArrayData::new(run_ends_child, use_nulls, array_capacity),
489+
MutableArrayData::new(value_child, use_nulls, array_capacity),
490+
]
491+
}
476492
DataType::FixedSizeList(_, _) => {
477493
let childs = arrays
478494
.iter()

arrow-integration-test/src/datatype.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,7 @@ pub fn data_type_to_json(data_type: &DataType) -> serde_json::Value {
357357
DataType::Map(_, keys_sorted) => {
358358
json!({"name": "map", "keysSorted": keys_sorted})
359359
}
360+
DataType::RunEndEncodedType(_, _) => todo!(),
360361
}
361362
}
362363

arrow-ipc/src/convert.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,7 @@ pub(crate) fn get_fb_field_type<'a>(
772772
children: Some(fbb.create_vector(&children[..])),
773773
}
774774
}
775+
RunEndEncodedType(_, _) => todo!(),
775776
}
776777
}
777778

arrow-schema/src/datatype.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,19 @@ pub enum DataType {
242242
/// child fields may be respectively "entries", "key", and "value", but this is
243243
/// not enforced.
244244
Map(Box<Field>, bool),
245+
246+
/// A run-end encoding (REE) array is a variation of run-length encoding (RLE). These
247+
/// encodings are well-suited for representing data containing sequences of the
248+
/// same value, called runs. Each run is represented as a value and an integer giving
249+
/// the index in the array where the run ends.
250+
///
251+
/// A run-end encoded array has no buffers by itself, but has two child arrays. The
252+
/// first child array, called the run ends array, holds either 16, 32, or 64-bit
253+
/// signed integers. The actual values of each run are held in the second child array.
254+
///
255+
/// These child arrays are prescribed the standard names of "run_ends" and "values"
256+
/// respectively.
257+
RunEndEncodedType(Box<Field>, Box<Field>),
245258
}
246259

247260
/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds.
@@ -438,6 +451,10 @@ impl DataType {
438451
+ (std::mem::size_of::<Field>() * fields.capacity())
439452
}
440453
DataType::Dictionary(dt1, dt2) => dt1.size() + dt2.size(),
454+
DataType::RunEndEncodedType(run_ends, values) => {
455+
run_ends.size() - std::mem::size_of_val(run_ends) + values.size()
456+
- std::mem::size_of_val(values)
457+
}
441458
}
442459
}
443460
}

arrow-schema/src/field.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,7 @@ impl Field {
410410
| DataType::List(_)
411411
| DataType::Map(_, _)
412412
| DataType::Dictionary(_, _)
413+
| DataType::RunEndEncodedType(_, _)
413414
| DataType::FixedSizeList(_, _)
414415
| DataType::FixedSizeBinary(_)
415416
| DataType::Utf8

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ fn write_leaves<W: Write>(
360360
ArrowDataType::Float16 => Err(ParquetError::ArrowError(
361361
"Float16 arrays not supported".to_string(),
362362
)),
363-
ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) => {
363+
ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_, _, _) | ArrowDataType::RunEndEncodedType(_, _) => {
364364
Err(ParquetError::NYI(
365365
format!(
366366
"Attempting to write an Arrow type {:?} to parquet that is not yet implemented",

parquet/src/arrow/schema/mod.rs

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,9 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
507507
let dict_field = Field::new(name, *value.clone(), field.is_nullable());
508508
arrow_to_parquet_type(&dict_field)
509509
}
510+
DataType::RunEndEncodedType(_, _) => Err(arrow_err!(
511+
"Converting RunEndEncodedType to parquet not supported",
512+
)),
510513
}
511514
}
512515

@@ -640,7 +643,7 @@ mod tests {
640643
ProjectionMask::all(),
641644
None,
642645
)
643-
.unwrap();
646+
.unwrap();
644647
assert_eq!(&arrow_fields, converted_arrow_schema.fields());
645648
}
646649

@@ -1342,20 +1345,9 @@ mod tests {
13421345
))),
13431346
false,
13441347
),
1345-
Field::new(
1346-
"decimal_int32",
1347-
DataType::Decimal128(8, 2),
1348-
false,
1349-
),
1350-
Field::new(
1351-
"decimal_int64",
1352-
DataType::Decimal128(16, 2),
1353-
false,
1354-
),
1355-
Field::new(
1356-
"decimal_fix_length",
1357-
DataType::Decimal128(30, 2),
1358-
false, ),
1348+
Field::new("decimal_int32", DataType::Decimal128(8, 2), false),
1349+
Field::new("decimal_int64", DataType::Decimal128(16, 2), false),
1350+
Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false),
13591351
];
13601352

13611353
assert_eq!(arrow_fields, converted_arrow_fields);
@@ -1491,18 +1483,9 @@ mod tests {
14911483
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
14921484
false,
14931485
),
1494-
Field::new(
1495-
"decimal_int32",
1496-
DataType::Decimal128(8, 2),
1497-
false),
1498-
Field::new("decimal_int64",
1499-
DataType::Decimal128(16, 2),
1500-
false),
1501-
Field::new(
1502-
"decimal_fix_length",
1503-
DataType::Decimal128(30, 2),
1504-
false,
1505-
),
1486+
Field::new("decimal_int32", DataType::Decimal128(8, 2), false),
1487+
Field::new("decimal_int64", DataType::Decimal128(16, 2), false),
1488+
Field::new("decimal_fix_length", DataType::Decimal128(30, 2), false),
15061489
];
15071490
let arrow_schema = Schema::new(arrow_fields);
15081491
let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema).unwrap();

0 commit comments

Comments
 (0)