Skip to content

Commit b07fd5d

Browse files
authored
feat(arrow-json): encode Binary and LargeBinary types as hex when writing JSON (#5785)
* feat: encode Binary and LargeBinary types in JSON as hex. Added the ability for the JSON writer to encode Binary and LargeBinary types as hex. This follows the behaviour for FixedSizeBinary. A test was added to check functionality for both Binary and LargeBinary. * refactor: use ArrayAccessor instead of custom trait. * refactor: use generic in test instead of macro. * refactor: use const DATA_TYPE from GenericBinaryType.
1 parent 3e7e701 commit b07fd5d

File tree

2 files changed

+106
-5
lines changed

2 files changed

+106
-5
lines changed

arrow-json/src/writer.rs

+83
Original file line numberDiff line numberDiff line change
@@ -1565,6 +1565,89 @@ mod tests {
15651565
Ok(())
15661566
}
15671567

1568+
fn binary_encoding_test<O: OffsetSizeTrait>() {
1569+
// set up schema
1570+
let schema = SchemaRef::new(Schema::new(vec![Field::new(
1571+
"bytes",
1572+
GenericBinaryType::<O>::DATA_TYPE,
1573+
true,
1574+
)]));
1575+
1576+
// build record batch:
1577+
let mut builder = GenericByteBuilder::<GenericBinaryType<O>>::new();
1578+
let values = [Some(b"Ned Flanders"), None, Some(b"Troy McClure")];
1579+
for value in values {
1580+
match value {
1581+
Some(v) => builder.append_value(v),
1582+
None => builder.append_null(),
1583+
}
1584+
}
1585+
let array = Arc::new(builder.finish()) as ArrayRef;
1586+
let batch = RecordBatch::try_new(schema, vec![array]).unwrap();
1587+
1588+
// encode and check JSON with explicit nulls:
1589+
{
1590+
let mut buf = Vec::new();
1591+
let json_value: Value = {
1592+
let mut writer = WriterBuilder::new()
1593+
.with_explicit_nulls(true)
1594+
.build::<_, JsonArray>(&mut buf);
1595+
writer.write(&batch).unwrap();
1596+
writer.close().unwrap();
1597+
serde_json::from_slice(&buf).unwrap()
1598+
};
1599+
1600+
assert_eq!(
1601+
json!([
1602+
{
1603+
"bytes": "4e656420466c616e64657273"
1604+
},
1605+
{
1606+
"bytes": null // the explicit null
1607+
},
1608+
{
1609+
"bytes": "54726f79204d63436c757265"
1610+
}
1611+
]),
1612+
json_value,
1613+
);
1614+
}
1615+
1616+
// encode and check JSON with no explicit nulls:
1617+
{
1618+
let mut buf = Vec::new();
1619+
let json_value: Value = {
1620+
// explicit nulls are off by default, so we don't need
1621+
// to set that when creating the writer:
1622+
let mut writer = ArrayWriter::new(&mut buf);
1623+
writer.write(&batch).unwrap();
1624+
writer.close().unwrap();
1625+
serde_json::from_slice(&buf).unwrap()
1626+
};
1627+
1628+
assert_eq!(
1629+
json!([
1630+
{
1631+
"bytes": "4e656420466c616e64657273"
1632+
},
1633+
{}, // empty because nulls are omitted
1634+
{
1635+
"bytes": "54726f79204d63436c757265"
1636+
}
1637+
]),
1638+
json_value
1639+
);
1640+
}
1641+
}
1642+
1643+
/// Runs the generic binary encoding check once per variable-length binary
/// offset width.
#[test]
fn test_writer_binary() {
    // `i32` offsets correspond to `Binary`.
    binary_encoding_test::<i32>();
    // `i64` offsets correspond to `LargeBinary`.
    binary_encoding_test::<i64>();
}
1650+
15681651
#[test]
15691652
fn test_writer_fixed_size_binary() {
15701653
// set up schema:

arrow-json/src/writer/encoder.rs

+23-5
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,17 @@ fn make_encoder_impl<'a>(
105105

106106
DataType::FixedSizeBinary(_) => {
107107
let array = array.as_fixed_size_binary();
108-
(Box::new(FixedSizeBinaryEncoder::new(array)) as _, array.nulls().cloned())
108+
(Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
109+
}
110+
111+
DataType::Binary => {
112+
let array: &BinaryArray = array.as_binary();
113+
(Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
114+
}
115+
116+
DataType::LargeBinary => {
117+
let array: &LargeBinaryArray = array.as_binary();
118+
(Box::new(BinaryEncoder::new(array)) as _, array.nulls().cloned())
109119
}
110120

111121
DataType::Struct(fields) => {
@@ -509,15 +519,23 @@ impl<'a> Encoder for MapEncoder<'a> {
509519
}
510520
}
511521

512-
struct FixedSizeBinaryEncoder<'a>(&'a FixedSizeBinaryArray);
522+
/// Newtype around a binary array accessor.
///
/// It lets the arrow binary types (`Binary`, `LargeBinary` and
/// `FixedSizeBinary`) share a single encoder that writes their values as hex
/// strings in JSON.
struct BinaryEncoder<B>(B);
513525

514-
impl<'a> FixedSizeBinaryEncoder<'a> {
515-
fn new(array: &'a FixedSizeBinaryArray) -> Self {
526+
impl<'a, B> BinaryEncoder<B>
527+
where
528+
B: ArrayAccessor<Item = &'a [u8]>,
529+
{
530+
fn new(array: B) -> Self {
516531
Self(array)
517532
}
518533
}
519534

520-
impl<'a> Encoder for FixedSizeBinaryEncoder<'a> {
535+
impl<'a, B> Encoder for BinaryEncoder<B>
536+
where
537+
B: ArrayAccessor<Item = &'a [u8]>,
538+
{
521539
fn encode(&mut self, idx: usize, out: &mut Vec<u8>) {
522540
out.push(b'"');
523541
for byte in self.0.value(idx) {

0 commit comments

Comments
 (0)