20
20
//! This JSON writer converts Arrow [`RecordBatch`]es into arrays of
21
21
//! JSON objects or JSON formatted byte streams.
22
22
//!
23
- //! ## Writing JSON Objects
24
- //!
25
- //! To serialize [`RecordBatch`]es into array of
26
- //! [JSON](https://docs.serde.rs/serde_json/) objects, use
27
- //! [`record_batches_to_json_rows`]:
28
- //!
29
- //! ```
30
- //! # use std::sync::Arc;
31
- //! # use arrow_array::{Int32Array, RecordBatch};
32
- //! # use arrow_schema::{DataType, Field, Schema};
33
- //!
34
- //! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
35
- //! let a = Int32Array::from(vec![1, 2, 3]);
36
- //! let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();
37
- //!
38
- //! let json_rows = arrow_json::writer::record_batches_to_json_rows(&[&batch]).unwrap();
39
- //! assert_eq!(
40
- //! serde_json::Value::Object(json_rows[1].clone()),
41
- //! serde_json::json!({"a": 2}),
42
- //! );
43
- //! ```
44
- //!
45
23
//! ## Writing JSON formatted byte streams
46
24
//!
47
25
//! To serialize [`RecordBatch`]es into line-delimited JSON bytes, use
97
75
//! In order to explicitly write null values for keys, configure a custom [`Writer`] by
98
76
//! using a [`WriterBuilder`] to construct a [`Writer`].
99
77
78
+ mod encoder;
79
+
100
80
use std:: iter;
101
81
use std:: { fmt:: Debug , io:: Write } ;
102
82
@@ -109,7 +89,9 @@ use arrow_array::types::*;
109
89
use arrow_array:: * ;
110
90
use arrow_schema:: * ;
111
91
92
+ use crate :: writer:: encoder:: EncoderOptions ;
112
93
use arrow_cast:: display:: { ArrayFormatter , FormatOptions } ;
94
+ use encoder:: make_encoder;
113
95
114
96
fn primitive_array_to_json < T > ( array : & dyn Array ) -> Result < Vec < Value > , ArrowError >
115
97
where
@@ -481,6 +463,7 @@ fn set_column_for_json_rows(
481
463
482
464
/// Converts an arrow [`RecordBatch`] into a `Vec` of Serde JSON
483
465
/// [`JsonMap`]s (objects)
466
+ #[ deprecated( note = "Use Writer" ) ]
484
467
pub fn record_batches_to_json_rows (
485
468
batches : & [ & RecordBatch ] ,
486
469
) -> Result < Vec < JsonMap < String , Value > > , ArrowError > {
@@ -597,11 +580,7 @@ pub type ArrayWriter<W> = Writer<W, JsonArray>;
597
580
598
581
/// JSON writer builder.
599
582
#[ derive( Debug , Clone , Default ) ]
600
- pub struct WriterBuilder {
601
- /// Controls whether null values should be written explicitly for keys
602
- /// in objects, or whether the key should be omitted entirely.
603
- explicit_nulls : bool ,
604
- }
583
+ pub struct WriterBuilder ( EncoderOptions ) ;
605
584
606
585
impl WriterBuilder {
607
586
/// Create a new builder for configuring JSON writing options.
@@ -629,7 +608,7 @@ impl WriterBuilder {
629
608
630
609
/// Returns `true` if this writer is configured to keep keys with null values.
631
610
pub fn explicit_nulls ( & self ) -> bool {
632
- self . explicit_nulls
611
+ self . 0 . explicit_nulls
633
612
}
634
613
635
614
/// Set whether to keep keys with null values, or to omit writing them.
@@ -654,7 +633,7 @@ impl WriterBuilder {
654
633
///
655
634
/// Default is to skip nulls (set to `false`).
656
635
pub fn with_explicit_nulls ( mut self , explicit_nulls : bool ) -> Self {
657
- self . explicit_nulls = explicit_nulls;
636
+ self . 0 . explicit_nulls = explicit_nulls;
658
637
self
659
638
}
660
639
@@ -669,7 +648,7 @@ impl WriterBuilder {
669
648
started : false ,
670
649
finished : false ,
671
650
format : F :: default ( ) ,
672
- explicit_nulls : self . explicit_nulls ,
651
+ options : self . 0 ,
673
652
}
674
653
}
675
654
}
@@ -703,7 +682,7 @@ where
703
682
format : F ,
704
683
705
684
/// Encoder options, including whether keys with null values should be written or skipped
706
- explicit_nulls : bool ,
685
+ options : EncoderOptions ,
707
686
}
708
687
709
688
impl < W , F > Writer < W , F >
@@ -718,11 +697,12 @@ where
718
697
started : false ,
719
698
finished : false ,
720
699
format : F :: default ( ) ,
721
- explicit_nulls : false ,
700
+ options : EncoderOptions :: default ( ) ,
722
701
}
723
702
}
724
703
725
704
/// Write a single JSON row to the output writer
705
+ #[ deprecated( note = "Use Writer::write" ) ]
726
706
pub fn write_row ( & mut self , row : & Value ) -> Result < ( ) , ArrowError > {
727
707
let is_first_row = !self . started ;
728
708
if !self . started {
@@ -738,18 +718,48 @@ where
738
718
Ok ( ( ) )
739
719
}
740
720
741
- /// Convert the `RecordBatch` into JSON rows, and write them to the output
721
+ /// Serialize `batch` to JSON output
742
722
pub fn write ( & mut self , batch : & RecordBatch ) -> Result < ( ) , ArrowError > {
743
- for row in record_batches_to_json_rows_internal ( & [ batch] , self . explicit_nulls ) ? {
744
- self . write_row ( & Value :: Object ( row ) ) ? ;
723
+ if batch. num_rows ( ) == 0 {
724
+ return Ok ( ( ) ) ;
745
725
}
726
+
727
+ // BufWriter uses a buffer size of 8KB
728
+ // We therefore allocate double that size and flush once more than 8KB has been buffered
729
+ let mut buffer = Vec :: with_capacity ( 16 * 1024 ) ;
730
+
731
+ let mut is_first_row = !self . started ;
732
+ if !self . started {
733
+ self . format . start_stream ( & mut buffer) ?;
734
+ self . started = true ;
735
+ }
736
+
737
+ let array = StructArray :: from ( batch. clone ( ) ) ;
738
+ let mut encoder = make_encoder ( & array, & self . options ) ?;
739
+
740
+ for idx in 0 ..batch. num_rows ( ) {
741
+ self . format . start_row ( & mut buffer, is_first_row) ?;
742
+ is_first_row = false ;
743
+
744
+ encoder. encode ( idx, & mut buffer) ;
745
+ if buffer. len ( ) > 8 * 1024 {
746
+ self . writer . write_all ( & buffer) ?;
747
+ buffer. clear ( ) ;
748
+ }
749
+ self . format . end_row ( & mut buffer) ?;
750
+ }
751
+
752
+ if !buffer. is_empty ( ) {
753
+ self . writer . write_all ( & buffer) ?;
754
+ }
755
+
746
756
Ok ( ( ) )
747
757
}
748
758
749
- /// Convert the [`RecordBatch`] into JSON rows, and write them to the output
759
+ /// Serialize `batches` to JSON output
750
760
pub fn write_batches ( & mut self , batches : & [ & RecordBatch ] ) -> Result < ( ) , ArrowError > {
751
- for row in record_batches_to_json_rows_internal ( batches, self . explicit_nulls ) ? {
752
- self . write_row ( & Value :: Object ( row ) ) ?;
761
+ for b in batches {
762
+ self . write ( b ) ?;
753
763
}
754
764
Ok ( ( ) )
755
765
}
@@ -803,6 +813,9 @@ mod tests {
803
813
804
814
/// Asserts that the NDJSON `input` is semantically identical to `expected`
805
815
fn assert_json_eq ( input : & [ u8 ] , expected : & str ) {
816
+ let s = std:: str:: from_utf8 ( input) . unwrap ( ) ;
817
+ println ! ( "{s}" ) ;
818
+
806
819
let expected: Vec < Option < Value > > = expected
807
820
. split ( '\n' )
808
821
. map ( |s| ( !s. is_empty ( ) ) . then ( || serde_json:: from_str ( s) . unwrap ( ) ) )
@@ -1453,6 +1466,7 @@ mod tests {
1453
1466
}
1454
1467
1455
1468
#[ test]
1469
+ #[ allow( deprecated) ]
1456
1470
fn json_writer_one_row ( ) {
1457
1471
let mut writer = ArrayWriter :: new ( vec ! [ ] as Vec < u8 > ) ;
1458
1472
let v = json ! ( { "an" : "object" } ) ;
@@ -1465,6 +1479,7 @@ mod tests {
1465
1479
}
1466
1480
1467
1481
#[ test]
1482
+ #[ allow( deprecated) ]
1468
1483
fn json_writer_two_rows ( ) {
1469
1484
let mut writer = ArrayWriter :: new ( vec ! [ ] as Vec < u8 > ) ;
1470
1485
let v = json ! ( { "an" : "object" } ) ;
@@ -1564,9 +1579,9 @@ mod tests {
1564
1579
r#"{"a":{"list":[1,2]},"b":{"list":[1,2]}}
1565
1580
{"a":{"list":[null]},"b":{"list":[null]}}
1566
1581
{"a":{"list":[]},"b":{"list":[]}}
1567
- {"a":null," b":{"list":[3,null]}}
1582
+ {"b":{"list":[3,null]}}
1568
1583
{"a":{"list":[4,5]},"b":{"list":[4,5]}}
1569
- {"a":null," b":{}}
1584
+ {"b":{}}
1570
1585
{"a":{},"b":{}}
1571
1586
"# ,
1572
1587
) ;
@@ -1621,7 +1636,7 @@ mod tests {
1621
1636
assert_json_eq (
1622
1637
& buf,
1623
1638
r#"{"map":{"foo":10}}
1624
- {"map":null }
1639
+ {}
1625
1640
{"map":{}}
1626
1641
{"map":{"bar":20,"baz":30,"qux":40}}
1627
1642
{"map":{"quux":50}}
@@ -1918,6 +1933,8 @@ mod tests {
1918
1933
writer. finish ( ) ?;
1919
1934
}
1920
1935
1936
+ println ! ( "{}" , std:: str :: from_utf8( & buf) . unwrap( ) ) ;
1937
+
1921
1938
let actual = serde_json:: from_slice :: < Vec < Value > > ( & buf) . unwrap ( ) ;
1922
1939
let expected = serde_json:: from_value :: < Vec < Value > > ( json ! ( [
1923
1940
{
0 commit comments