From d4c5377f28221240edcc389a0b0bb2141dc5bd78 Mon Sep 17 00:00:00 2001 From: Yang Xiufeng Date: Tue, 2 Jul 2024 22:00:18 +0800 Subject: [PATCH] fix: read array columns correctly (#112) * fix: panic when read column of type array * ci: add tests for nested column. --- src/array_decoder/mod.rs | 4 +- src/reader/decode/float.rs | 39 ++++++----------- tests/basic/data/nested_array_float.orc | Bin 0 -> 358 bytes tests/basic/data/nested_array_struct.orc | Bin 0 -> 653 bytes tests/basic/data/nested_map_struct.orc | Bin 0 -> 742 bytes tests/basic/data/write.py | 30 +++++++++++++ tests/basic/main.rs | 52 +++++++++++++++++++++++ 7 files changed, 98 insertions(+), 27 deletions(-) create mode 100644 tests/basic/data/nested_array_float.orc create mode 100644 tests/basic/data/nested_array_struct.orc create mode 100644 tests/basic/data/nested_map_struct.orc diff --git a/src/array_decoder/mod.rs b/src/array_decoder/mod.rs index 755e920c..ee17392d 100644 --- a/src/array_decoder/mod.rs +++ b/src/array_decoder/mod.rs @@ -404,7 +404,7 @@ pub fn array_decoder_factory( } ); let iter = stripe.stream_map().get(column, Kind::Data); - let iter = Box::new(FloatIter::new(iter, stripe.number_of_rows())); + let iter = Box::new(FloatIter::new(iter)); let present = get_present_vec(column, stripe)? .map(|iter| Box::new(iter.into_iter()) as Box + Send>); Box::new(Float32ArrayDecoder::new(iter, present)) @@ -418,7 +418,7 @@ pub fn array_decoder_factory( } ); let iter = stripe.stream_map().get(column, Kind::Data); - let iter = Box::new(FloatIter::new(iter, stripe.number_of_rows())); + let iter = Box::new(FloatIter::new(iter)); let present = get_present_vec(column, stripe)? .map(|iter| Box::new(iter.into_iter()) as Box + Send>); Box::new(Float64ArrayDecoder::new(iter, present)) diff --git a/src/reader/decode/float.rs b/src/reader/decode/float.rs index 20d2496d..c681d88b 100644 --- a/src/reader/decode/float.rs +++ b/src/reader/decode/float.rs @@ -32,26 +32,18 @@ impl Float for f64 { /// An iterator pub struct FloatIter { reader: R, - remaining: usize, phantom: std::marker::PhantomData, } impl FloatIter { /// Returns a new [`FloatIter`] #[inline] - pub fn new(reader: R, length: usize) -> Self { + pub fn new(reader: R) -> Self { Self { reader, - remaining: length, phantom: Default::default(), } } - - /// The number of items remaining - #[inline] - pub fn len(&self) -> usize { - self.remaining - } } impl Iterator for FloatIter { @@ -59,26 +51,23 @@ impl Iterator for FloatIter { #[inline] fn next(&mut self) -> Option { - if self.remaining == 0 { - return None; - } let mut chunk: T::OBytes = Default::default(); - if let Err(err) = self + match self .reader - .read_exact(chunk.as_mut()) + .read(chunk.as_mut()) .context(error::DecodeFloatSnafu) { - return Some(Err(err)); + Err(err) => { + return Some(Err(err)); + } + Ok(n) => { + if n == 0 { + return None; + } + } }; - self.remaining -= 1; Some(Ok(T::from_le_bytes(chunk))) } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let remaining = self.len(); - (remaining, Some(remaining)) - } } #[cfg(test)] @@ -100,7 +89,7 @@ mod tests { let bytes = float_to_bytes(&input); let bytes = Cursor::new(bytes); - let iter = FloatIter::::new(bytes, input.len()); + let iter = FloatIter::::new(bytes); let actual = iter.collect::>>().unwrap(); assert_eq!(input, actual); @@ -131,7 +120,7 @@ mod tests { let bytes = float_to_bytes(&[f32::NAN]); let bytes = Cursor::new(bytes); - let iter = FloatIter::::new(bytes, 1); + let iter = FloatIter::::new(bytes); let actual = iter.collect::>>().unwrap(); assert_eq!(actual.len(), 1); assert!(actual[0].is_nan()); @@ -142,7 +131,7 @@ mod tests { let bytes = float_to_bytes(&[f64::NAN]); let bytes = Cursor::new(bytes); - let iter = FloatIter::::new(bytes, 1); + let iter = FloatIter::::new(bytes); let actual = iter.collect::>>().unwrap(); assert_eq!(actual.len(), 1); assert!(actual[0].is_nan()); diff --git a/tests/basic/data/nested_array_float.orc b/tests/basic/data/nested_array_float.orc new file mode 100644 index 0000000000000000000000000000000000000000..341f00b136441edde37fc351119ceecb5d756c9c GIT binary patch literal 358 zcmaivF;2rk5JmsY#?~-cXtNU1VjAPh2q|)eQs5@5xCj-AA~*n%=xCusLY#n}h9hwl z3VK-E2+B;c@9i}I&uY_Z>PEtA=z%ier1LF%=&6nn$DN4DkM}wSKIh{>BpeOm!ZBO=Rk!;DENLcn9Ns(D>0E@0i|3=Tv&2_8iI#l~FwY!WM z`$oA&w0>MHD=wX6jwnwaZ|@)O3RI|3?u$C&-;YQ#Sj|lz*!%@Hy92}P)g^6{>EvQE Vm8W=iQ8Kb>R;Mpt#;i8&;SXmYBbopJ literal 0 HcmV?d00001 diff --git a/tests/basic/data/nested_array_struct.orc b/tests/basic/data/nested_array_struct.orc new file mode 100644 index 0000000000000000000000000000000000000000..be822746be14e1832e749f9de00bab7bba8175f3 GIT binary patch literal 653 zcma)(F;2rk5JhKa*Jjulw5&u}3`GbOK}eB9D%@ZZ2~m+Kf&$TyAUay85nO;Hz-egV z1~~!+eHiaXfhYw?mf!#1znSlDzoTQT2z#KHH3C>F^$;ECik85EBH0sqS1UGPw@9WN zi=_nkJX{HaiuE8|!PQkAd1n)nHYPN&OE^w-97@)ZtZR)~%VO3pza5E}@%wlL@NmeL zf}dhMI>wzV&bXHCXKfu*Y`~GHaK;nelBaZgp31FxMsAadMmWg?gAP5Vr!dkb-m8gM zPI(`Bk0wws%Wc4vBcOi={X5}2mFH(2eWAmQKLdv4{;}BXP;>K;G;f%4$vg?SU3(P` ztXoQ~5NUKjxV^inK?5?VS4dld6I11alqyQaAQi(@Tu*YMXsYB?Ed-Te?vKc)N3iGL rpZ|SCK66AqcSP}gdLo8Vt9j6D(TCuFS*s%?jms>4{F1WU?^M45)rCEi literal 0 HcmV?d00001 diff --git a/tests/basic/data/nested_map_struct.orc b/tests/basic/data/nested_map_struct.orc new file mode 100644 index 0000000000000000000000000000000000000000..4c95e3c7e9501d207f550b8ca3a081c15371dd51 GIT binary patch literal 742 zcma)4yH3L}6!oUCYy+ zGc;eElZ3k(nB7JhZLRwA?5)3E1}1Nn!A?$dR&!4-Z9$FBQtdbL&)E^ofWDUQrw3_| zv4=h}<+SL1$d5&Q_1)D9m0=D&f@^x@uH{iwb?FkH2X4co@_lB&amqQN@W_1*B)#AR zBc1UWkQK$8s`|1T$~&KLHw@}nM!^Rt1z(WTML{??5xZT*L%Auf7ZBSK+bc@z>n zgWGGrKq6v`p#_M7`^oj)Wt$}|WpRjWmUB^Mh$~gppjjaU6~QEeX#^MZiC`YpPNgFA zDzhb(S?%X&iY`y~w@M;fP+2F+pH5F;8gx5{oi4ruU4(YbS(x0W>hVK};ds#a0*H4) ATmS$7 literal 0 HcmV?d00001 diff --git a/tests/basic/data/write.py b/tests/basic/data/write.py index 7c43c045..1606c99c 100644 --- a/tests/basic/data/write.py +++ b/tests/basic/data/write.py @@ -98,6 +98,7 @@ def _write( _write("struct>", nested_struct, "nested_struct.orc") + nested_array = { "value": [ [1, None, 3, 43, 5], @@ -110,6 +111,25 @@ def _write( _write("struct>", nested_array, "nested_array.orc") + +nested_array_float = { + "value": [ + [1.0, 3.0], + [None, 2.0], + ], +} + +_write("struct>", nested_array_float, "nested_array_float.orc") + +nested_array_struct = { + "value": [ + [(1.0, 1, "01"), (2.0, 2, "02")], + [None, (3.0, 3, "03")], + ], +} + +_write("struct>>", nested_array_struct, "nested_array_struct.orc") + nested_map = { "map": [ {"zero": 0, "one": 1}, @@ -121,6 +141,16 @@ def _write( _write("struct>", nested_map, "nested_map.orc") +nested_map_struct = { + "map": [ + {"01": (1.0, 1, "01"), "02": (2.0, 1, "02")}, + None, + {"03": (3.0, 3, "03"), "04": (4.0, 4, "04")}, + ], +} + +_write("struct>>", nested_map_struct, "nested_map_struct.orc") + _write( infer_schema(data), diff --git a/tests/basic/main.rs b/tests/basic/main.rs index 9a541f7b..b0014ca5 100644 --- a/tests/basic/main.rs +++ b/tests/basic/main.rs @@ -252,6 +252,58 @@ pub fn basic_test_nested_array() { assert_batches_eq(&batch, &expected); } +#[test] +pub fn basic_test_nested_array_float() { + let path = basic_path("nested_array_float.orc"); + let reader = new_arrow_reader_root(&path); + let batch = reader.collect::, _>>().unwrap(); + + let expected = [ + "+------------+", + "| value |", + "+------------+", + "| [1.0, 3.0] |", + "| [, 2.0] |", + "+------------+", + ]; + assert_batches_eq(&batch, &expected); +} + +#[test] +pub fn basic_test_nested_array_struct() { + let path = basic_path("nested_array_struct.orc"); + let reader = new_arrow_reader_root(&path); + let batch = reader.collect::, _>>().unwrap(); + + let expected = [ + "+------------------------------------------------+", + "| value |", + "+------------------------------------------------+", + "| [{a: 1.0, b: 1, c: 01}, {a: 2.0, b: 2, c: 02}] |", + "| [, {a: 3.0, b: 3, c: 03}] |", + "+------------------------------------------------+", + ]; + assert_batches_eq(&batch, &expected); +} + +#[test] +pub fn basic_test_nested_map_struct() { + let path = basic_path("nested_map_struct.orc"); + let reader = new_arrow_reader_root(&path); + let batch = reader.collect::, _>>().unwrap(); + + let expected = [ + "+--------------------------------------------------------+", + "| value |", + "+--------------------------------------------------------+", + "| {01: {a: 1.0, b: 1, c: 01}, 02: {a: 2.0, b: 1, c: 02}} |", + "| |", + "| {03: {a: 3.0, b: 3, c: 03}, 04: {a: 4.0, b: 4, c: 04}} |", + "+--------------------------------------------------------+", + ]; + assert_batches_eq(&batch, &expected); +} + #[test] pub fn basic_test_nested_map() { let path = basic_path("nested_map.orc");