From e9edbfcbea41852fda5700bb579f0e7dfe367462 Mon Sep 17 00:00:00 2001
From: Jefffrey <jeffrey.vo.australia@gmail.com>
Date: Wed, 1 Jan 2025 00:08:54 +1100
Subject: [PATCH] Add roundtrip test case for empty null buffers

---
 src/arrow_writer.rs           | 61 +++++++++++++++++++++++++++++++++-
 tests/basic/data/no_nulls.orc | Bin 355 -> 0 bytes
 tests/basic/data/no_nulls.py  | 32 ------------------
 tests/basic/main.rs           | 43 ------------------------
 4 files changed, 60 insertions(+), 76 deletions(-)
 delete mode 100644 tests/basic/data/no_nulls.orc
 delete mode 100644 tests/basic/data/no_nulls.py

diff --git a/src/arrow_writer.rs b/src/arrow_writer.rs
index a8b1dd9..0b4085d 100644
--- a/src/arrow_writer.rs
+++ b/src/arrow_writer.rs
@@ -268,12 +268,13 @@ mod tests {
             Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatchReader,
             StringArray,
         },
+        buffer::NullBuffer,
         compute::concat_batches,
         datatypes::{DataType as ArrowDataType, Field, Schema},
     };
     use bytes::Bytes;
 
-    use crate::ArrowReaderBuilder;
+    use crate::{stripe::Stripe, ArrowReaderBuilder};
 
     use super::*;
 
@@ -474,4 +475,62 @@ mod tests {
         let rows = roundtrip(&[batch1, batch2]);
         assert_eq!(expected_batch, rows[0]);
     }
+
+    #[test]
+    fn test_empty_null_buffers() {
+        // Create an ORC file with present streams, but with no actual nulls.
+        // When this file is read back, the resulting Arrow arrays should have
+        // NO null buffer, even though there is a present stream.
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "int64",
+            ArrowDataType::Int64,
+            true,
+        )]));
+
+        // Array that carries a null buffer but contains no nulls
+        let array_empty_nulls = Arc::new(Int64Array::from_iter_values_with_nulls(
+            vec![1],
+            Some(NullBuffer::from_iter(vec![true])),
+        ));
+        assert!(array_empty_nulls.nulls().is_some());
+        assert_eq!(array_empty_nulls.null_count(), 0);
+
+        let batch = RecordBatch::try_new(schema, vec![array_empty_nulls]).unwrap();
+
+        // Encoding to bytes
+        let mut f = vec![];
+        let mut writer = ArrowWriterBuilder::new(&mut f, batch.schema())
+            .try_build()
+            .unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+        let mut f = Bytes::from(f);
+        let builder = ArrowReaderBuilder::try_new(f.clone()).unwrap();
+
+        // Ensure the ORC file we wrote indeed has a present stream
+        let stripe = Stripe::new(
+            &mut f,
+            &builder.file_metadata,
+            builder.file_metadata().root_data_type(),
+            &builder.file_metadata().stripe_metadatas()[0],
+        )
+        .unwrap();
+        assert_eq!(stripe.columns().len(), 1);
+        // Make sure we're getting the right column
+        assert_eq!(stripe.columns()[0].name(), "int64");
+        // Then check it has a present stream
+        let present_stream = stripe
+            .stream_map()
+            .get_opt(&stripe.columns()[0], proto::stream::Kind::Present);
+        assert!(present_stream.is_some());
+
+        // Decoding from bytes
+        let reader = builder.build();
+        let rows = reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+        assert_eq!(rows.len(), 1);
+        assert_eq!(rows[0].num_columns(), 1);
+        // Ensure the read array has no null buffer
+        assert!(rows[0].column(0).nulls().is_none());
+    }
 }
diff --git a/tests/basic/data/no_nulls.orc b/tests/basic/data/no_nulls.orc
deleted file mode 100644
index 0bae0dae8a8eaeec149dd6eec772a65a2adf6731..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 355
zcmeYda+YOa;Nsz8VE_Ul77nHW1|0^5$3oAO(i<2Q<@nST&G^(9#W=SyurXKxC4!zk
zN@-wJG~`oLWz@U2=uZ)=v!X$&ilWc~1_o9JhW`h^1UmzR8>0$>87j)aP?TS;U<e|N
z%7Er1Jx&l#m>}?k>0*!2q=Z?XPK+ML3`ZjmnkdgmW4V-*U?)7|Jj)Wt8aIWr+DDjz
z_*G^mED<Tr=rCpo1M0r?FzI0eSHgow4-<rdCJ7}>Okn<W#A3!2mK_n@pH7SNv>uo;
zvBiUdnc*Bzb%NN90~$w<9@KfDx^Q8_@`FoM4jxoKa^!@{(MfaWCo`z=ib|MB@JLLt
zu$b{sr;Out!^y7JX3d6!UG2@D9lgz-4X#bRCxjm-1PMJz(mkagZXm)G>?Wkd$ipZ6
o$3|ju1n<`~U(YadWC<`zG&Ha%F){FH=ozpzePL$y4{{a<0J{Kpy8r+H

diff --git a/tests/basic/data/no_nulls.py b/tests/basic/data/no_nulls.py
deleted file mode 100644
index 4597f44..0000000
--- a/tests/basic/data/no_nulls.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Generates a file with a 'present' stream for each column, but no actual nulls in it
-"""
-
-from pathlib import Path
-
-import pyorc
-
-path = Path(__file__).parent / "no_nulls.orc"
-
-with path.open("wb") as data:
-    #with pyorc.Writer(data, "struct<col0:int,col1:string>") as writer:
-    with pyorc.Writer(data, pyorc.Struct(col0=pyorc.Int(), col1=pyorc.String())) as writer:
-        writer.write((1, "row 1"))
-        writer.write((2, "row 2"))
diff --git a/tests/basic/main.rs b/tests/basic/main.rs
index 4353102..84d62d8 100644
--- a/tests/basic/main.rs
+++ b/tests/basic/main.rs
@@ -625,46 +625,3 @@ pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
         expected_lines, actual_lines
     );
 }
-
-/// Tests a file with a 'present' stream for each column, but no actual nulls in it
-#[test]
-pub fn no_nulls_test() {
-    let path = basic_path("no_nulls.orc");
-    let reader = new_arrow_reader_root(&path);
-    let schema = reader.schema();
-    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
-
-    let expected = [
-        "+------+-------+",
-        "| col0 | col1  |",
-        "+------+-------+",
-        "| 1    | row 1 |",
-        "| 2    | row 2 |",
-        "+------+-------+",
-    ];
-    assert_batches_eq(&batches, &expected);
-
-    let expected_file_schema = Arc::new(Schema::new(vec![
-        Field::new(
-            "col0",
-            DataType::Int32,
-            true, // this shouldn't change unless no_nulls.orc was incorrectly regenerated
-        ),
-        Field::new(
-            "col1",
-            DataType::Utf8,
-            true, // this shouldn't change unless no_nulls.orc was incorrectly regenerated
-        ),
-    ]));
-    assert_eq!(schema, expected_file_schema);
-
-    let expected_batch_schema = Arc::new(Schema::new(vec![
-        Field::new("col0", DataType::Int32, false),
-        Field::new("col1", DataType::Utf8, false),
-    ]));
-    for batch in &batches {
-        assert_eq!(batch.schema(), expected_batch_schema);
-        assert!(batch.column_by_name("col0").unwrap().nulls().is_none());
-        assert!(batch.column_by_name("col1").unwrap().nulls().is_none());
-    }
-}
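
Note (not part of the patch): the new test hinges on one invariant: a present stream that marks every value as valid should not surface as a null buffer on the decoded Arrow array. The standalone sketch below illustrates that invariant with plain arrow-rs calls, independent of the ORC reader. The normalize_nulls helper is hypothetical and named here only for illustration; it is not an API of this crate or of arrow-rs.

use arrow::array::{Array, Int64Array};
use arrow::buffer::NullBuffer;

// Hypothetical helper: drop a decoded validity buffer when it marks no nulls,
// which is the observable behaviour test_empty_null_buffers expects.
fn normalize_nulls(nulls: Option<NullBuffer>) -> Option<NullBuffer> {
    nulls.filter(|n| n.null_count() > 0)
}

fn main() {
    // Validity decoded from a present stream that happens to be all-valid.
    let decoded = Some(NullBuffer::from_iter([true, true, true]));

    // Without normalization, the array keeps an "empty" null buffer around.
    let raw = Int64Array::from_iter_values_with_nulls([1_i64, 2, 3], decoded.clone());
    assert!(raw.nulls().is_some());
    assert_eq!(raw.null_count(), 0);

    // With normalization, the array reports no null buffer at all.
    let cleaned = Int64Array::from_iter_values_with_nulls([1_i64, 2, 3], normalize_nulls(decoded));
    assert!(cleaned.nulls().is_none());
    assert_eq!(cleaned.null_count(), 0);
}

Whether such filtering lives in the present-stream decoder or at array construction is an implementation detail; the roundtrip test only checks the observable result.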