From e9edbfcbea41852fda5700bb579f0e7dfe367462 Mon Sep 17 00:00:00 2001
From: Jefffrey <jeffrey.vo.australia@gmail.com>
Date: Wed, 1 Jan 2025 00:08:54 +1100
Subject: [PATCH] Add roundtrip test case for empty null buffers

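Write an Int64 array that carries a null buffer with zero nulls, verify
the resulting ORC file still contains a PRESENT stream for the column,
and assert that the array read back has no null buffer. This replaces
the pregenerated no_nulls.orc fixture, its pyorc generator script, and
the no_nulls_test in tests/basic/main.rs that relied on them.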
---
 src/arrow_writer.rs           |  61 +++++++++++++++++++++++++++++++++-
 tests/basic/data/no_nulls.orc | Bin 355 -> 0 bytes
 tests/basic/data/no_nulls.py  |  32 ------------------
 tests/basic/main.rs           |  43 ------------------------
 4 files changed, 60 insertions(+), 76 deletions(-)
 delete mode 100644 tests/basic/data/no_nulls.orc
 delete mode 100644 tests/basic/data/no_nulls.py

diff --git a/src/arrow_writer.rs b/src/arrow_writer.rs
index a8b1dd9..0b4085d 100644
--- a/src/arrow_writer.rs
+++ b/src/arrow_writer.rs
@@ -268,12 +268,13 @@ mod tests {
             Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatchReader,
             StringArray,
         },
+        buffer::NullBuffer,
         compute::concat_batches,
         datatypes::{DataType as ArrowDataType, Field, Schema},
     };
     use bytes::Bytes;
 
-    use crate::ArrowReaderBuilder;
+    use crate::{stripe::Stripe, ArrowReaderBuilder};
 
     use super::*;
 
@@ -474,4 +475,62 @@ mod tests {
         let rows = roundtrip(&[batch1, batch2]);
         assert_eq!(expected_batch, rows[0]);
     }
+
+    #[test]
+    fn test_empty_null_buffers() {
+        // Create an ORC file with present streams, but which contain no nulls.
+        // When this file is read back, the resulting Arrow arrays should have
+        // NO null buffer, even though there is a present stream.
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "int64",
+            ArrowDataType::Int64,
+            true,
+        )]));
+
+        // Array that has a null buffer but contains no nulls
+        let array_empty_nulls = Arc::new(Int64Array::from_iter_values_with_nulls(
+            vec![1],
+            Some(NullBuffer::from_iter(vec![true])),
+        ));
+        assert!(array_empty_nulls.nulls().is_some());
+        assert_eq!(array_empty_nulls.null_count(), 0);
+
+        let batch = RecordBatch::try_new(schema, vec![array_empty_nulls]).unwrap();
+
+        // Encoding to bytes
+        let mut f = vec![];
+        let mut writer = ArrowWriterBuilder::new(&mut f, batch.schema())
+            .try_build()
+            .unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+        let mut f = Bytes::from(f);
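+        // Bytes is cheaply cloneable, letting one buffer serve the reader and the stripe check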
+        let builder = ArrowReaderBuilder::try_new(f.clone()).unwrap();
+
+        // Ensure the ORC file we wrote indeed has a present stream
+        let stripe = Stripe::new(
+            &mut f,
+            &builder.file_metadata,
+            builder.file_metadata().root_data_type(),
+            &builder.file_metadata().stripe_metadatas()[0],
+        )
+        .unwrap();
+        assert_eq!(stripe.columns().len(), 1);
+        // Make sure we're getting the right column
+        assert_eq!(stripe.columns()[0].name(), "int64");
+        // Then check present stream
+        let present_stream = stripe
+            .stream_map()
+            .get_opt(&stripe.columns()[0], proto::stream::Kind::Present);
+        assert!(present_stream.is_some());
+
+        // Decoding from bytes
+        let reader = builder.build();
+        let rows = reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+        assert_eq!(rows.len(), 1);
+        assert_eq!(rows[0].num_columns(), 1);
+        // Ensure read array has no null buffer
+        assert!(rows[0].column(0).nulls().is_none());
+    }
 }
diff --git a/tests/basic/data/no_nulls.orc b/tests/basic/data/no_nulls.orc
deleted file mode 100644
index 0bae0dae8a8eaeec149dd6eec772a65a2adf6731..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 355
zcmeYda+YOa;Nsz8VE_Ul77nHW1|0^5$3oAO(i<2Q<@nST&G^(9#W=SyurXKxC4!zk
zN@-wJG~`oLWz@U2=uZ)=v!X$&ilWc~1_o9JhW`h^1UmzR8>0$>87j)aP?TS;U<e|N
z%7Er1Jx&l#m>}?k>0*!2q=Z?XPK+ML3`ZjmnkdgmW4V-*U?)7|Jj)Wt8aIWr+DDjz
z_*G^mED<Tr=rCpo1M0r?FzI0eSHgow4-<rdCJ7}>Okn<W#A3!2mK_n@pH7SNv>uo;
zvBiUdnc*Bzb%NN90~$w<9@KfDx^Q8_@`FoM4jxoKa^!@{(MfaWCo`z=ib|MB@JLLt
zu$b{sr;Out!^y7JX3d6!UG2@D9lgz-4X#bRCxjm-1PMJz(mkagZXm)G>?Wkd$ipZ6
o$3|ju1n<`~U(YadWC<`zG&Ha%F){FH=ozpzePL$y4{{a<0J{Kpy8r+H

diff --git a/tests/basic/data/no_nulls.py b/tests/basic/data/no_nulls.py
deleted file mode 100644
index 4597f44..0000000
--- a/tests/basic/data/no_nulls.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Generates a file with a 'present' stream for each column, but no actual nulls in it
-"""
-
-from pathlib import Path
-
-import pyorc
-
-path = Path(__file__).parent / "no_nulls.orc"
-
-with path.open("wb") as data:
-    #with pyorc.Writer(data, "struct<col0:int,col1:string>") as writer:
-    with pyorc.Writer(data, pyorc.Struct(col0=pyorc.Int(), col1=pyorc.String())) as writer:
-        writer.write((1, "row 1"))
-        writer.write((2, "row 2"))
diff --git a/tests/basic/main.rs b/tests/basic/main.rs
index 4353102..84d62d8 100644
--- a/tests/basic/main.rs
+++ b/tests/basic/main.rs
@@ -625,46 +625,3 @@ pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
         expected_lines, actual_lines
     );
 }
-
-/// Tests a file with a 'present' stream for each column, but no actual nulls in it
-#[test]
-pub fn no_nulls_test() {
-    let path = basic_path("no_nulls.orc");
-    let reader = new_arrow_reader_root(&path);
-    let schema = reader.schema();
-    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
-
-    let expected = [
-        "+------+-------+",
-        "| col0 | col1  |",
-        "+------+-------+",
-        "| 1    | row 1 |",
-        "| 2    | row 2 |",
-        "+------+-------+",
-    ];
-    assert_batches_eq(&batches, &expected);
-
-    let expected_file_schema = Arc::new(Schema::new(vec![
-        Field::new(
-            "col0",
-            DataType::Int32,
-            true, // this shouldn't change unless no_nulls.orc was incorrectly regenerated
-        ),
-        Field::new(
-            "col1",
-            DataType::Utf8,
-            true, // this shouldn't change unless no_nulls.orc was incorrectly regenerated
-        ),
-    ]));
-    assert_eq!(schema, expected_file_schema);
-
-    let expected_batch_schema = Arc::new(Schema::new(vec![
-        Field::new("col0", DataType::Int32, false),
-        Field::new("col1", DataType::Utf8, false),
-    ]));
-    for batch in &batches {
-        assert_eq!(batch.schema(), expected_batch_schema);
-        assert!(batch.column_by_name("col0").unwrap().nulls().is_none());
-        assert!(batch.column_by_name("col1").unwrap().nulls().is_none());
-    }
-}