Add roundtrip test case for null buffer test #1

Merged · 1 commit merged on Dec 31, 2024
61 changes: 60 additions & 1 deletion src/arrow_writer.rs
@@ -268,12 +268,13 @@ mod tests {
Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatchReader,
StringArray,
},
buffer::NullBuffer,
compute::concat_batches,
datatypes::{DataType as ArrowDataType, Field, Schema},
};
use bytes::Bytes;

use crate::ArrowReaderBuilder;
use crate::{stripe::Stripe, ArrowReaderBuilder};

use super::*;

@@ -474,4 +475,62 @@ mod tests {
let rows = roundtrip(&[batch1, batch2]);
assert_eq!(expected_batch, rows[0]);
}

#[test]
fn test_empty_null_buffers() {
// Create an ORC file whose columns have present streams but contain no nulls.
// When this file is read back, the resulting Arrow arrays should have
// NO null buffer, even though a present stream was written.
let schema = Arc::new(Schema::new(vec![Field::new(
"int64",
ArrowDataType::Int64,
true,
)]));

// Array with a null buffer allocated but containing no nulls
let array_empty_nulls = Arc::new(Int64Array::from_iter_values_with_nulls(
vec![1],
Some(NullBuffer::from_iter(vec![true])),
));
assert!(array_empty_nulls.nulls().is_some());
assert!(array_empty_nulls.null_count() == 0);

let batch = RecordBatch::try_new(schema, vec![array_empty_nulls]).unwrap();

// Encoding to bytes
let mut f = vec![];
let mut writer = ArrowWriterBuilder::new(&mut f, batch.schema())
.try_build()
.unwrap();
writer.write(&batch).unwrap();
writer.close().unwrap();
let mut f = Bytes::from(f);
let builder = ArrowReaderBuilder::try_new(f.clone()).unwrap();

// Ensure the ORC file we wrote indeed has a present stream
let stripe = Stripe::new(
&mut f,
&builder.file_metadata,
builder.file_metadata().root_data_type(),
&builder.file_metadata().stripe_metadatas()[0],
)
.unwrap();
assert_eq!(stripe.columns().len(), 1);
// Make sure we're getting the right column
assert_eq!(stripe.columns()[0].name(), "int64");
// Then check present stream
let present_stream = stripe
.stream_map()
.get_opt(&stripe.columns()[0], proto::stream::Kind::Present);
assert!(present_stream.is_some());

// Decoding from bytes
let reader = builder.build();
let rows = reader.collect::<Result<Vec<_>, _>>().unwrap();

assert_eq!(rows.len(), 1);
assert_eq!(rows[0].num_columns(), 1);
// Ensure read array has no null buffer
assert!(rows[0].column(0).nulls().is_none());
Comment on lines +500 to +534 (Author):
A bit verbose, but it has important checks to ensure that if our ORC writer behaviour changes (such that it no longer writes an empty present stream), we'll be informed by this test breaking.

}
}
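As an aside to the diff above, here is a minimal standalone sketch of the state the new test starts from, assuming only the arrow crate APIs already imported in the hunk (Int64Array::from_iter_values_with_nulls and NullBuffer): an Arrow array can carry an allocated validity (null) buffer even though it contains no nulls.

use arrow::array::{Array, Int64Array};
use arrow::buffer::NullBuffer;

fn main() {
    // All-valid validity bitmap: every bit is `true`, so no value is null,
    // but the buffer itself is still allocated and attached to the array.
    let array = Int64Array::from_iter_values_with_nulls(
        vec![1_i64, 2, 3],
        Some(NullBuffer::from_iter(vec![true, true, true])),
    );

    // The array reports a validity buffer...
    assert!(array.nulls().is_some());
    // ...yet it contains zero nulls.
    assert_eq!(array.null_count(), 0);
}

The round-trip test above writes exactly such an array to ORC (which produces a present stream) and then asserts that the array read back carries no null buffer at all.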
Binary file removed tests/basic/data/no_nulls.orc
Binary file not shown.
32 changes: 0 additions & 32 deletions tests/basic/data/no_nulls.py

This file was deleted.

43 changes: 0 additions & 43 deletions tests/basic/main.rs
@@ -625,46 +625,3 @@ pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
expected_lines, actual_lines
);
}

/// Tests a file with a 'present' stream for each column, but no actual nulls in it
#[test]
pub fn no_nulls_test() {
let path = basic_path("no_nulls.orc");
let reader = new_arrow_reader_root(&path);
let schema = reader.schema();
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();

let expected = [
"+------+-------+",
"| col0 | col1 |",
"+------+-------+",
"| 1 | row 1 |",
"| 2 | row 2 |",
"+------+-------+",
];
assert_batches_eq(&batches, &expected);

let expected_file_schema = Arc::new(Schema::new(vec![
Field::new(
"col0",
DataType::Int32,
true, // this shouldn't change unless no_nulls.orc was incorrectly regenerated
),
Field::new(
"col1",
DataType::Utf8,
true, // this shouldn't change unless no_nulls.orc was incorrectly regenerated
),
]));
assert_eq!(schema, expected_file_schema);

let expected_batch_schema = Arc::new(Schema::new(vec![
Field::new("col0", DataType::Int32, false),
Field::new("col1", DataType::Utf8, false),
]));
for batch in &batches {
assert_eq!(batch.schema(), expected_batch_schema);
assert!(batch.column_by_name("col0").unwrap().nulls().is_none());
assert!(batch.column_by_name("col1").unwrap().nulls().is_none());
}
}