diff --git a/crates/polars-io/src/parquet/metadata.rs b/crates/polars-io/src/parquet/metadata.rs
index bc032651b837..ad62aecf36d3 100644
--- a/crates/polars-io/src/parquet/metadata.rs
+++ b/crates/polars-io/src/parquet/metadata.rs
@@ -2,7 +2,7 @@ use std::sync::Arc;
 
-pub use polars_parquet::parquet::metadata::FileMetaData;
+pub use polars_parquet::parquet::metadata::FileMetadata;
 pub use polars_parquet::read::statistics::{deserialize, Statistics as ParquetStatistics};
 
-pub type FileMetaDataRef = Arc<FileMetaData>;
+pub type FileMetadataRef = Arc<FileMetadata>;
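The change starts at the narrowest point: the public re-export and the `Arc` alias in `polars-io`. A minimal sketch of what the alias buys downstream code (hypothetical usage, not code from this diff):

```rust
use std::sync::Arc;
use polars_io::parquet::metadata::{FileMetadata, FileMetadataRef};

// Parsing a parquet footer is relatively expensive, so the metadata is wrapped
// in an Arc once and then shared; clones are pointer bumps, not re-parses.
fn share(md: FileMetadata) -> (FileMetadataRef, FileMetadataRef) {
    let md: FileMetadataRef = Arc::new(md);
    (Arc::clone(&md), md)
}
```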
diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs
index 562156405b95..0c1ead03b85b 100644
--- a/crates/polars-io/src/parquet/read/async_impl.rs
+++ b/crates/polars-io/src/parquet/read/async_impl.rs
@@ -6,8 +6,8 @@ use bytes::Bytes;
 use object_store::path::Path as ObjectPath;
 use polars_core::config::{get_rg_prefetch_size, verbose};
 use polars_core::prelude::*;
-use polars_parquet::read::RowGroupMetaData;
-use polars_parquet::write::FileMetaData;
+use polars_parquet::read::RowGroupMetadata;
+use polars_parquet::write::FileMetadata;
 use polars_utils::pl_str::PlSmallStr;
 use tokio::sync::mpsc::{channel, Receiver, Sender};
 use tokio::sync::Mutex;
@@ -17,7 +17,7 @@ use super::predicates::read_this_row_group;
 use crate::cloud::{
     build_object_store, object_path_from_str, CloudLocation, CloudOptions, PolarsObjectStore,
 };
-use crate::parquet::metadata::FileMetaDataRef;
+use crate::parquet::metadata::FileMetadataRef;
 use crate::pl_async::get_runtime;
 use crate::predicates::PhysicalIoExpr;
@@ -29,14 +29,14 @@ pub struct ParquetObjectStore {
     store: PolarsObjectStore,
     path: ObjectPath,
     length: Option<usize>,
-    metadata: Option<FileMetaDataRef>,
+    metadata: Option<FileMetadataRef>,
 }
 
 impl ParquetObjectStore {
     pub async fn from_uri(
         uri: &str,
         options: Option<&CloudOptions>,
-        metadata: Option<FileMetaDataRef>,
+        metadata: Option<FileMetadataRef>,
     ) -> PolarsResult<Self> {
         let (CloudLocation { prefix, .. }, store) = build_object_store(uri, options, false).await?;
         let path = object_path_from_str(&prefix)?;
@@ -74,13 +74,13 @@ impl ParquetObjectStore {
     }
 
     /// Fetch the metadata of the parquet file, do not memoize it.
-    async fn fetch_metadata(&mut self) -> PolarsResult<FileMetaData> {
+    async fn fetch_metadata(&mut self) -> PolarsResult<FileMetadata> {
         let length = self.length().await?;
         fetch_metadata(&self.store, &self.path, length).await
     }
 
     /// Fetch and memoize the metadata of the parquet file.
-    pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> {
+    pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> {
         if self.metadata.is_none() {
             self.metadata = Some(Arc::new(self.fetch_metadata().await?));
         }
@@ -107,7 +107,7 @@ pub async fn fetch_metadata(
     store: &PolarsObjectStore,
     path: &ObjectPath,
     file_byte_length: usize,
-) -> PolarsResult<FileMetaData> {
+) -> PolarsResult<FileMetadata> {
     let footer_header_bytes = store
         .get_range(
             path,
@@ -165,7 +165,7 @@ pub async fn fetch_metadata(
 /// We concurrently download the columns for each field.
 async fn download_projection(
     fields: Arc<[PlSmallStr]>,
-    row_group: RowGroupMetaData,
+    row_group: RowGroupMetadata,
     async_reader: Arc<ParquetObjectStore>,
     sender: QueueSend,
     rg_index: usize,
@@ -205,7 +205,7 @@ async fn download_projection(
 }
 
 async fn download_row_group(
-    rg: RowGroupMetaData,
+    rg: RowGroupMetadata,
     async_reader: Arc<ParquetObjectStore>,
     sender: QueueSend,
     rg_index: usize,
@@ -255,7 +255,7 @@ impl FetchRowGroupsFromObjectStore {
         projection: Option<&[usize]>,
         predicate: Option<Arc<dyn PhysicalIoExpr>>,
         row_group_range: Range<usize>,
-        row_groups: &[RowGroupMetaData],
+        row_groups: &[RowGroupMetadata],
     ) -> PolarsResult<Self> {
         let projected_fields: Option<Arc<[PlSmallStr]>> = projection.map(|projection| {
             projection
diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs
index fa1655be4846..87615de1b8c2 100644
--- a/crates/polars-io/src/parquet/read/predicates.rs
+++ b/crates/polars-io/src/parquet/read/predicates.rs
@@ -1,6 +1,6 @@
 use polars_core::prelude::*;
 use polars_parquet::read::statistics::{deserialize, Statistics};
-use polars_parquet::read::RowGroupMetaData;
+use polars_parquet::read::RowGroupMetadata;
 
 use crate::predicates::{BatchStats, ColumnStats, PhysicalIoExpr};
@@ -17,7 +17,7 @@ impl ColumnStats {
 
 /// Collect the statistics in a row-group
 pub(crate) fn collect_statistics(
-    md: &RowGroupMetaData,
+    md: &RowGroupMetadata,
     schema: &ArrowSchema,
 ) -> PolarsResult<Option<BatchStats>> {
     // TODO! fix this performance. This is a full sequential scan.
@@ -47,7 +47,7 @@ pub(crate) fn collect_statistics(
 
 pub fn read_this_row_group(
     predicate: Option<&dyn PhysicalIoExpr>,
-    md: &RowGroupMetaData,
+    md: &RowGroupMetadata,
     schema: &ArrowSchema,
 ) -> PolarsResult<bool> {
     if let Some(pred) = predicate {
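`get_metadata` above is the only place the `Arc` is created; every later caller borrows the memoized value. The pattern in isolation (simplified stand-ins, not the polars types):

```rust
use std::sync::Arc;

struct Metadata; // stand-in for FileMetadata

struct Store {
    metadata: Option<Arc<Metadata>>,
}

impl Store {
    fn fetch(&self) -> Metadata {
        Metadata // pretend this parses the parquet footer
    }

    // Parse at most once; all subsequent calls hand out the cached Arc.
    fn get_metadata(&mut self) -> &Arc<Metadata> {
        if self.metadata.is_none() {
            self.metadata = Some(Arc::new(self.fetch()));
        }
        self.metadata.as_ref().unwrap()
    }
}
```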
diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs
index e43c34ca2d70..c621b698cebc 100644
--- a/crates/polars-io/src/parquet/read/read_impl.rs
+++ b/crates/polars-io/src/parquet/read/read_impl.rs
@@ -12,7 +12,7 @@ use polars_core::POOL;
 use polars_parquet::parquet::error::ParquetResult;
 use polars_parquet::parquet::statistics::Statistics;
 use polars_parquet::read::{
-    self, ColumnChunkMetadata, FileMetaData, Filter, PhysicalType, RowGroupMetaData,
+    self, ColumnChunkMetadata, FileMetadata, Filter, PhysicalType, RowGroupMetadata,
 };
 use polars_utils::mmap::MemSlice;
 use rayon::prelude::*;
@@ -26,7 +26,7 @@ use super::utils::materialize_empty_df;
 use super::{mmap, ParallelStrategy};
 use crate::hive::materialize_hive_partitions;
 use crate::mmap::{MmapBytesReader, ReaderBytes};
-use crate::parquet::metadata::FileMetaDataRef;
+use crate::parquet::metadata::FileMetadataRef;
 use crate::parquet::read::ROW_COUNT_OVERFLOW_ERR;
 use crate::predicates::{apply_predicate, PhysicalIoExpr};
 use crate::utils::get_reader_bytes;
@@ -142,7 +142,7 @@ fn rg_to_dfs(
     row_group_start: usize,
     row_group_end: usize,
     slice: (usize, usize),
-    file_metadata: &FileMetaData,
+    file_metadata: &FileMetadata,
     schema: &ArrowSchemaRef,
     predicate: Option<&dyn PhysicalIoExpr>,
     row_index: Option<RowIndex>,
@@ -227,7 +227,7 @@ fn rg_to_dfs_prefiltered(
     previous_row_count: &mut IdxSize,
     row_group_start: usize,
     row_group_end: usize,
-    file_metadata: &FileMetaData,
+    file_metadata: &FileMetadata,
     schema: &ArrowSchemaRef,
     live_variables: Vec<PlSmallStr>,
     predicate: &dyn PhysicalIoExpr,
@@ -501,7 +501,7 @@ fn rg_to_dfs_optionally_par_over_columns(
     row_group_start: usize,
     row_group_end: usize,
     slice: (usize, usize),
-    file_metadata: &FileMetaData,
+    file_metadata: &FileMetadata,
     schema: &ArrowSchemaRef,
     predicate: Option<&dyn PhysicalIoExpr>,
     row_index: Option<RowIndex>,
@@ -605,7 +605,7 @@ fn rg_to_dfs_par_over_rg(
     row_group_end: usize,
     previous_row_count: &mut IdxSize,
     slice: (usize, usize),
-    file_metadata: &FileMetaData,
+    file_metadata: &FileMetadata,
     schema: &ArrowSchemaRef,
     predicate: Option<&dyn PhysicalIoExpr>,
     row_index: Option<RowIndex>,
@@ -701,7 +701,7 @@ pub fn read_parquet(
     slice: (usize, usize),
     projection: Option<&[usize]>,
     reader_schema: &ArrowSchemaRef,
-    metadata: Option<FileMetaDataRef>,
+    metadata: Option<FileMetadataRef>,
     predicate: Option<&dyn PhysicalIoExpr>,
     mut parallel: ParallelStrategy,
     row_index: Option<RowIndex>,
@@ -855,7 +855,7 @@ pub(super) fn compute_row_group_range(
     row_group_start: usize,
     row_group_end: usize,
     slice: (usize, usize),
-    row_groups: &[RowGroupMetaData],
+    row_groups: &[RowGroupMetadata],
 ) -> std::ops::Range<usize> {
     let mut start = row_group_start;
     let mut cum_rows: usize = (0..row_group_start).map(|i| row_groups[i].num_rows()).sum();
@@ -901,7 +901,7 @@ pub struct BatchedParquetReader {
     slice: (usize, usize),
     projection: Arc<[usize]>,
     schema: ArrowSchemaRef,
-    metadata: FileMetaDataRef,
+    metadata: FileMetadataRef,
     predicate: Option<Arc<dyn PhysicalIoExpr>>,
     row_index: Option<RowIndex>,
     rows_read: IdxSize,
@@ -921,7 +921,7 @@ impl BatchedParquetReader {
     #[allow(clippy::too_many_arguments)]
     pub fn new(
         row_group_fetcher: RowGroupFetcher,
-        metadata: FileMetaDataRef,
+        metadata: FileMetadataRef,
         schema: ArrowSchemaRef,
         slice: (usize, usize),
         projection: Option<Vec<usize>>,
diff --git a/crates/polars-io/src/parquet/read/reader.rs b/crates/polars-io/src/parquet/read/reader.rs
index 25e8852a92ce..0f6f3b70b4f3 100644
--- a/crates/polars-io/src/parquet/read/reader.rs
+++ b/crates/polars-io/src/parquet/read/reader.rs
@@ -18,7 +18,7 @@ use super::utils::materialize_empty_df;
 #[cfg(feature = "cloud")]
 use crate::cloud::CloudOptions;
 use crate::mmap::MmapBytesReader;
-use crate::parquet::metadata::FileMetaDataRef;
+use crate::parquet::metadata::FileMetadataRef;
 use crate::predicates::PhysicalIoExpr;
 use crate::prelude::*;
 use crate::RowIndex;
@@ -35,7 +35,7 @@ pub struct ParquetReader<R: MmapBytesReader> {
     schema: Option<ArrowSchemaRef>,
     row_index: Option<RowIndex>,
     low_memory: bool,
-    metadata: Option<FileMetaDataRef>,
+    metadata: Option<FileMetadataRef>,
     predicate: Option<Arc<dyn PhysicalIoExpr>>,
     hive_partition_columns: Option<Vec<Series>>,
     include_file_path: Option<(PlSmallStr, Arc<str>)>,
@@ -138,7 +138,7 @@ impl<R: MmapBytesReader> ParquetReader<R> {
         self
     }
 
-    pub fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> {
+    pub fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> {
         if self.metadata.is_none() {
             self.metadata = Some(Arc::new(read::read_metadata(&mut self.reader)?));
         }
@@ -267,7 +267,7 @@ impl ParquetAsyncReader {
     pub async fn from_uri(
         uri: &str,
         cloud_options: Option<&CloudOptions>,
-        metadata: Option<FileMetaDataRef>,
+        metadata: Option<FileMetadataRef>,
     ) -> PolarsResult<ParquetAsyncReader> {
         Ok(ParquetAsyncReader {
             reader: ParquetObjectStore::from_uri(uri, cloud_options, metadata).await?,
@@ -406,7 +406,7 @@ impl ParquetAsyncReader {
         )
     }
 
-    pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> {
+    pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> {
         self.reader.get_metadata().await
     }
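For synchronous readers the renamed entry point is `ParquetReader::get_metadata`, which memoizes exactly like the object-store variant. A hedged usage sketch (assumes a local file and default `polars-io` parquet features; `path` is a placeholder):

```rust
use std::fs::File;
use polars_error::PolarsResult;
use polars_io::prelude::*; // brings in ParquetReader and the SerReader trait

fn row_group_count(path: &str) -> PolarsResult<usize> {
    let file = File::open(path)?;
    let mut reader = ParquetReader::new(file);
    // Parses the footer once and caches it as a FileMetadataRef.
    let md = reader.get_metadata()?;
    Ok(md.row_groups.len())
}
```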
diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs
index a37fc7c42f33..b9012344abb9 100644
--- a/crates/polars-mem-engine/src/executors/scan/parquet.rs
+++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs
@@ -5,7 +5,7 @@ use polars_core::config::{get_file_prefetch_size, verbose};
 use polars_core::utils::accumulate_dataframes_vertical;
 use polars_error::feature_gated;
 use polars_io::cloud::CloudOptions;
-use polars_io::parquet::metadata::FileMetaDataRef;
+use polars_io::parquet::metadata::FileMetadataRef;
 use polars_io::utils::slice::split_slice_at_file;
 use polars_io::RowIndex;
@@ -21,7 +21,7 @@ pub struct ParquetExec {
     cloud_options: Option<CloudOptions>,
     file_options: FileScanOptions,
     #[allow(dead_code)]
-    metadata: Option<FileMetaDataRef>,
+    metadata: Option<FileMetadataRef>,
 }
 
 impl ParquetExec {
@@ -34,7 +34,7 @@ impl ParquetExec {
         options: ParquetOptions,
         cloud_options: Option<CloudOptions>,
         file_options: FileScanOptions,
-        metadata: Option<FileMetaDataRef>,
+        metadata: Option<FileMetadataRef>,
     ) -> Self {
         ParquetExec {
             sources,
diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs
index 8af4fb3f67bb..1f00987fa074 100644
--- a/crates/polars-parquet/src/arrow/read/mod.rs
+++ b/crates/polars-parquet/src/arrow/read/mod.rs
@@ -15,7 +15,7 @@ pub use deserialize::{
 #[cfg(feature = "async")]
 use futures::{AsyncRead, AsyncSeek};
 use polars_error::PolarsResult;
-pub use schema::{infer_schema, FileMetaData};
+pub use schema::{infer_schema, FileMetadata};
 
 use crate::parquet::error::ParquetResult;
 #[cfg(feature = "async")]
@@ -24,7 +24,7 @@ pub use crate::parquet::read::{get_page_stream, read_metadata_async as _read_met
 pub use crate::parquet::{
     error::ParquetError,
     fallible_streaming_iterator,
-    metadata::{ColumnChunkMetadata, ColumnDescriptor, RowGroupMetaData},
+    metadata::{ColumnChunkMetadata, ColumnDescriptor, RowGroupMetadata},
     page::{CompressedDataPage, DataPageHeader, Page},
     read::{
         decompress, get_column_iterator, read_metadata as _read_metadata, BasicDecompressor,
@@ -54,7 +54,7 @@ pub fn get_field_pages<'a, T>(
 }
 
 /// Reads parquets' metadata synchronously.
-pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> PolarsResult<FileMetaData> {
+pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> PolarsResult<FileMetadata> {
     Ok(_read_metadata(reader)?)
 }
 
@@ -62,7 +62,7 @@ pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> PolarsResult<FileMetaDa
 /// Reads parquets' metadata asynchronously.
 #[cfg(feature = "async")]
 pub async fn read_metadata_async<R: AsyncRead + AsyncSeek + Send + Unpin>(
     reader: &mut R,
-) -> PolarsResult<FileMetaData> {
+) -> PolarsResult<FileMetadata> {
     Ok(_read_metadata_async(reader).await?)
 }
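At the `polars-parquet` level the renamed `read_metadata`/`FileMetadata` pair feeds `infer_schema`, which is the path every reader above ultimately takes. A sketch under the assumption that `ArrowSchema` is the map-like type of this version (so `len()` is available):

```rust
use std::fs::File;
use polars_error::PolarsResult;
use polars_parquet::read::{infer_schema, read_metadata};

fn inspect(path: &str) -> PolarsResult<()> {
    let mut file = File::open(path)?;
    let metadata = read_metadata(&mut file)?; // now returns FileMetadata
    let schema = infer_schema(&metadata)?;    // "ARROW:schema" key, else converted types
    println!(
        "{} rows in {} row groups, {} arrow fields",
        metadata.num_rows,
        metadata.row_groups.len(),
        schema.len(),
    );
    Ok(())
}
```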
diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs
index 50d937e7e840..347cd49faefd 100644
--- a/crates/polars-parquet/src/arrow/read/schema/mod.rs
+++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs
@@ -10,7 +10,7 @@ pub use metadata::read_schema_from_metadata;
 use polars_error::PolarsResult;
 
 use self::metadata::parse_key_value_metadata;
-pub use crate::parquet::metadata::{FileMetaData, KeyValue, SchemaDescriptor};
+pub use crate::parquet::metadata::{FileMetadata, KeyValue, SchemaDescriptor};
 pub use crate::parquet::schema::types::ParquetType;
 
 /// Options when inferring schemas from Parquet
@@ -33,7 +33,7 @@ impl Default for SchemaInferenceOptions {
     }
 }
 
-/// Infers a [`ArrowSchema`] from parquet's [`FileMetaData`].
+/// Infers a [`ArrowSchema`] from parquet's [`FileMetadata`].
 ///
 /// This first looks for the metadata key `"ARROW:schema"`; if it does not exist, it converts the
 /// Parquet types declared in the file's Parquet schema to Arrow's equivalent.
@@ -41,13 +41,13 @@ impl Default for SchemaInferenceOptions {
 /// # Error
 /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
 /// indicating that that the file's arrow metadata was incorrectly written.
-pub fn infer_schema(file_metadata: &FileMetaData) -> PolarsResult<ArrowSchema> {
+pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult<ArrowSchema> {
     infer_schema_with_options(file_metadata, &None)
 }
 
 /// Like [`infer_schema`] but with configurable options which affects the behavior of inference
 pub fn infer_schema_with_options(
-    file_metadata: &FileMetaData,
+    file_metadata: &FileMetadata,
     options: &Option<SchemaInferenceOptions>,
 ) -> PolarsResult<ArrowSchema> {
     let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
diff --git a/crates/polars-parquet/src/arrow/write/file.rs b/crates/polars-parquet/src/arrow/write/file.rs
index d4162b8c08d5..0fd32deb5b07 100644
--- a/crates/polars-parquet/src/arrow/write/file.rs
+++ b/crates/polars-parquet/src/arrow/write/file.rs
@@ -4,7 +4,7 @@ use arrow::datatypes::ArrowSchema;
 use polars_error::{PolarsError, PolarsResult};
 
 use super::schema::schema_to_metadata_key;
-use super::{to_parquet_schema, ThriftFileMetaData, WriteOptions};
+use super::{to_parquet_schema, ThriftFileMetadata, WriteOptions};
 use crate::parquet::metadata::{KeyValue, SchemaDescriptor};
 use crate::parquet::write::{RowGroupIterColumns, WriteOptions as FileWriteOptions};
@@ -86,10 +86,10 @@ impl<W: Write> FileWriter<W> {
         self.writer.into_inner()
     }
 
-    /// Returns the underlying writer and [`ThriftFileMetaData`]
+    /// Returns the underlying writer and [`ThriftFileMetadata`]
     /// # Panics
     /// This function panics if [`Self::end`] has not yet been called
-    pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) {
+    pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetadata) {
         self.writer.into_inner_and_metadata()
     }
 }
diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs
index b5f816518401..02f0165d04c7 100644
--- a/crates/polars-parquet/src/arrow/write/mod.rs
+++ b/crates/polars-parquet/src/arrow/write/mod.rs
@@ -38,7 +38,7 @@ pub use utils::write_def_levels;
 pub use crate::parquet::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel};
 pub use crate::parquet::encoding::Encoding;
 pub use crate::parquet::metadata::{
-    Descriptor, FileMetaData, KeyValue, SchemaDescriptor, ThriftFileMetaData,
+    Descriptor, FileMetadata, KeyValue, SchemaDescriptor, ThriftFileMetadata,
 };
 pub use crate::parquet::page::{CompressedDataPage, CompressedPage, Page};
 use crate::parquet::schema::types::PrimitiveType as ParquetPrimitiveType;
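`ThriftFileMetadata` is only a rename of the re-export; the underlying type is still thrift-generated, so its fields keep the casing from the parquet thrift spec. A small illustrative accessor (hedged; the field names come from the thrift definition, not from this diff):

```rust
use polars_parquet::write::ThriftFileMetadata;

// The thrift-level metadata the writer hands back after `end`; useful when
// writing a `_metadata` sidecar for a partitioned dataset.
fn summarize(md: &ThriftFileMetadata) -> String {
    format!(
        "format version {}, {} rows, {} row groups",
        md.version,
        md.num_rows,
        md.row_groups.len(),
    )
}
```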
diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
index a7ffd6f7ba6d..492d283f64ed 100644
--- a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
+++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
@@ -2,7 +2,7 @@ use parquet_format_safe::ColumnOrder as TColumnOrder;
 
 use super::column_order::ColumnOrder;
 use super::schema_descriptor::SchemaDescriptor;
-use super::RowGroupMetaData;
+use super::RowGroupMetadata;
 use crate::parquet::error::ParquetError;
 use crate::parquet::metadata::get_sort_order;
 pub use crate::parquet::thrift_format::KeyValue;
@@ -11,7 +11,7 @@ pub use crate::parquet::thrift_format::KeyValue;
 // This is almost equal to [`parquet_format_safe::FileMetaData`] but contains the descriptors,
 // which are crucial to deserialize pages.
 #[derive(Debug)]
-pub struct FileMetaData {
+pub struct FileMetadata {
     /// version of this file.
     pub version: i32,
     /// number of rows in the file.
@@ -26,7 +26,7 @@ pub struct FileMetaData {
     /// ```
     pub created_by: Option<String>,
     /// The row groups of this file
-    pub row_groups: Vec<RowGroupMetaData>,
+    pub row_groups: Vec<RowGroupMetadata>,
     /// key_value_metadata of this file.
     pub key_value_metadata: Option<Vec<KeyValue>>,
     /// schema descriptor.
@@ -41,7 +41,7 @@ pub struct FileMetaData {
     pub column_orders: Option<Vec<ColumnOrder>>,
 }
 
-impl FileMetaData {
+impl FileMetadata {
     /// Returns the [`SchemaDescriptor`] that describes schema of this file.
     pub fn schema(&self) -> &SchemaDescriptor {
         &self.schema_descr
@@ -61,7 +61,7 @@ impl FileMetaData {
             .unwrap_or(ColumnOrder::Undefined)
     }
 
-    /// Deserializes [`crate::parquet::thrift_format::FileMetaData`] into this struct
+    /// Deserializes [`crate::parquet::thrift_format::FileMetadata`] into this struct
     pub fn try_from_thrift(
         metadata: parquet_format_safe::FileMetaData,
     ) -> Result<Self, ParquetError> {
@@ -70,14 +70,14 @@ impl FileMetaData {
         let row_groups = metadata
             .row_groups
             .into_iter()
-            .map(|rg| RowGroupMetaData::try_from_thrift(&schema_descr, rg))
+            .map(|rg| RowGroupMetadata::try_from_thrift(&schema_descr, rg))
             .collect::<ParquetResult<Vec<_>>>()?;
 
         let column_orders = metadata
             .column_orders
             .map(|orders| parse_column_orders(&orders, &schema_descr));
 
-        Ok(FileMetaData {
+        Ok(FileMetadata {
             version: metadata.version,
             num_rows: metadata.num_rows.try_into()?,
             created_by: metadata.created_by,
diff --git a/crates/polars-parquet/src/parquet/metadata/mod.rs b/crates/polars-parquet/src/parquet/metadata/mod.rs
index c153cd7cf592..b7a80739e719 100644
--- a/crates/polars-parquet/src/parquet/metadata/mod.rs
+++ b/crates/polars-parquet/src/parquet/metadata/mod.rs
@@ -9,9 +9,9 @@ mod sort;
 pub use column_chunk_metadata::ColumnChunkMetadata;
 pub use column_descriptor::{ColumnDescriptor, Descriptor};
 pub use column_order::ColumnOrder;
-pub use file_metadata::{FileMetaData, KeyValue};
-pub use row_metadata::RowGroupMetaData;
+pub use file_metadata::{FileMetadata, KeyValue};
+pub use row_metadata::RowGroupMetadata;
 pub use schema_descriptor::SchemaDescriptor;
 pub use sort::*;
 
-pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetaData;
+pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetadata;
diff --git a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs
index 717bc7e243d8..013308ad7f12 100644
--- a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs
+++ b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs
@@ -35,7 +35,7 @@ impl InitColumnLookup for ColumnLookup {
 
 /// Metadata for a row group.
 #[derive(Debug, Clone, Default)]
-pub struct RowGroupMetaData {
+pub struct RowGroupMetadata {
     columns: Arc<[ColumnChunkMetadata]>,
     column_lookup: PlHashMap<PlSmallStr, UnitVec<usize>>,
     num_rows: usize,
@@ -43,7 +43,7 @@
     full_byte_range: core::ops::Range<u64>,
 }
 
-impl RowGroupMetaData {
+impl RowGroupMetadata {
     #[inline(always)]
     pub fn n_columns(&self) -> usize {
         self.columns.len()
@@ -91,7 +91,7 @@ impl RowGroupMetaData {
     pub(crate) fn try_from_thrift(
         schema_descr: &SchemaDescriptor,
         rg: RowGroup,
-    ) -> ParquetResult<RowGroupMetaData> {
+    ) -> ParquetResult<RowGroupMetadata> {
         if schema_descr.columns().len() != rg.columns.len() {
             return Err(ParquetError::oos(format!("The number of columns in the row group ({}) must be equal to the number of columns in the schema ({})", rg.columns.len(), schema_descr.columns().len())));
         }
@@ -127,7 +127,7 @@ impl RowGroupMetaData {
             })
             .collect::<ParquetResult<Vec<_>>>()?;
 
-        Ok(RowGroupMetaData {
+        Ok(RowGroupMetadata {
             columns,
             column_lookup,
             num_rows,
diff --git a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs
index 54065389328e..56f914ba568e 100644
--- a/crates/polars-parquet/src/parquet/read/column/mod.rs
+++ b/crates/polars-parquet/src/parquet/read/column/mod.rs
@@ -4,7 +4,7 @@ use polars_utils::idx_vec::UnitVec;
 
 use super::{get_page_iterator, MemReader, PageReader};
 use crate::parquet::error::{ParquetError, ParquetResult};
-use crate::parquet::metadata::{ColumnChunkMetadata, RowGroupMetaData};
+use crate::parquet::metadata::{ColumnChunkMetadata, RowGroupMetadata};
 use crate::parquet::page::CompressedPage;
 use crate::parquet::schema::types::ParquetType;
@@ -17,7 +17,7 @@ use crate::parquet::schema::types::ParquetType;
 /// `max_page_size` is the maximum number of bytes allowed.
 pub fn get_column_iterator<'a>(
     reader: MemReader,
-    row_group: &'a RowGroupMetaData,
+    row_group: &'a RowGroupMetadata,
     field_name: &str,
     max_page_size: usize,
 ) -> ColumnIterator<'a> {
diff --git a/crates/polars-parquet/src/parquet/read/metadata.rs b/crates/polars-parquet/src/parquet/read/metadata.rs
index f92794fc2839..e14a2a60e997 100644
--- a/crates/polars-parquet/src/parquet/read/metadata.rs
+++ b/crates/polars-parquet/src/parquet/read/metadata.rs
@@ -2,9 +2,9 @@ use std::cmp::min;
 use std::io::{Read, Seek, SeekFrom};
 
 use parquet_format_safe::thrift::protocol::TCompactInputProtocol;
-use parquet_format_safe::FileMetaData as TFileMetaData;
+use parquet_format_safe::FileMetaData as TFileMetadata;
 
-use super::super::metadata::FileMetaData;
+use super::super::metadata::FileMetadata;
 use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, HEADER_SIZE, PARQUET_MAGIC};
 use crate::parquet::error::{ParquetError, ParquetResult};
@@ -26,18 +26,18 @@ fn stream_len(seek: &mut impl Seek) -> std::result::Result<u64, std::io::Error>
     Ok(len)
 }
 
-/// Reads a [`FileMetaData`] from the reader, located at the end of the file.
-pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> ParquetResult<FileMetaData> {
+/// Reads a [`FileMetadata`] from the reader, located at the end of the file.
+pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> ParquetResult<FileMetadata> {
     // check file is large enough to hold footer
     let file_size = stream_len(reader)?;
     read_metadata_with_size(reader, file_size)
 }
 
-/// Reads a [`FileMetaData`] from the reader, located at the end of the file, with known file size.
+/// Reads a [`FileMetadata`] from the reader, located at the end of the file, with known file size.
 pub fn read_metadata_with_size<R: Read + Seek>(
     reader: &mut R,
     file_size: u64,
-) -> ParquetResult<FileMetaData> {
+) -> ParquetResult<FileMetadata> {
     if file_size < HEADER_SIZE + FOOTER_SIZE {
         return Err(ParquetError::oos(
             "A parquet file must contain a header and footer with at least 12 bytes",
         ));
     }
@@ -92,9 +92,9 @@ pub fn read_metadata_with_size(
 }
 
 /// Parse loaded metadata bytes
-pub fn deserialize_metadata<R: Read>(reader: R, max_size: usize) -> ParquetResult<FileMetaData> {
+pub fn deserialize_metadata<R: Read>(reader: R, max_size: usize) -> ParquetResult<FileMetadata> {
     let mut prot = TCompactInputProtocol::new(reader, max_size);
-    let metadata = TFileMetaData::read_from_in_protocol(&mut prot)?;
+    let metadata = TFileMetadata::read_from_in_protocol(&mut prot)?;
 
-    FileMetaData::try_from_thrift(metadata)
+    FileMetadata::try_from_thrift(metadata)
 }
diff --git a/crates/polars-parquet/src/parquet/read/stream.rs b/crates/polars-parquet/src/parquet/read/stream.rs
index ec8b26c3d31d..c3755106742b 100644
--- a/crates/polars-parquet/src/parquet/read/stream.rs
+++ b/crates/polars-parquet/src/parquet/read/stream.rs
@@ -2,7 +2,7 @@ use std::io::SeekFrom;
 
 use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
 
-use super::super::metadata::FileMetaData;
+use super::super::metadata::FileMetadata;
 use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, PARQUET_MAGIC};
 use super::metadata::{deserialize_metadata, metadata_len};
 use crate::parquet::error::{ParquetError, ParquetResult};
@@ -26,7 +26,7 @@ async fn stream_len(
 /// Asynchronously reads the files' metadata
 pub async fn read_metadata<R: AsyncRead + AsyncSeek + Send + Unpin>(
     reader: &mut R,
-) -> ParquetResult<FileMetaData> {
+) -> ParquetResult<FileMetadata> {
     let file_size = stream_len(reader).await?;
 
     if file_size < HEADER_SIZE + FOOTER_SIZE {
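Both the sync and async readers above share the same footer protocol before `deserialize_metadata` takes over. The arithmetic in isolation: a parquet file ends with a 4-byte little-endian metadata length followed by the `PAR1` magic, which is where the `FOOTER_SIZE` of 8 comes from. A self-contained sketch (not the polars implementation):

```rust
/// Given the last 8 bytes of a parquet file, return the thrift metadata
/// length, or None if the magic bytes are wrong.
fn metadata_len(footer: &[u8; 8]) -> Option<u32> {
    if footer[4..8] != *b"PAR1" {
        return None; // not a parquet footer
    }
    Some(u32::from_le_bytes([footer[0], footer[1], footer[2], footer[3]]))
}
```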
diff --git a/crates/polars-parquet/src/parquet/write/file.rs b/crates/polars-parquet/src/parquet/write/file.rs
index e9a95be68e73..8dd3212bb76a 100644
--- a/crates/polars-parquet/src/parquet/write/file.rs
+++ b/crates/polars-parquet/src/parquet/write/file.rs
@@ -9,7 +9,7 @@ use super::row_group::write_row_group;
 use super::{RowGroupIterColumns, WriteOptions};
 use crate::parquet::error::{ParquetError, ParquetResult};
 pub use crate::parquet::metadata::KeyValue;
-use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetaData};
+use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetadata};
 use crate::parquet::write::State;
 use crate::parquet::{FOOTER_SIZE, PARQUET_MAGIC};
@@ -20,7 +20,7 @@ pub(super) fn start_file<W: Write>(writer: &mut W) -> ParquetResult<u64> {
 
 pub(super) fn end_file<W: Write>(
     mut writer: &mut W,
-    metadata: &ThriftFileMetaData,
+    metadata: &ThriftFileMetadata,
 ) -> ParquetResult<u64> {
     // Write metadata
     let mut protocol = TCompactOutputProtocol::new(&mut writer);
@@ -67,7 +67,7 @@ pub struct FileWriter<W: Write> {
     /// Used to store the current state for writing the file
     state: State,
     // when the file is written, metadata becomes available
-    metadata: Option<ThriftFileMetaData>,
+    metadata: Option<ThriftFileMetadata>,
 }
 
 /// Writes a parquet file containing only the header and footer
 ///
 /// This is used to write the metadata as a separate Parquet file, usually when data
 /// is partitioned across multiple files.
 ///
-/// Note: Recall that when combining row groups from [`ThriftFileMetaData`], the `file_path` on each
+/// Note: Recall that when combining row groups from [`ThriftFileMetadata`], the `file_path` on each
 /// of their column chunks must be updated with their path relative to where they are written to.
 pub fn write_metadata_sidecar<W: Write>(
     writer: &mut W,
-    metadata: &ThriftFileMetaData,
+    metadata: &ThriftFileMetadata,
 ) -> ParquetResult<u64> {
     let mut len = start_file(writer)?;
     len += end_file(writer, metadata)?;
@@ -98,11 +98,11 @@ impl<W: Write> FileWriter<W> {
         &self.schema
     }
 
-    /// Returns the [`ThriftFileMetaData`]. This is Some iff the [`Self::end`] has been called.
+    /// Returns the [`ThriftFileMetadata`]. This is Some iff the [`Self::end`] has been called.
     ///
     /// This is used to write the metadata as a separate Parquet file, usually when data
     /// is partitioned across multiple files
-    pub fn metadata(&self) -> Option<&ThriftFileMetaData> {
+    pub fn metadata(&self) -> Option<&ThriftFileMetadata> {
         self.metadata.as_ref()
     }
 }
@@ -225,7 +225,7 @@ impl<W: Write> FileWriter<W> {
             ParquetResult::Ok(())
         })?;
 
-        let metadata = ThriftFileMetaData::new(
+        let metadata = ThriftFileMetadata::new(
             self.options.version.into(),
             self.schema.clone().into_thrift(),
             num_rows,
@@ -248,10 +248,10 @@ impl<W: Write> FileWriter<W> {
         self.writer
     }
 
-    /// Returns the underlying writer and [`ThriftFileMetaData`]
+    /// Returns the underlying writer and [`ThriftFileMetadata`]
     /// # Panics
     /// This function panics if [`Self::end`] has not yet been called
-    pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) {
+    pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetadata) {
         (self.writer, self.metadata.expect("File to have ended"))
     }
 }
diff --git a/crates/polars-parquet/src/parquet/write/stream.rs b/crates/polars-parquet/src/parquet/write/stream.rs
index eadc4640e856..eca712db65dc 100644
--- a/crates/polars-parquet/src/parquet/write/stream.rs
+++ b/crates/polars-parquet/src/parquet/write/stream.rs
@@ -2,7 +2,7 @@ use std::io::Write;
 
 use futures::{AsyncWrite, AsyncWriteExt};
 use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol;
-use parquet_format_safe::{FileMetaData, RowGroup};
+use parquet_format_safe::RowGroup;
 
 use super::row_group::write_row_group_async;
 use super::{RowGroupIterColumns, WriteOptions};
@@ -20,7 +20,7 @@ async fn start_file<W: AsyncWrite + Unpin + Send>(writer: &mut W) -> ParquetResult<u64>
 
 async fn end_file<W: AsyncWrite + Unpin + Send>(
     mut writer: &mut W,
-    metadata: FileMetaData,
+    metadata: parquet_format_safe::FileMetaData,
 ) -> ParquetResult<u64> {
     // Write file metadata
     let mut protocol = TCompactOutputStreamProtocol::new(&mut writer);
@@ -169,7 +169,7 @@ impl<W: AsyncWrite + Unpin + Send> FileStreamer<W> {
             }
         }
 
-        let metadata = FileMetaData::new(
+        let metadata = parquet_format_safe::FileMetaData::new(
             self.options.version.into(),
             self.schema.clone().into_thrift(),
             num_rows,
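Dropping the `FileMetaData` import in the async writer and spelling out `parquet_format_safe::FileMetaData` is deliberate: after the rename, the crate's own `FileMetadata` and the thrift type would differ only in casing. A toy illustration of the collision being avoided (hypothetical module layout, not code from the PR):

```rust
mod thrift_format {
    pub struct FileMetaData; // external; name fixed by the generated thrift code
}

pub struct FileMetadata; // the crate's own rich metadata type

fn end_file(_metadata: thrift_format::FileMetaData) {
    // Fully qualifying the thrift type at its few use-sites keeps the two
    // near-identical names from being confused in imports.
}
```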
diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs
index 8592021b2ff3..7a0dabeb10df 100644
--- a/crates/polars-pipe/src/executors/sources/parquet.rs
+++ b/crates/polars-pipe/src/executors/sources/parquet.rs
@@ -10,7 +10,7 @@ use polars_core::error::*;
 use polars_core::prelude::Series;
 use polars_core::POOL;
 use polars_io::cloud::CloudOptions;
-use polars_io::parquet::metadata::FileMetaDataRef;
+use polars_io::parquet::metadata::FileMetadataRef;
 use polars_io::parquet::read::{BatchedParquetReader, ParquetOptions, ParquetReader};
 use polars_io::path_utils::is_cloud_url;
 use polars_io::pl_async::get_runtime;
@@ -41,7 +41,7 @@ pub struct ParquetSource {
     file_options: FileScanOptions,
     #[allow(dead_code)]
     cloud_options: Option<CloudOptions>,
-    metadata: Option<FileMetaDataRef>,
+    metadata: Option<FileMetadataRef>,
     file_info: FileInfo,
     hive_parts: Option<Arc<Vec<HivePartitions>>>,
     verbose: bool,
@@ -252,7 +252,7 @@ impl ParquetSource {
         sources: ScanSources,
         options: ParquetOptions,
         cloud_options: Option<CloudOptions>,
-        metadata: Option<FileMetaDataRef>,
+        metadata: Option<FileMetadataRef>,
         file_options: FileScanOptions,
         file_info: FileInfo,
         hive_parts: Option<Arc<Vec<HivePartitions>>>,
diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs
index 25dd61aa1eb9..9fd419f90f63 100644
--- a/crates/polars-plan/src/plans/conversion/scans.rs
+++ b/crates/polars-plan/src/plans/conversion/scans.rs
@@ -32,7 +32,7 @@ pub(super) fn parquet_file_info(
     sources: &ScanSources,
     file_options: &FileScanOptions,
     #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>,
-) -> PolarsResult<(FileInfo, Option<FileMetaDataRef>)> {
+) -> PolarsResult<(FileInfo, Option<FileMetadataRef>)> {
     use polars_core::error::feature_gated;
 
     let (reader_schema, num_rows, metadata) = {
diff --git a/crates/polars-plan/src/plans/file_scan.rs b/crates/polars-plan/src/plans/file_scan.rs
index 73ae85d93646..e868b98d2799 100644
--- a/crates/polars-plan/src/plans/file_scan.rs
+++ b/crates/polars-plan/src/plans/file_scan.rs
@@ -5,7 +5,7 @@ use polars_io::csv::read::CsvReadOptions;
 #[cfg(feature = "ipc")]
 use polars_io::ipc::IpcScanOptions;
 #[cfg(feature = "parquet")]
-use polars_io::parquet::metadata::FileMetaDataRef;
+use polars_io::parquet::metadata::FileMetadataRef;
 #[cfg(feature = "parquet")]
 use polars_io::parquet::read::ParquetOptions;
@@ -24,7 +24,7 @@ pub enum FileScan {
         options: ParquetOptions,
         cloud_options: Option<CloudOptions>,
         #[cfg_attr(feature = "serde", serde(skip))]
-        metadata: Option<FileMetaDataRef>,
+        metadata: Option<FileMetadataRef>,
     },
     #[cfg(feature = "ipc")]
     Ipc {
diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs
index bf5d4262fed6..cff5b9582d3a 100644
--- a/crates/polars-stream/src/nodes/parquet_source.rs
+++ b/crates/polars-stream/src/nodes/parquet_source.rs
@@ -17,13 +17,13 @@ use polars_expr::prelude::PhysicalExpr;
 use polars_io::cloud::CloudOptions;
 use polars_io::predicates::PhysicalIoExpr;
 use polars_io::prelude::_internal::read_this_row_group;
-use polars_io::prelude::{FileMetaData, ParquetOptions};
+use polars_io::prelude::{FileMetadata, ParquetOptions};
 use polars_io::utils::byte_source::{
     ByteSource, DynByteSource, DynByteSourceBuilder, MemSliceByteSource,
 };
 use polars_io::utils::slice::SplitSlicePosition;
 use polars_io::{is_cloud_url, RowIndex};
-use polars_parquet::read::RowGroupMetaData;
+use polars_parquet::read::RowGroupMetadata;
 use polars_plan::plans::hive::HivePartitions;
 use polars_plan::plans::FileInfo;
 use polars_plan::prelude::FileScanOptions;
@@ -540,7 +540,7 @@ impl ParquetSourceNode {
             usize,
             usize,
             Arc<DynByteSource>,
-            FileMetaData,
+            FileMetadata,
             usize,
         )>,
         task_handles_ext::AbortOnDropHandle<PolarsResult<()>>,
@@ -1007,7 +1007,7 @@ struct RowGroupData {
     row_offset: usize,
     slice: Option<(usize, usize)>,
    file_max_row_group_height: usize,
-    row_group_metadata: RowGroupMetaData,
+    row_group_metadata: RowGroupMetadata,
     shared_file_state: Arc<tokio::sync::OnceCell<SharedFileState>>,
 }
 
@@ -1016,7 +1016,7 @@ struct RowGroupDataFetcher {
             usize,
             usize,
             Arc<DynByteSource>,
-            FileMetaData,
+            FileMetadata,
             usize,
         )>,
     use_statistics: bool,
@@ -1028,7 +1028,7 @@ struct RowGroupDataFetcher {
     memory_prefetch_func: fn(&[u8]) -> (),
     current_path_index: usize,
     current_byte_source: Arc<DynByteSource>,
-    current_row_groups: std::vec::IntoIter<RowGroupMetaData>,
+    current_row_groups: std::vec::IntoIter<RowGroupMetadata>,
     current_row_group_idx: usize,
     current_max_row_group_height: usize,
     current_row_offset: usize,
@@ -1731,7 +1731,7 @@ async fn read_parquet_metadata_bytes(
 }
 
 fn get_row_group_byte_ranges(
-    row_group_metadata: &RowGroupMetaData,
+    row_group_metadata: &RowGroupMetadata,
 ) -> impl ExactSizeIterator<Item = std::ops::Range<usize>> + '_ {
     row_group_metadata
         .byte_ranges_iter()
@@ -1739,7 +1739,7 @@ fn get_row_group_byte_ranges(
 }
 
 fn get_row_group_byte_ranges_for_projection<'a>(
-    row_group_metadata: &'a RowGroupMetaData,
+    row_group_metadata: &'a RowGroupMetadata,
     columns: &'a [PlSmallStr],
 ) -> impl Iterator<Item = std::ops::Range<usize>> + 'a {
     columns.iter().flat_map(|col_name| {
@@ -1756,7 +1756,7 @@ fn get_row_group_byte_ranges_for_projection<'a>(
 /// dtype. There are no ordering requirements and extra columns are permitted.
 fn ensure_metadata_has_projected_fields(
     projected_fields: &[polars_core::prelude::ArrowField],
-    metadata: &FileMetaData,
+    metadata: &FileMetadata,
 ) -> PolarsResult<()> {
     let schema = polars_parquet::arrow::read::infer_schema(metadata)?;
diff --git a/crates/polars/tests/it/io/parquet/read/file.rs b/crates/polars/tests/it/io/parquet/read/file.rs
index 5007dcdf0755..d2be2c5402d9 100644
--- a/crates/polars/tests/it/io/parquet/read/file.rs
+++ b/crates/polars/tests/it/io/parquet/read/file.rs
@@ -4,7 +4,7 @@ use arrow::array::Array;
 use arrow::datatypes::ArrowSchema;
 use arrow::record_batch::RecordBatchT;
 use polars_error::PolarsResult;
-use polars_parquet::read::{Filter, RowGroupMetaData};
+use polars_parquet::read::{Filter, RowGroupMetadata};
 
 use super::row_group::{read_columns_many, RowGroupDeserializer};
@@ -25,7 +25,7 @@ impl<R: Read + Seek> FileReader<R> {
     /// Returns a new [`FileReader`].
     pub fn new(
         reader: R,
-        row_groups: Vec<RowGroupMetaData>,
+        row_groups: Vec<RowGroupMetadata>,
         schema: ArrowSchema,
         limit: Option<usize>,
     ) -> Self {
@@ -104,7 +104,7 @@ impl<R: Read + Seek> Iterator for FileReader<R> {
 pub struct RowGroupReader<R: Read + Seek> {
     reader: R,
     schema: ArrowSchema,
-    row_groups: std::vec::IntoIter<RowGroupMetaData>,
+    row_groups: std::vec::IntoIter<RowGroupMetadata>,
     remaining_rows: usize,
 }
 
@@ -113,7 +113,7 @@ impl<R: Read + Seek> RowGroupReader<R> {
     pub fn new(
         reader: R,
         schema: ArrowSchema,
-        row_groups: Vec<RowGroupMetaData>,
+        row_groups: Vec<RowGroupMetadata>,
         limit: Option<usize>,
     ) -> Self {
         Self {
diff --git a/crates/polars/tests/it/io/parquet/read/row_group.rs b/crates/polars/tests/it/io/parquet/read/row_group.rs
index f23ee779b120..6d567a120c92 100644
--- a/crates/polars/tests/it/io/parquet/read/row_group.rs
+++ b/crates/polars/tests/it/io/parquet/read/row_group.rs
@@ -8,7 +8,7 @@ use polars_error::PolarsResult;
 use polars_parquet::arrow::read::{column_iter_to_arrays, Filter};
 use polars_parquet::parquet::metadata::ColumnChunkMetadata;
 use polars_parquet::parquet::read::{BasicDecompressor, PageReader};
-use polars_parquet::read::RowGroupMetaData;
+use polars_parquet::read::RowGroupMetadata;
 use polars_utils::mmap::MemReader;
 
 /// An [`Iterator`] of [`RecordBatchT`] that (dynamically) adapts a vector of iterators of [`Array`] into
@@ -70,7 +70,7 @@ impl Iterator for RowGroupDeserializer {
 /// the field (one for non-nested types)
 pub fn read_columns<'a, R: Read + Seek>(
     reader: &mut R,
-    row_group_metadata: &'a RowGroupMetaData,
+    row_group_metadata: &'a RowGroupMetadata,
     field_name: &'a str,
 ) -> PolarsResult<Vec<(&'a ColumnChunkMetadata, Vec<u8>)>> {
     row_group_metadata
@@ -135,7 +135,7 @@ pub fn to_deserializer(
 /// and convert them to [`ArrayIter`] via [`to_deserializer`].
 pub fn read_columns_many<R: Read + Seek>(
     reader: &mut R,
-    row_group: &RowGroupMetaData,
+    row_group: &RowGroupMetadata,
     fields: &ArrowSchema,
     filter: Option<Filter>,
 ) -> PolarsResult<Vec<Box<dyn Array>>> {
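Taken together, the renames are purely mechanical; none of the types change shape. A downstream crate can migrate with a handful of path updates, or keep the old spellings alive briefly with a compatibility shim like the following (hypothetical; not part of this PR):

```rust
pub mod compat {
    pub use polars_io::parquet::metadata::{FileMetadata, FileMetadataRef};
    pub use polars_parquet::read::RowGroupMetadata;
    pub use polars_parquet::write::ThriftFileMetadata;

    // Old spellings, kept as deprecated aliases during the transition.
    #[deprecated(note = "renamed to FileMetadata")]
    pub type FileMetaData = FileMetadata;
    #[deprecated(note = "renamed to FileMetadataRef")]
    pub type FileMetaDataRef = FileMetadataRef;
    #[deprecated(note = "renamed to RowGroupMetadata")]
    pub type RowGroupMetaData = RowGroupMetadata;
    #[deprecated(note = "renamed to ThriftFileMetadata")]
    pub type ThriftFileMetaData = ThriftFileMetadata;
}
```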