From 4d176e0ba7cedf4395e3ac59668e6d96fc1d7ebb Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Mon, 9 Sep 2024 16:13:04 +0200 Subject: [PATCH 01/28] refactor(rust): Fix unimplemented panics to give todo!s for AUTO_NEW_STREAMING (#18628) --- crates/polars-lazy/src/frame/mod.rs | 4 +++- crates/polars-stream/src/physical_plan/lower_ir.rs | 13 ++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index b2cfd7267025..d6fd3d4364cb 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -733,7 +733,9 @@ impl LazyFrame { // Fallback to normal engine if error is due to not being implemented // and auto_new_streaming is set, otherwise propagate error. if auto_new_streaming - && e.downcast_ref::<&str>() == Some(&"not yet implemented") + && e.downcast_ref::<&str>() + .map(|s| s.starts_with("not yet implemented")) + .unwrap_or(false) { if polars_core::config::verbose() { eprintln!("caught unimplemented error in new streaming engine, falling back to normal engine"); diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 65044a717213..5a1e44694a99 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; use polars_core::schema::Schema; -use polars_error::{polars_err, PolarsResult}; +use polars_error::PolarsResult; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; use polars_plan::plans::{AExpr, IR}; use polars_plan::prelude::SinkType; @@ -345,7 +345,7 @@ pub fn lower_ir( let paths = sources .into_paths() - .ok_or_else(|| polars_err!(nyi = "Streaming scanning of in-memory buffers"))?; + .unwrap_or_else(|| todo!("streaming scanning of in-memory buffers")); PhysNodeKind::FileScan { paths, @@ -358,7 +358,14 @@ pub fn lower_ir( } }, - _ => todo!(), + IR::PythonScan { .. } => todo!(), + IR::Reduce { .. } => todo!(), + IR::Cache { .. } => todo!(), + IR::GroupBy { .. } => todo!(), + IR::Join { .. } => todo!(), + IR::Distinct { .. } => todo!(), + IR::ExtContext { .. 
} => todo!(),
+        IR::Invalid => unreachable!(),
     };
 
     Ok(phys_sm.insert(PhysNode::new(output_schema, node_kind)))

From 72d861eefa9e59fe3d4bb0a52ec8af12e5520dce Mon Sep 17 00:00:00 2001
From: Orson Peters
Date: Mon, 9 Sep 2024 17:37:08 +0200
Subject: [PATCH 02/28] fix: Properly slice validity mask on pl.Object series
 (#18631)

---
 crates/polars-core/src/chunked_array/object/mod.rs | 6 +++++-
 py-polars/tests/unit/datatypes/test_object.py      | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/crates/polars-core/src/chunked_array/object/mod.rs b/crates/polars-core/src/chunked_array/object/mod.rs
index 1b018800dd98..88b3d84e726f 100644
--- a/crates/polars-core/src/chunked_array/object/mod.rs
+++ b/crates/polars-core/src/chunked_array/object/mod.rs
@@ -183,7 +183,11 @@ where
 
     unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
         let len = std::cmp::min(self.len - offset, length);
-
+        self.null_bitmap = self
+            .null_bitmap
+            .take()
+            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
+            .filter(|bitmap| bitmap.unset_bits() > 0);
         self.len = len;
         self.offset = offset;
     }
diff --git a/py-polars/tests/unit/datatypes/test_object.py b/py-polars/tests/unit/datatypes/test_object.py
index 803e7933b8ab..8db373d3f58a 100644
--- a/py-polars/tests/unit/datatypes/test_object.py
+++ b/py-polars/tests/unit/datatypes/test_object.py
@@ -7,6 +7,7 @@
 
 import polars as pl
 from polars.exceptions import ComputeError
+from polars.testing import assert_series_equal
 
 
 def test_series_init_instantiated_object() -> None:
@@ -190,3 +191,11 @@ def test_raise_list_object() -> None:
     # We don't want to support this. Unsafe enough as it is already.
     with pytest.raises(ValueError):
         pl.Series([[object()]], dtype=pl.List(pl.Object()))
+
+
+def test_object_null_slice() -> None:
+    s = pl.Series("x", [1, None, 42], dtype=pl.Object)
+    assert_series_equal(s.is_null(), pl.Series("x", [False, True, False]))
+    assert_series_equal(s.slice(0, 2).is_null(), pl.Series("x", [False, True]))
+    assert_series_equal(s.slice(1, 1).is_null(), pl.Series("x", [True]))
+    assert_series_equal(s.slice(2, 1).is_null(), pl.Series("x", [False]))

From 76a340b0957e181d981d812cb58848a8121a4353 Mon Sep 17 00:00:00 2001
From: Orson Peters
Date: Mon, 9 Sep 2024 19:45:10 +0200
Subject: [PATCH 03/28] fix: Use Buffer in ObjectSeries, fixes variety of
 offset bugs (#18637)

---
 .../src/chunked_array/from_iterator.rs             | 14 +----
 .../src/chunked_array/iterator/mod.rs              |  2 +-
 .../src/chunked_array/object/builder.rs            | 18 ++----
 .../src/chunked_array/object/mod.rs                | 63 +++++++++----------
 .../src/datatypes/static_array_collect.rs          | 13 +---
 5 files changed, 40 insertions(+), 70 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/from_iterator.rs b/crates/polars-core/src/chunked_array/from_iterator.rs
index 766ef94acc8e..72f2bc8c60cb 100644
--- a/crates/polars-core/src/chunked_array/from_iterator.rs
+++ b/crates/polars-core/src/chunked_array/from_iterator.rs
@@ -2,7 +2,7 @@
 use std::borrow::{Borrow, Cow};
 
 #[cfg(feature = "object")]
-use arrow::bitmap::{Bitmap, MutableBitmap};
+use arrow::bitmap::MutableBitmap;
 
 use crate::chunked_array::builder::{get_list_builder, AnonymousOwnedListBuilder};
 #[cfg(feature = "object")]
@@ -268,17 +268,7 @@ impl<T: PolarsObject> FromIterator<Option<T>> for ObjectChunked<T> {
             })
             .collect();
 
-        let null_bit_buffer: Option<Bitmap> = null_mask_builder.into();
-        let null_bitmap = null_bit_buffer;
-
-        let len = values.len();
-
-        let arr = Box::new(ObjectArray {
-            values: Arc::new(values),
-            null_bitmap,
-            offset: 0,
-            len,
-        });
+        let arr = Box::new(ObjectArray::from(values).with_validity(null_mask_builder.into()));
         ChunkedArray::new_with_compute_len(
             Arc::new(Field::new(PlSmallStr::EMPTY, get_object_type::<T>())),
             vec![arr],
diff --git a/crates/polars-core/src/chunked_array/iterator/mod.rs b/crates/polars-core/src/chunked_array/iterator/mod.rs
index 7756153891c6..728ffc5a8cff 100644
--- a/crates/polars-core/src/chunked_array/iterator/mod.rs
+++ b/crates/polars-core/src/chunked_array/iterator/mod.rs
@@ -432,7 +432,7 @@ impl<T: PolarsObject> ObjectChunked<T> {
         // we know that we only iterate over length == self.len()
         unsafe {
             self.downcast_iter()
-                .flat_map(|arr| arr.values().iter())
+                .flat_map(|arr| arr.values_iter())
                 .trust_my_length(self.len())
         }
     }
diff --git a/crates/polars-core/src/chunked_array/object/builder.rs b/crates/polars-core/src/chunked_array/object/builder.rs
index 01524c018ec2..45f63847e97f 100644
--- a/crates/polars-core/src/chunked_array/object/builder.rs
+++ b/crates/polars-core/src/chunked_array/object/builder.rs
@@ -61,10 +61,8 @@ where
             .unwrap_or(0) as IdxSize;
 
         let arr = Box::new(ObjectArray {
-            values: Arc::new(self.values),
-            null_bitmap,
-            offset: 0,
-            len,
+            values: self.values.into(),
+            validity: null_bitmap,
         });
 
         self.field.dtype = get_object_type::<T>();
@@ -140,10 +138,8 @@ where
     let field = Arc::new(Field::new(name, DataType::Object(T::type_name(), None)));
     let len = v.len();
     let arr = Box::new(ObjectArray {
-        values: Arc::new(v),
-        null_bitmap: None,
-        offset: 0,
-        len,
+        values: v.into(),
+        validity: None,
     });
 
     unsafe { ObjectChunked::new_with_dims(field, vec![arr], len as IdxSize, 0) }
@@ -154,10 +150,8 @@ where
     let len = v.len();
     let null_count = validity.unset_bits();
     let arr = Box::new(ObjectArray {
-        values: Arc::new(v),
-        null_bitmap: Some(validity),
-        offset: 0,
-        len,
+        values: v.into(),
+        validity: Some(validity),
     });
 
     unsafe {
diff --git a/crates/polars-core/src/chunked_array/object/mod.rs b/crates/polars-core/src/chunked_array/object/mod.rs
index 88b3d84e726f..a7e3d2f9952d 100644
--- a/crates/polars-core/src/chunked_array/object/mod.rs
+++ b/crates/polars-core/src/chunked_array/object/mod.rs
@@ -4,6 +4,7 @@ use std::hash::Hash;
 
 use arrow::bitmap::utils::{BitmapIter, ZipValidity};
 use arrow::bitmap::{Bitmap, MutableBitmap};
+use arrow::buffer::Buffer;
 use polars_utils::total_ord::TotalHash;
 
 use crate::prelude::*;
@@ -22,10 +23,8 @@ pub struct ObjectArray<T>
 where
     T: PolarsObject,
 {
-    pub(crate) values: Arc<Vec<T>>,
-    pub(crate) null_bitmap: Option<Bitmap>,
-    pub(crate) offset: usize,
-    pub(crate) len: usize,
+    values: Buffer<T>,
+    validity: Option<Bitmap>,
 }
 
 /// Trimmed down object safe polars object
@@ -80,23 +79,18 @@ impl<T> ObjectArray<T>
 where
     T: PolarsObject,
 {
-    /// Get a reference to the underlying data
-    pub fn values(&self) -> &Arc<Vec<T>> {
-        &self.values
-    }
-
     pub fn values_iter(&self) -> ObjectValueIter<'_, T> {
         self.values.iter()
     }
 
     /// Returns an iterator of `Option<&T>` over every element of this array.
     pub fn iter(&self) -> ZipValidity<&T, ObjectValueIter<'_, T>, BitmapIter> {
-        ZipValidity::new_with_validity(self.values_iter(), self.null_bitmap.as_ref())
+        ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
     }
 
     /// Get a value at a certain index location
     pub fn value(&self, index: usize) -> &T {
-        &self.values[self.offset + index]
+        &self.values[index]
     }
 
     pub fn get(&self, index: usize) -> Option<&T> {
@@ -123,7 +117,7 @@ where
     /// No bounds checks
     #[inline]
     pub unsafe fn is_valid_unchecked(&self, i: usize) -> bool {
-        if let Some(b) = &self.null_bitmap {
+        if let Some(b) = &self.validity {
             b.get_bit_unchecked(i)
         } else {
             true
@@ -157,7 +151,7 @@ where
         if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) {
             panic!("validity must be equal to the array's length")
         }
-        self.null_bitmap = validity;
+        self.validity = validity;
     }
 }
 
@@ -182,14 +176,12 @@ where
     }
 
     unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
-        let len = std::cmp::min(self.len - offset, length);
-        self.null_bitmap = self
-            .null_bitmap
+        self.validity = self
+            .validity
             .take()
             .map(|bitmap| bitmap.sliced_unchecked(offset, length))
            .filter(|bitmap| bitmap.unset_bits() > 0);
-        self.len = len;
-        self.offset = offset;
+        self.values.slice_unchecked(offset, length);
     }
 
     fn split_at_boxed(&self, offset: usize) -> (Box<dyn Array>, Box<dyn Array>) {
@@ -203,11 +195,11 @@ where
     }
 
     fn len(&self) -> usize {
-        self.len
+        self.values.len()
     }
 
     fn validity(&self) -> Option<&Bitmap> {
-        self.null_bitmap.as_ref()
+        self.validity.as_ref()
     }
 
     fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
@@ -223,7 +215,7 @@ where
     }
 
     fn null_count(&self) -> usize {
-        match &self.null_bitmap {
+        match &self.validity {
             None => 0,
             Some(validity) => validity.unset_bits(),
         }
@@ -236,18 +228,16 @@ impl<T: PolarsObject> Splitable for ObjectArray<T> {
     }
 
     unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
+        let (left_values, right_values) = unsafe { self.values.split_at_unchecked(offset) };
+        let (left_validity, right_validity) = unsafe { self.validity.split_at_unchecked(offset) };
         (
             Self {
-                values: self.values.clone(),
-                null_bitmap: self.null_bitmap.clone(),
-                len: offset,
-                offset: self.offset,
+                values: left_values,
+                validity: left_validity,
             },
             Self {
-                values: self.values.clone(),
-                null_bitmap: self.null_bitmap.clone(),
-                len: self.len() - offset,
-                offset: self.offset + offset,
+                values: right_values,
+                validity: right_validity,
             },
         )
     }
@@ -277,10 +267,8 @@ impl<T: PolarsObject> StaticArray for ObjectArray<T> {
 
     fn full_null(length: usize, _dtype: ArrowDataType) -> Self {
         ObjectArray {
-            values: Arc::new(vec![T::default(); length]),
-            null_bitmap: Some(Bitmap::new_with_value(false, length)),
-            offset: 0,
-            len: length,
+            values: vec![T::default(); length].into(),
+            validity: Some(Bitmap::new_with_value(false, length)),
         }
     }
 }
@@ -328,3 +316,12 @@ where
         }
     }
 }
+
+impl<T: PolarsObject> From<Vec<T>> for ObjectArray<T> {
+    fn from(values: Vec<T>) -> Self {
+        Self {
+            values: values.into(),
+            validity: None,
+        }
+    }
+}
diff --git a/crates/polars-core/src/datatypes/static_array_collect.rs b/crates/polars-core/src/datatypes/static_array_collect.rs
index 02974d7b33a8..5974b3049a0d 100644
--- a/crates/polars-core/src/datatypes/static_array_collect.rs
+++ b/crates/polars-core/src/datatypes/static_array_collect.rs
@@ -1,7 +1,4 @@
-use std::sync::Arc;
-
 use arrow::array::ArrayFromIter;
-use arrow::bitmap::Bitmap;
 
 use crate::chunked_array::object::{ObjectArray, PolarsObject};
 
@@ -41,14 +38,6 @@ impl<'a, T: PolarsObject> ArrayFromIter<Option<&'a T>> for ObjectArray<T> {
             })
             .collect::<Result<Vec<T>, E>>()?;
 
-        let null_bit_buffer: Option<Bitmap> = null_mask_builder.into();
-        let null_bitmap = null_bit_buffer;
-        let len = values.len();
-        Ok(ObjectArray {
-            values: Arc::new(values),
-            null_bitmap,
-            offset: 0,
-            len,
-        })
+        Ok(ObjectArray::from(values).with_validity(null_mask_builder.into()))
     }
 }

From 45c8e964bbed25e627e1c6365d24df4f84e78225 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Tue, 10 Sep 2024 07:57:06 +0200
Subject: [PATCH 04/28] refactor: Change join_where semantics (#18640)

---
 crates/polars-plan/src/dsl/expr.rs                 |  23 +++
 .../src/plans/conversion/dsl_to_ir.rs              |   4 +-
 .../polars-plan/src/plans/conversion/join.rs       | 190 ++++++++++++++----
 py-polars/polars/dataframe/frame.py                |  13 +-
 py-polars/polars/lazyframe/frame.py                |  15 +-
 .../unit/operations/test_inequality_join.py        |  25 ++-
 6 files changed, 200 insertions(+), 70 deletions(-)

diff --git a/crates/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs
index ced2de5e7eb5..a8c48cd17fb8 100644
--- a/crates/polars-plan/src/dsl/expr.rs
+++ b/crates/polars-plan/src/dsl/expr.rs
@@ -393,6 +393,29 @@ impl Operator {
         )
     }
 
+    pub fn swap_operands(self) -> Self {
+        match self {
+            Operator::Eq => Operator::Eq,
+            Operator::Gt => Operator::Lt,
+            Operator::GtEq => Operator::LtEq,
+            Operator::LtEq => Operator::GtEq,
+            Operator::Or => Operator::Or,
+            Operator::LogicalAnd => Operator::LogicalAnd,
+            Operator::LogicalOr => Operator::LogicalOr,
+            Operator::Xor => Operator::Xor,
+            Operator::NotEq => Operator::NotEq,
+            Operator::EqValidity => Operator::EqValidity,
+            Operator::NotEqValidity => Operator::NotEqValidity,
+            Operator::Divide => Operator::Multiply,
+            Operator::Multiply => Operator::Divide,
+            Operator::And => Operator::And,
+            Operator::Plus => Operator::Minus,
+            Operator::Minus => Operator::Plus,
+            Operator::Lt => Operator::Gt,
+            _ => unimplemented!(),
+        }
+    }
+
     pub fn is_arithmetic(&self) -> bool {
         !(self.is_comparison())
     }
diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
index 5ab23be19a14..658dc9989eb5 100644
--- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
+++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
@@ -558,8 +558,8 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult<Node>
             options,
         } => {
             return join::resolve_join(
-                input_left,
-                input_right,
+                Either::Left(input_left),
+                Either::Left(input_right),
                 left_on,
                 right_on,
                 predicates,
diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs
index 6c3e28bb6c7a..36701f9ab5a7 100644
--- a/crates/polars-plan/src/plans/conversion/join.rs
+++ b/crates/polars-plan/src/plans/conversion/join.rs
@@ -1,4 +1,5 @@
 use arrow::legacy::error::PolarsResult;
+use either::Either;
 
 use super::*;
 use crate::dsl::Expr;
@@ -16,8 +17,8 @@ fn check_join_keys(keys: &[Expr]) -> PolarsResult<()> {
     Ok(())
 }
 pub fn resolve_join(
-    input_left: Arc<DslPlan>,
-    input_right: Arc<DslPlan>,
+    input_left: Either<Arc<DslPlan>, Node>,
+    input_right: Either<Arc<DslPlan>, Node>,
     left_on: Vec<Expr>,
     right_on: Vec<Expr>,
     predicates: Vec<Expr>,
@@ -26,7 +27,13 @@ pub fn resolve_join(
 ) -> PolarsResult<Node> {
     if !predicates.is_empty() {
         debug_assert!(left_on.is_empty() && right_on.is_empty());
-        return resolve_join_where(input_left, input_right, predicates, options, ctxt);
+        return resolve_join_where(
+            input_left.unwrap_left(),
+            input_right.unwrap_left(),
+            predicates,
+            options,
+            ctxt,
+        );
     }
 
     let owned = Arc::unwrap_or_clone;
@@ -62,10 +69,12 @@ pub fn resolve_join(
         );
     }
 
-    let input_left =
-        to_alp_impl(owned(input_left), ctxt).map_err(|e|
e.context(failed_input!(join left)))?; - let input_right = - to_alp_impl(owned(input_right), ctxt).map_err(|e| e.context(failed_input!(join, right)))?; + let input_left = input_left.map_right(Ok).right_or_else(|input| { + to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(join left))) + })?; + let input_right = input_right.map_right(Ok).right_or_else(|input| { + to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(join right))) + })?; let schema_left = ctxt.lp_arena.get(input_left).schema(ctxt.lp_arena); let schema_right = ctxt.lp_arena.get(input_right).schema(ctxt.lp_arena); @@ -129,7 +138,6 @@ fn resolve_join_where( ctxt: &mut DslConversionContext, ) -> PolarsResult { check_join_keys(&predicates)?; - for e in &predicates { let no_binary_comparisons = e .into_iter() @@ -138,15 +146,40 @@ fn resolve_join_where( _ => false, }) .count(); - polars_ensure!(no_binary_comparisons == 1, InvalidOperation: "only 1 binary comparison allowed as join condition") + polars_ensure!(no_binary_comparisons == 1, InvalidOperation: "only 1 binary comparison allowed as join condition"); } + let input_left = to_alp_impl(Arc::unwrap_or_clone(input_left), ctxt) + .map_err(|e| e.context(failed_input!(join left)))?; + let input_right = to_alp_impl(Arc::unwrap_or_clone(input_right), ctxt) + .map_err(|e| e.context(failed_input!(join left)))?; + + let schema_left = ctxt.lp_arena.get(input_left).schema(ctxt.lp_arena); + let schema_right = ctxt + .lp_arena + .get(input_right) + .schema(ctxt.lp_arena) + .into_owned(); let owned = |e: Arc| (*e).clone(); - // Partition to: + // We do a few things + // First we partition to: // - IEjoin supported inequality predicates // - equality predicates // - remaining predicates + // And then decide to which join we dispatch. + // The remaining predicates will be applied as filter. + + // What make things a bit complicated is that duplicate join names + // are referred to in the query with the name post-join, but on joins + // we refer to the names pre-join (e.g. without suffix). So there is some + // bookkeeping. + // + // - First we determine which side of the binary expression refers to the left and right table + // and make sure that lhs of the binary expr, maps to the lhs of the join tables and vice versa. + // Next we ensure the suffixes are removed when we partition. + // + // If a predicate has to be applied as post-join filter, we put the suffixes back if needed. let mut ie_left_on = vec![]; let mut ie_right_on = vec![]; let mut ie_op = vec![]; @@ -166,37 +199,110 @@ fn resolve_join_where( } } + fn rename_expr(e: Expr, old: &str, new: &str) -> Expr { + e.map_expr(|e| match e { + Expr::Column(name) if name.as_str() == old => Expr::Column(new.into()), + e => e, + }) + } + + fn determine_order_and_pre_join_names( + left: Expr, + op: Operator, + right: Expr, + schema_left: &Schema, + schema_right: &Schema, + suffix: &str, + ) -> PolarsResult<(Expr, Operator, Expr)> { + let left_names = expr_to_leaf_column_names_iter(&left).collect::>(); + let right_names = expr_to_leaf_column_names_iter(&right).collect::>(); + + // All left should be in the left schema. 
+ let (left_names, right_names, left, op, mut right) = + if !left_names.iter().all(|n| schema_left.contains(n)) { + // If all right names are in left schema -> swap + if right_names.iter().all(|n| schema_left.contains(n)) { + (right_names, left_names, right, op.swap_operands(), left) + } else { + polars_bail!(InvalidOperation: "got ambiguous column names in 'join_where'") + } + } else { + (left_names, right_names, left, op, right) + }; + for name in &left_names { + polars_ensure!(!right_names.contains(name.as_str()), InvalidOperation: "got ambiguous column names in 'join_where'\n\n\ + Note that you should refer to the column names as they are post-join operation.") + } + + // Now we know left belongs to the left schema, rhs suffixes are dealt with. + for post_join_name in right_names { + if let Some(pre_join_name) = post_join_name.strip_suffix(suffix) { + // Name is both sides, so a suffix will be added by the join. + // We rename + if schema_right.contains(pre_join_name) && schema_left.contains(pre_join_name) { + right = rename_expr(right, &post_join_name, pre_join_name); + } + } + } + Ok((left, op, right)) + } + + // Make it a binary comparison and ensure the columns refer to post join names. + fn to_binary_post_join( + l: Expr, + op: Operator, + mut r: Expr, + schema_right: &Schema, + suffix: &str, + ) -> Expr { + let names = expr_to_leaf_column_names_iter(&r).collect::>(); + for pre_join_name in &names { + if !schema_right.contains(pre_join_name) { + let post_join_name = _join_suffix_name(pre_join_name, suffix); + r = rename_expr(r, pre_join_name, post_join_name.as_str()); + } + } + + Expr::BinaryExpr { + left: Arc::from(l), + op, + right: Arc::from(r), + } + } + + let suffix = options.args.suffix().clone(); for pred in predicates.into_iter() { let Expr::BinaryExpr { left, op, right } = pred.clone() else { polars_bail!(InvalidOperation: "can only join on binary expressions") }; polars_ensure!(op.is_comparison(), InvalidOperation: "expected comparison in join predicate"); + let (left, op, right) = determine_order_and_pre_join_names( + owned(left), + op, + owned(right), + &schema_left, + &schema_right, + &suffix, + )?; if let Some(ie_op_) = to_inequality_operator(&op) { // We already have an IEjoin or an Inner join, push to remaining if ie_op.len() >= 2 || !eq_right_on.is_empty() { - remaining_preds.push(Expr::BinaryExpr { left, op, right }) + remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix)) } else { - ie_left_on.push(owned(left)); - ie_right_on.push(owned(right)); + ie_left_on.push(left); + ie_right_on.push(right); ie_op.push(ie_op_) } } else if matches!(op, Operator::Eq) { - eq_left_on.push(owned(left)); - eq_right_on.push(owned(right)); + eq_left_on.push(left); + eq_right_on.push(right); } else { - remaining_preds.push(pred); + remaining_preds.push(to_binary_post_join(left, op, right, &schema_right, &suffix)); } } // Now choose a primary join and do the remaining predicates as filters - fn to_binary(l: Expr, op: Operator, r: Expr) -> Expr { - Expr::BinaryExpr { - left: Arc::from(l), - op, - right: Arc::from(r), - } - } // Add the ie predicates to the remaining predicates buffer so that they will be executed in the // filter node. 
fn ie_predicates_to_remaining(
         remaining_preds: &mut Vec<Expr>,
         ie_left_on: Vec<Expr>,
         ie_right_on: Vec<Expr>,
         ie_op: Vec<InequalityOperator>,
+        schema_right: &Schema,
+        suffix: &str,
     ) {
         for ((l, op), r) in ie_left_on
             .into_iter()
             .zip(ie_op.into_iter())
             .zip(ie_right_on.into_iter())
         {
-            remaining_preds.push(to_binary(l, op.into(), r))
+            remaining_preds.push(to_binary_post_join(l, op.into(), r, schema_right, suffix))
         }
     }
 
@@ -218,8 +326,8 @@ fn resolve_join_where(
         // We found one or more equality predicates. Go into a default equi join
        // as those are cheapest on avg.
         let join_node = resolve_join(
-            input_left,
-            input_right,
+            Either::Right(input_left),
+            Either::Right(input_right),
             eq_left_on,
             eq_right_on,
             vec![],
@@ -227,7 +335,14 @@ fn resolve_join_where(
             ctxt,
         )?;
 
-        ie_predicates_to_remaining(&mut remaining_preds, ie_left_on, ie_right_on, ie_op);
+        ie_predicates_to_remaining(
+            &mut remaining_preds,
+            ie_left_on,
+            ie_right_on,
+            ie_op,
+            &schema_right,
+            &suffix,
+        );
         join_node
     }
     // TODO! once we support single IEjoin predicates, we must add a branch for the single ie_pred case.
@@ -240,8 +355,8 @@ fn resolve_join_where(
         });
 
         let join_node = resolve_join(
-            input_left,
-            input_right,
+            Either::Right(input_left),
+            Either::Right(input_right),
             ie_left_on[..2].to_vec(),
             ie_right_on[..2].to_vec(),
             vec![],
@@ -258,7 +373,7 @@ fn resolve_join_where(
             let r = ie_left_on.pop().unwrap();
             let op = ie_op.pop().unwrap();
 
-            remaining_preds.push(to_binary(l, op.into(), r))
+            remaining_preds.push(to_binary_post_join(l, op.into(), r, &schema_right, &suffix))
         }
         join_node
     } else {
@@ -268,8 +383,8 @@ fn resolve_join_where(
         opts.args.how = JoinType::Cross;
 
         let join_node = resolve_join(
-            input_left,
-            input_right,
+            Either::Right(input_left),
+            Either::Right(input_right),
             vec![],
             vec![],
             vec![],
@@ -277,7 +392,14 @@ fn resolve_join_where(
             ctxt,
         )?;
         // TODO: This can be removed once we support the single IEjoin.
-        ie_predicates_to_remaining(&mut remaining_preds, ie_left_on, ie_right_on, ie_op);
+        ie_predicates_to_remaining(
+            &mut remaining_preds,
+            ie_left_on,
+            ie_right_on,
+            ie_op,
+            &schema_right,
+            &suffix,
+        );
         join_node
     };
 
@@ -301,8 +423,6 @@ fn resolve_join_where(
         .schema(ctxt.lp_arena)
         .into_owned();
 
-    let suffix = options.args.suffix();
-
     let mut last_node = join_node;
 
     // Ensure that the predicates use the proper suffix
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 8a023b4e01ae..2d8710b9f431 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -7118,20 +7118,11 @@ def join_where(
             DataFrame to join with.
         *predicates
             (In)Equality condition to join the two tables on.
-            The left `pl.col(..)` will refer to the left table
-            and the right `pl.col(..)`
-            to the right table.
-            For example: `pl.col("time") >= pl.col("duration")`
+            When a column name occurs in both tables, the proper suffix must
+            be applied in the predicate.
         suffix
             Suffix to append to columns with a duplicate name.
 
-        Notes
-        -----
-        This method is strict about its equality expressions.
-        Only 1 equality expression is allowed per predicate, where
-        the lhs `pl.col` refers to the left table in the join, and the
-        rhs `pl.col` refers to the right table.
- Examples -------- >>> east = pl.DataFrame( diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 44978528e272..ec329898441a 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4581,23 +4581,14 @@ def join_where( Parameters ---------- other - LazyFrame to join with. + DataFrame to join with. *predicates (In)Equality condition to join the two table on. - The left `pl.col(..)` will refer to the left table - and the right `pl.col(..)` - to the right table. - For example: `pl.col("time") >= pl.col("duration")` + When a column name occurs in both tables, the proper suffix must + be applied in the predicate. suffix Suffix to append to columns with a duplicate name. - Notes - ----- - This method is strict about its equality expressions. - Only 1 equality expression is allowed per predicate, where - the lhs `pl.col` refers to the left table in the join, and the - rhs `pl.col` refers to the right table. - Examples -------- >>> east = pl.LazyFrame( diff --git a/py-polars/tests/unit/operations/test_inequality_join.py b/py-polars/tests/unit/operations/test_inequality_join.py index 7b3ddb279fec..242a0e1f5e8e 100644 --- a/py-polars/tests/unit/operations/test_inequality_join.py +++ b/py-polars/tests/unit/operations/test_inequality_join.py @@ -15,7 +15,14 @@ from hypothesis.strategies import DrawFn, SearchStrategy -def test_self_join() -> None: +@pytest.mark.parametrize( + ("pred_1", "pred_2"), + [ + (pl.col("time") > pl.col("time_right"), pl.col("cost") < pl.col("cost_right")), + (pl.col("time_right") < pl.col("time"), pl.col("cost_right") > pl.col("cost")), + ], +) +def test_self_join(pred_1: pl.Expr, pred_2: pl.Expr) -> None: west = pl.DataFrame( { "t_id": [404, 498, 676, 742], @@ -25,9 +32,7 @@ def test_self_join() -> None: } ) - actual = west.join_where( - west, pl.col("time") > pl.col("time"), pl.col("cost") < pl.col("cost") - ) + actual = west.join_where(west, pred_1, pred_2) expected = pl.DataFrame( { @@ -223,7 +228,7 @@ def test_join_where_predicates() -> None: right.lazy(), pl.col("time") >= pl.col("start_time"), pl.col("time") < pl.col("end_time"), - pl.col("group") == pl.col("group"), + pl.col("group_right") == pl.col("group"), ) .select("id", "id_right", "group") .sort("id") @@ -252,7 +257,7 @@ def test_join_where_predicates() -> None: right.lazy(), pl.col("time") >= pl.col("start_time"), pl.col("time") < pl.col("end_time"), - pl.col("group") != pl.col("group"), + pl.col("group") != pl.col("group_right"), ) .select("id", "id_right", "group") .sort("id") @@ -279,7 +284,7 @@ def test_join_where_predicates() -> None: left.lazy() .join_where( right.lazy(), - pl.col("group") != pl.col("group"), + pl.col("group") != pl.col("group_right"), ) .select("id", "group", "group_right") .sort("id") @@ -443,10 +448,10 @@ def test_ie_join_with_floats( assert_frame_equal(actual, expected, check_row_order=False, check_exact=True) -def test_raise_on_suffixed_predicate_18604() -> None: +def test_raise_on_ambiguous_name() -> None: df = pl.DataFrame({"id": [1, 2]}) - with pytest.raises(pl.exceptions.ColumnNotFoundError): - df.join_where(df, pl.col("id") >= pl.col("id_right")) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.join_where(df, pl.col("id") >= pl.col("id")) def test_raise_on_multiple_binary_comparisons() -> None: From 8ebd739b4c8c4d423c332029b39d957d45c6118a Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 10 Sep 2024 17:20:57 +1000 Subject: [PATCH 05/28] refactor(rust): Rename `MetaData` -> 
`Metadata` (#18644) --- crates/polars-io/src/parquet/metadata.rs | 4 ++-- .../polars-io/src/parquet/read/async_impl.rs | 22 +++++++++---------- .../polars-io/src/parquet/read/predicates.rs | 6 ++--- .../polars-io/src/parquet/read/read_impl.rs | 20 ++++++++--------- crates/polars-io/src/parquet/read/reader.rs | 10 ++++----- .../src/executors/scan/parquet.rs | 6 ++--- crates/polars-parquet/src/arrow/read/mod.rs | 8 +++---- .../src/arrow/read/schema/mod.rs | 8 +++---- crates/polars-parquet/src/arrow/write/file.rs | 6 ++--- crates/polars-parquet/src/arrow/write/mod.rs | 2 +- .../src/parquet/metadata/file_metadata.rs | 14 ++++++------ .../src/parquet/metadata/mod.rs | 6 ++--- .../src/parquet/metadata/row_metadata.rs | 8 +++---- .../src/parquet/read/column/mod.rs | 4 ++-- .../src/parquet/read/metadata.rs | 18 +++++++-------- .../polars-parquet/src/parquet/read/stream.rs | 4 ++-- .../polars-parquet/src/parquet/write/file.rs | 20 ++++++++--------- .../src/parquet/write/stream.rs | 6 ++--- .../src/executors/sources/parquet.rs | 6 ++--- .../polars-plan/src/plans/conversion/scans.rs | 2 +- crates/polars-plan/src/plans/file_scan.rs | 4 ++-- .../polars-stream/src/nodes/parquet_source.rs | 18 +++++++-------- .../polars/tests/it/io/parquet/read/file.rs | 8 +++---- .../tests/it/io/parquet/read/row_group.rs | 6 ++--- 24 files changed, 108 insertions(+), 108 deletions(-) diff --git a/crates/polars-io/src/parquet/metadata.rs b/crates/polars-io/src/parquet/metadata.rs index bc032651b837..ad62aecf36d3 100644 --- a/crates/polars-io/src/parquet/metadata.rs +++ b/crates/polars-io/src/parquet/metadata.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -pub use polars_parquet::parquet::metadata::FileMetaData; +pub use polars_parquet::parquet::metadata::FileMetadata; pub use polars_parquet::read::statistics::{deserialize, Statistics as ParquetStatistics}; -pub type FileMetaDataRef = Arc; +pub type FileMetadataRef = Arc; diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index 562156405b95..0c1ead03b85b 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -6,8 +6,8 @@ use bytes::Bytes; use object_store::path::Path as ObjectPath; use polars_core::config::{get_rg_prefetch_size, verbose}; use polars_core::prelude::*; -use polars_parquet::read::RowGroupMetaData; -use polars_parquet::write::FileMetaData; +use polars_parquet::read::RowGroupMetadata; +use polars_parquet::write::FileMetadata; use polars_utils::pl_str::PlSmallStr; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::sync::Mutex; @@ -17,7 +17,7 @@ use super::predicates::read_this_row_group; use crate::cloud::{ build_object_store, object_path_from_str, CloudLocation, CloudOptions, PolarsObjectStore, }; -use crate::parquet::metadata::FileMetaDataRef; +use crate::parquet::metadata::FileMetadataRef; use crate::pl_async::get_runtime; use crate::predicates::PhysicalIoExpr; @@ -29,14 +29,14 @@ pub struct ParquetObjectStore { store: PolarsObjectStore, path: ObjectPath, length: Option, - metadata: Option, + metadata: Option, } impl ParquetObjectStore { pub async fn from_uri( uri: &str, options: Option<&CloudOptions>, - metadata: Option, + metadata: Option, ) -> PolarsResult { let (CloudLocation { prefix, .. }, store) = build_object_store(uri, options, false).await?; let path = object_path_from_str(&prefix)?; @@ -74,13 +74,13 @@ impl ParquetObjectStore { } /// Fetch the metadata of the parquet file, do not memoize it. 
- async fn fetch_metadata(&mut self) -> PolarsResult { + async fn fetch_metadata(&mut self) -> PolarsResult { let length = self.length().await?; fetch_metadata(&self.store, &self.path, length).await } /// Fetch and memoize the metadata of the parquet file. - pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> { + pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> { if self.metadata.is_none() { self.metadata = Some(Arc::new(self.fetch_metadata().await?)); } @@ -107,7 +107,7 @@ pub async fn fetch_metadata( store: &PolarsObjectStore, path: &ObjectPath, file_byte_length: usize, -) -> PolarsResult { +) -> PolarsResult { let footer_header_bytes = store .get_range( path, @@ -165,7 +165,7 @@ pub async fn fetch_metadata( /// We concurrently download the columns for each field. async fn download_projection( fields: Arc<[PlSmallStr]>, - row_group: RowGroupMetaData, + row_group: RowGroupMetadata, async_reader: Arc, sender: QueueSend, rg_index: usize, @@ -205,7 +205,7 @@ async fn download_projection( } async fn download_row_group( - rg: RowGroupMetaData, + rg: RowGroupMetadata, async_reader: Arc, sender: QueueSend, rg_index: usize, @@ -255,7 +255,7 @@ impl FetchRowGroupsFromObjectStore { projection: Option<&[usize]>, predicate: Option>, row_group_range: Range, - row_groups: &[RowGroupMetaData], + row_groups: &[RowGroupMetadata], ) -> PolarsResult { let projected_fields: Option> = projection.map(|projection| { projection diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs index fa1655be4846..87615de1b8c2 100644 --- a/crates/polars-io/src/parquet/read/predicates.rs +++ b/crates/polars-io/src/parquet/read/predicates.rs @@ -1,6 +1,6 @@ use polars_core::prelude::*; use polars_parquet::read::statistics::{deserialize, Statistics}; -use polars_parquet::read::RowGroupMetaData; +use polars_parquet::read::RowGroupMetadata; use crate::predicates::{BatchStats, ColumnStats, PhysicalIoExpr}; @@ -17,7 +17,7 @@ impl ColumnStats { /// Collect the statistics in a row-group pub(crate) fn collect_statistics( - md: &RowGroupMetaData, + md: &RowGroupMetadata, schema: &ArrowSchema, ) -> PolarsResult> { // TODO! fix this performance. This is a full sequential scan. 
@@ -47,7 +47,7 @@ pub(crate) fn collect_statistics( pub fn read_this_row_group( predicate: Option<&dyn PhysicalIoExpr>, - md: &RowGroupMetaData, + md: &RowGroupMetadata, schema: &ArrowSchema, ) -> PolarsResult { if let Some(pred) = predicate { diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index e43c34ca2d70..c621b698cebc 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -12,7 +12,7 @@ use polars_core::POOL; use polars_parquet::parquet::error::ParquetResult; use polars_parquet::parquet::statistics::Statistics; use polars_parquet::read::{ - self, ColumnChunkMetadata, FileMetaData, Filter, PhysicalType, RowGroupMetaData, + self, ColumnChunkMetadata, FileMetadata, Filter, PhysicalType, RowGroupMetadata, }; use polars_utils::mmap::MemSlice; use rayon::prelude::*; @@ -26,7 +26,7 @@ use super::utils::materialize_empty_df; use super::{mmap, ParallelStrategy}; use crate::hive::materialize_hive_partitions; use crate::mmap::{MmapBytesReader, ReaderBytes}; -use crate::parquet::metadata::FileMetaDataRef; +use crate::parquet::metadata::FileMetadataRef; use crate::parquet::read::ROW_COUNT_OVERFLOW_ERR; use crate::predicates::{apply_predicate, PhysicalIoExpr}; use crate::utils::get_reader_bytes; @@ -142,7 +142,7 @@ fn rg_to_dfs( row_group_start: usize, row_group_end: usize, slice: (usize, usize), - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, predicate: Option<&dyn PhysicalIoExpr>, row_index: Option, @@ -227,7 +227,7 @@ fn rg_to_dfs_prefiltered( previous_row_count: &mut IdxSize, row_group_start: usize, row_group_end: usize, - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, live_variables: Vec, predicate: &dyn PhysicalIoExpr, @@ -501,7 +501,7 @@ fn rg_to_dfs_optionally_par_over_columns( row_group_start: usize, row_group_end: usize, slice: (usize, usize), - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, predicate: Option<&dyn PhysicalIoExpr>, row_index: Option, @@ -605,7 +605,7 @@ fn rg_to_dfs_par_over_rg( row_group_end: usize, previous_row_count: &mut IdxSize, slice: (usize, usize), - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, schema: &ArrowSchemaRef, predicate: Option<&dyn PhysicalIoExpr>, row_index: Option, @@ -701,7 +701,7 @@ pub fn read_parquet( slice: (usize, usize), projection: Option<&[usize]>, reader_schema: &ArrowSchemaRef, - metadata: Option, + metadata: Option, predicate: Option<&dyn PhysicalIoExpr>, mut parallel: ParallelStrategy, row_index: Option, @@ -855,7 +855,7 @@ pub(super) fn compute_row_group_range( row_group_start: usize, row_group_end: usize, slice: (usize, usize), - row_groups: &[RowGroupMetaData], + row_groups: &[RowGroupMetadata], ) -> std::ops::Range { let mut start = row_group_start; let mut cum_rows: usize = (0..row_group_start).map(|i| row_groups[i].num_rows()).sum(); @@ -901,7 +901,7 @@ pub struct BatchedParquetReader { slice: (usize, usize), projection: Arc<[usize]>, schema: ArrowSchemaRef, - metadata: FileMetaDataRef, + metadata: FileMetadataRef, predicate: Option>, row_index: Option, rows_read: IdxSize, @@ -921,7 +921,7 @@ impl BatchedParquetReader { #[allow(clippy::too_many_arguments)] pub fn new( row_group_fetcher: RowGroupFetcher, - metadata: FileMetaDataRef, + metadata: FileMetadataRef, schema: ArrowSchemaRef, slice: (usize, usize), projection: Option>, diff --git 
a/crates/polars-io/src/parquet/read/reader.rs b/crates/polars-io/src/parquet/read/reader.rs index 25e8852a92ce..0f6f3b70b4f3 100644 --- a/crates/polars-io/src/parquet/read/reader.rs +++ b/crates/polars-io/src/parquet/read/reader.rs @@ -18,7 +18,7 @@ use super::utils::materialize_empty_df; #[cfg(feature = "cloud")] use crate::cloud::CloudOptions; use crate::mmap::MmapBytesReader; -use crate::parquet::metadata::FileMetaDataRef; +use crate::parquet::metadata::FileMetadataRef; use crate::predicates::PhysicalIoExpr; use crate::prelude::*; use crate::RowIndex; @@ -35,7 +35,7 @@ pub struct ParquetReader { schema: Option, row_index: Option, low_memory: bool, - metadata: Option, + metadata: Option, predicate: Option>, hive_partition_columns: Option>, include_file_path: Option<(PlSmallStr, Arc)>, @@ -138,7 +138,7 @@ impl ParquetReader { self } - pub fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> { + pub fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> { if self.metadata.is_none() { self.metadata = Some(Arc::new(read::read_metadata(&mut self.reader)?)); } @@ -267,7 +267,7 @@ impl ParquetAsyncReader { pub async fn from_uri( uri: &str, cloud_options: Option<&CloudOptions>, - metadata: Option, + metadata: Option, ) -> PolarsResult { Ok(ParquetAsyncReader { reader: ParquetObjectStore::from_uri(uri, cloud_options, metadata).await?, @@ -406,7 +406,7 @@ impl ParquetAsyncReader { ) } - pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetaDataRef> { + pub async fn get_metadata(&mut self) -> PolarsResult<&FileMetadataRef> { self.reader.get_metadata().await } diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index a37fc7c42f33..b9012344abb9 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -5,7 +5,7 @@ use polars_core::config::{get_file_prefetch_size, verbose}; use polars_core::utils::accumulate_dataframes_vertical; use polars_error::feature_gated; use polars_io::cloud::CloudOptions; -use polars_io::parquet::metadata::FileMetaDataRef; +use polars_io::parquet::metadata::FileMetadataRef; use polars_io::utils::slice::split_slice_at_file; use polars_io::RowIndex; @@ -21,7 +21,7 @@ pub struct ParquetExec { cloud_options: Option, file_options: FileScanOptions, #[allow(dead_code)] - metadata: Option, + metadata: Option, } impl ParquetExec { @@ -34,7 +34,7 @@ impl ParquetExec { options: ParquetOptions, cloud_options: Option, file_options: FileScanOptions, - metadata: Option, + metadata: Option, ) -> Self { ParquetExec { sources, diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs index 8af4fb3f67bb..1f00987fa074 100644 --- a/crates/polars-parquet/src/arrow/read/mod.rs +++ b/crates/polars-parquet/src/arrow/read/mod.rs @@ -15,7 +15,7 @@ pub use deserialize::{ #[cfg(feature = "async")] use futures::{AsyncRead, AsyncSeek}; use polars_error::PolarsResult; -pub use schema::{infer_schema, FileMetaData}; +pub use schema::{infer_schema, FileMetadata}; use crate::parquet::error::ParquetResult; #[cfg(feature = "async")] @@ -24,7 +24,7 @@ pub use crate::parquet::read::{get_page_stream, read_metadata_async as _read_met pub use crate::parquet::{ error::ParquetError, fallible_streaming_iterator, - metadata::{ColumnChunkMetadata, ColumnDescriptor, RowGroupMetaData}, + metadata::{ColumnChunkMetadata, ColumnDescriptor, RowGroupMetadata}, page::{CompressedDataPage, DataPageHeader, Page}, read::{ 
decompress, get_column_iterator, read_metadata as _read_metadata, BasicDecompressor, @@ -54,7 +54,7 @@ pub fn get_field_pages<'a, T>( } /// Reads parquets' metadata synchronously. -pub fn read_metadata(reader: &mut R) -> PolarsResult { +pub fn read_metadata(reader: &mut R) -> PolarsResult { Ok(_read_metadata(reader)?) } @@ -62,7 +62,7 @@ pub fn read_metadata(reader: &mut R) -> PolarsResult( reader: &mut R, -) -> PolarsResult { +) -> PolarsResult { Ok(_read_metadata_async(reader).await?) } diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs index 50d937e7e840..347cd49faefd 100644 --- a/crates/polars-parquet/src/arrow/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -10,7 +10,7 @@ pub use metadata::read_schema_from_metadata; use polars_error::PolarsResult; use self::metadata::parse_key_value_metadata; -pub use crate::parquet::metadata::{FileMetaData, KeyValue, SchemaDescriptor}; +pub use crate::parquet::metadata::{FileMetadata, KeyValue, SchemaDescriptor}; pub use crate::parquet::schema::types::ParquetType; /// Options when inferring schemas from Parquet @@ -33,7 +33,7 @@ impl Default for SchemaInferenceOptions { } } -/// Infers a [`ArrowSchema`] from parquet's [`FileMetaData`]. +/// Infers a [`ArrowSchema`] from parquet's [`FileMetadata`]. /// /// This first looks for the metadata key `"ARROW:schema"`; if it does not exist, it converts the /// Parquet types declared in the file's Parquet schema to Arrow's equivalent. @@ -41,13 +41,13 @@ impl Default for SchemaInferenceOptions { /// # Error /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded, /// indicating that that the file's arrow metadata was incorrectly written. -pub fn infer_schema(file_metadata: &FileMetaData) -> PolarsResult { +pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult { infer_schema_with_options(file_metadata, &None) } /// Like [`infer_schema`] but with configurable options which affects the behavior of inference pub fn infer_schema_with_options( - file_metadata: &FileMetaData, + file_metadata: &FileMetadata, options: &Option, ) -> PolarsResult { let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata()); diff --git a/crates/polars-parquet/src/arrow/write/file.rs b/crates/polars-parquet/src/arrow/write/file.rs index d4162b8c08d5..0fd32deb5b07 100644 --- a/crates/polars-parquet/src/arrow/write/file.rs +++ b/crates/polars-parquet/src/arrow/write/file.rs @@ -4,7 +4,7 @@ use arrow::datatypes::ArrowSchema; use polars_error::{PolarsError, PolarsResult}; use super::schema::schema_to_metadata_key; -use super::{to_parquet_schema, ThriftFileMetaData, WriteOptions}; +use super::{to_parquet_schema, ThriftFileMetadata, WriteOptions}; use crate::parquet::metadata::{KeyValue, SchemaDescriptor}; use crate::parquet::write::{RowGroupIterColumns, WriteOptions as FileWriteOptions}; @@ -86,10 +86,10 @@ impl FileWriter { self.writer.into_inner() } - /// Returns the underlying writer and [`ThriftFileMetaData`] + /// Returns the underlying writer and [`ThriftFileMetadata`] /// # Panics /// This function panics if [`Self::end`] has not yet been called - pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) { + pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetadata) { self.writer.into_inner_and_metadata() } } diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index b5f816518401..02f0165d04c7 100644 --- 
a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -38,7 +38,7 @@ pub use utils::write_def_levels; pub use crate::parquet::compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}; pub use crate::parquet::encoding::Encoding; pub use crate::parquet::metadata::{ - Descriptor, FileMetaData, KeyValue, SchemaDescriptor, ThriftFileMetaData, + Descriptor, FileMetadata, KeyValue, SchemaDescriptor, ThriftFileMetadata, }; pub use crate::parquet::page::{CompressedDataPage, CompressedPage, Page}; use crate::parquet::schema::types::PrimitiveType as ParquetPrimitiveType; diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs index a7ffd6f7ba6d..492d283f64ed 100644 --- a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs @@ -2,7 +2,7 @@ use parquet_format_safe::ColumnOrder as TColumnOrder; use super::column_order::ColumnOrder; use super::schema_descriptor::SchemaDescriptor; -use super::RowGroupMetaData; +use super::RowGroupMetadata; use crate::parquet::error::ParquetError; use crate::parquet::metadata::get_sort_order; pub use crate::parquet::thrift_format::KeyValue; @@ -11,7 +11,7 @@ pub use crate::parquet::thrift_format::KeyValue; // This is almost equal to [`parquet_format_safe::FileMetaData`] but contains the descriptors, // which are crucial to deserialize pages. #[derive(Debug)] -pub struct FileMetaData { +pub struct FileMetadata { /// version of this file. pub version: i32, /// number of rows in the file. @@ -26,7 +26,7 @@ pub struct FileMetaData { /// ``` pub created_by: Option, /// The row groups of this file - pub row_groups: Vec, + pub row_groups: Vec, /// key_value_metadata of this file. pub key_value_metadata: Option>, /// schema descriptor. @@ -41,7 +41,7 @@ pub struct FileMetaData { pub column_orders: Option>, } -impl FileMetaData { +impl FileMetadata { /// Returns the [`SchemaDescriptor`] that describes schema of this file. 
pub fn schema(&self) -> &SchemaDescriptor { &self.schema_descr @@ -61,7 +61,7 @@ impl FileMetaData { .unwrap_or(ColumnOrder::Undefined) } - /// Deserializes [`crate::parquet::thrift_format::FileMetaData`] into this struct + /// Deserializes [`crate::parquet::thrift_format::FileMetadata`] into this struct pub fn try_from_thrift( metadata: parquet_format_safe::FileMetaData, ) -> Result { @@ -70,14 +70,14 @@ impl FileMetaData { let row_groups = metadata .row_groups .into_iter() - .map(|rg| RowGroupMetaData::try_from_thrift(&schema_descr, rg)) + .map(|rg| RowGroupMetadata::try_from_thrift(&schema_descr, rg)) .collect::>()?; let column_orders = metadata .column_orders .map(|orders| parse_column_orders(&orders, &schema_descr)); - Ok(FileMetaData { + Ok(FileMetadata { version: metadata.version, num_rows: metadata.num_rows.try_into()?, created_by: metadata.created_by, diff --git a/crates/polars-parquet/src/parquet/metadata/mod.rs b/crates/polars-parquet/src/parquet/metadata/mod.rs index c153cd7cf592..b7a80739e719 100644 --- a/crates/polars-parquet/src/parquet/metadata/mod.rs +++ b/crates/polars-parquet/src/parquet/metadata/mod.rs @@ -9,9 +9,9 @@ mod sort; pub use column_chunk_metadata::ColumnChunkMetadata; pub use column_descriptor::{ColumnDescriptor, Descriptor}; pub use column_order::ColumnOrder; -pub use file_metadata::{FileMetaData, KeyValue}; -pub use row_metadata::RowGroupMetaData; +pub use file_metadata::{FileMetadata, KeyValue}; +pub use row_metadata::RowGroupMetadata; pub use schema_descriptor::SchemaDescriptor; pub use sort::*; -pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetaData; +pub use crate::parquet::thrift_format::FileMetaData as ThriftFileMetadata; diff --git a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs index 717bc7e243d8..013308ad7f12 100644 --- a/crates/polars-parquet/src/parquet/metadata/row_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/row_metadata.rs @@ -35,7 +35,7 @@ impl InitColumnLookup for ColumnLookup { /// Metadata for a row group. 
#[derive(Debug, Clone, Default)] -pub struct RowGroupMetaData { +pub struct RowGroupMetadata { columns: Arc<[ColumnChunkMetadata]>, column_lookup: PlHashMap>, num_rows: usize, @@ -43,7 +43,7 @@ pub struct RowGroupMetaData { full_byte_range: core::ops::Range, } -impl RowGroupMetaData { +impl RowGroupMetadata { #[inline(always)] pub fn n_columns(&self) -> usize { self.columns.len() @@ -91,7 +91,7 @@ impl RowGroupMetaData { pub(crate) fn try_from_thrift( schema_descr: &SchemaDescriptor, rg: RowGroup, - ) -> ParquetResult { + ) -> ParquetResult { if schema_descr.columns().len() != rg.columns.len() { return Err(ParquetError::oos(format!("The number of columns in the row group ({}) must be equal to the number of columns in the schema ({})", rg.columns.len(), schema_descr.columns().len()))); } @@ -127,7 +127,7 @@ impl RowGroupMetaData { }) .collect::>>()?; - Ok(RowGroupMetaData { + Ok(RowGroupMetadata { columns, column_lookup, num_rows, diff --git a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs index 54065389328e..56f914ba568e 100644 --- a/crates/polars-parquet/src/parquet/read/column/mod.rs +++ b/crates/polars-parquet/src/parquet/read/column/mod.rs @@ -4,7 +4,7 @@ use polars_utils::idx_vec::UnitVec; use super::{get_page_iterator, MemReader, PageReader}; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::metadata::{ColumnChunkMetadata, RowGroupMetaData}; +use crate::parquet::metadata::{ColumnChunkMetadata, RowGroupMetadata}; use crate::parquet::page::CompressedPage; use crate::parquet::schema::types::ParquetType; @@ -17,7 +17,7 @@ use crate::parquet::schema::types::ParquetType; /// `max_page_size` is the maximum number of bytes allowed. pub fn get_column_iterator<'a>( reader: MemReader, - row_group: &'a RowGroupMetaData, + row_group: &'a RowGroupMetadata, field_name: &str, max_page_size: usize, ) -> ColumnIterator<'a> { diff --git a/crates/polars-parquet/src/parquet/read/metadata.rs b/crates/polars-parquet/src/parquet/read/metadata.rs index f92794fc2839..e14a2a60e997 100644 --- a/crates/polars-parquet/src/parquet/read/metadata.rs +++ b/crates/polars-parquet/src/parquet/read/metadata.rs @@ -2,9 +2,9 @@ use std::cmp::min; use std::io::{Read, Seek, SeekFrom}; use parquet_format_safe::thrift::protocol::TCompactInputProtocol; -use parquet_format_safe::FileMetaData as TFileMetaData; +use parquet_format_safe::FileMetaData as TFileMetadata; -use super::super::metadata::FileMetaData; +use super::super::metadata::FileMetadata; use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, HEADER_SIZE, PARQUET_MAGIC}; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -26,18 +26,18 @@ fn stream_len(seek: &mut impl Seek) -> std::result::Result Ok(len) } -/// Reads a [`FileMetaData`] from the reader, located at the end of the file. -pub fn read_metadata(reader: &mut R) -> ParquetResult { +/// Reads a [`FileMetadata`] from the reader, located at the end of the file. +pub fn read_metadata(reader: &mut R) -> ParquetResult { // check file is large enough to hold footer let file_size = stream_len(reader)?; read_metadata_with_size(reader, file_size) } -/// Reads a [`FileMetaData`] from the reader, located at the end of the file, with known file size. +/// Reads a [`FileMetadata`] from the reader, located at the end of the file, with known file size. 
pub fn read_metadata_with_size( reader: &mut R, file_size: u64, -) -> ParquetResult { +) -> ParquetResult { if file_size < HEADER_SIZE + FOOTER_SIZE { return Err(ParquetError::oos( "A parquet file must contain a header and footer with at least 12 bytes", @@ -92,9 +92,9 @@ pub fn read_metadata_with_size( } /// Parse loaded metadata bytes -pub fn deserialize_metadata(reader: R, max_size: usize) -> ParquetResult { +pub fn deserialize_metadata(reader: R, max_size: usize) -> ParquetResult { let mut prot = TCompactInputProtocol::new(reader, max_size); - let metadata = TFileMetaData::read_from_in_protocol(&mut prot)?; + let metadata = TFileMetadata::read_from_in_protocol(&mut prot)?; - FileMetaData::try_from_thrift(metadata) + FileMetadata::try_from_thrift(metadata) } diff --git a/crates/polars-parquet/src/parquet/read/stream.rs b/crates/polars-parquet/src/parquet/read/stream.rs index ec8b26c3d31d..c3755106742b 100644 --- a/crates/polars-parquet/src/parquet/read/stream.rs +++ b/crates/polars-parquet/src/parquet/read/stream.rs @@ -2,7 +2,7 @@ use std::io::SeekFrom; use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; -use super::super::metadata::FileMetaData; +use super::super::metadata::FileMetadata; use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, PARQUET_MAGIC}; use super::metadata::{deserialize_metadata, metadata_len}; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -26,7 +26,7 @@ async fn stream_len( /// Asynchronously reads the files' metadata pub async fn read_metadata( reader: &mut R, -) -> ParquetResult { +) -> ParquetResult { let file_size = stream_len(reader).await?; if file_size < HEADER_SIZE + FOOTER_SIZE { diff --git a/crates/polars-parquet/src/parquet/write/file.rs b/crates/polars-parquet/src/parquet/write/file.rs index e9a95be68e73..8dd3212bb76a 100644 --- a/crates/polars-parquet/src/parquet/write/file.rs +++ b/crates/polars-parquet/src/parquet/write/file.rs @@ -9,7 +9,7 @@ use super::row_group::write_row_group; use super::{RowGroupIterColumns, WriteOptions}; use crate::parquet::error::{ParquetError, ParquetResult}; pub use crate::parquet::metadata::KeyValue; -use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetaData}; +use crate::parquet::metadata::{SchemaDescriptor, ThriftFileMetadata}; use crate::parquet::write::State; use crate::parquet::{FOOTER_SIZE, PARQUET_MAGIC}; @@ -20,7 +20,7 @@ pub(super) fn start_file(writer: &mut W) -> ParquetResult { pub(super) fn end_file( mut writer: &mut W, - metadata: &ThriftFileMetaData, + metadata: &ThriftFileMetadata, ) -> ParquetResult { // Write metadata let mut protocol = TCompactOutputProtocol::new(&mut writer); @@ -67,7 +67,7 @@ pub struct FileWriter { /// Used to store the current state for writing the file state: State, // when the file is written, metadata becomes available - metadata: Option, + metadata: Option, } /// Writes a parquet file containing only the header and footer @@ -75,11 +75,11 @@ pub struct FileWriter { /// This is used to write the metadata as a separate Parquet file, usually when data /// is partitioned across multiple files. /// -/// Note: Recall that when combining row groups from [`ThriftFileMetaData`], the `file_path` on each +/// Note: Recall that when combining row groups from [`ThriftFileMetadata`], the `file_path` on each /// of their column chunks must be updated with their path relative to where they are written to. 
pub fn write_metadata_sidecar( writer: &mut W, - metadata: &ThriftFileMetaData, + metadata: &ThriftFileMetadata, ) -> ParquetResult { let mut len = start_file(writer)?; len += end_file(writer, metadata)?; @@ -98,11 +98,11 @@ impl FileWriter { &self.schema } - /// Returns the [`ThriftFileMetaData`]. This is Some iff the [`Self::end`] has been called. + /// Returns the [`ThriftFileMetadata`]. This is Some iff the [`Self::end`] has been called. /// /// This is used to write the metadata as a separate Parquet file, usually when data /// is partitioned across multiple files - pub fn metadata(&self) -> Option<&ThriftFileMetaData> { + pub fn metadata(&self) -> Option<&ThriftFileMetadata> { self.metadata.as_ref() } } @@ -225,7 +225,7 @@ impl FileWriter { ParquetResult::Ok(()) })?; - let metadata = ThriftFileMetaData::new( + let metadata = ThriftFileMetadata::new( self.options.version.into(), self.schema.clone().into_thrift(), num_rows, @@ -248,10 +248,10 @@ impl FileWriter { self.writer } - /// Returns the underlying writer and [`ThriftFileMetaData`] + /// Returns the underlying writer and [`ThriftFileMetadata`] /// # Panics /// This function panics if [`Self::end`] has not yet been called - pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetaData) { + pub fn into_inner_and_metadata(self) -> (W, ThriftFileMetadata) { (self.writer, self.metadata.expect("File to have ended")) } } diff --git a/crates/polars-parquet/src/parquet/write/stream.rs b/crates/polars-parquet/src/parquet/write/stream.rs index eadc4640e856..eca712db65dc 100644 --- a/crates/polars-parquet/src/parquet/write/stream.rs +++ b/crates/polars-parquet/src/parquet/write/stream.rs @@ -2,7 +2,7 @@ use std::io::Write; use futures::{AsyncWrite, AsyncWriteExt}; use parquet_format_safe::thrift::protocol::TCompactOutputStreamProtocol; -use parquet_format_safe::{FileMetaData, RowGroup}; +use parquet_format_safe::RowGroup; use super::row_group::write_row_group_async; use super::{RowGroupIterColumns, WriteOptions}; @@ -20,7 +20,7 @@ async fn start_file(writer: &mut W) -> ParquetResult async fn end_file( mut writer: &mut W, - metadata: FileMetaData, + metadata: parquet_format_safe::FileMetaData, ) -> ParquetResult { // Write file metadata let mut protocol = TCompactOutputStreamProtocol::new(&mut writer); @@ -169,7 +169,7 @@ impl FileStreamer { } } - let metadata = FileMetaData::new( + let metadata = parquet_format_safe::FileMetaData::new( self.options.version.into(), self.schema.clone().into_thrift(), num_rows, diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index 8592021b2ff3..7a0dabeb10df 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -10,7 +10,7 @@ use polars_core::error::*; use polars_core::prelude::Series; use polars_core::POOL; use polars_io::cloud::CloudOptions; -use polars_io::parquet::metadata::FileMetaDataRef; +use polars_io::parquet::metadata::FileMetadataRef; use polars_io::parquet::read::{BatchedParquetReader, ParquetOptions, ParquetReader}; use polars_io::path_utils::is_cloud_url; use polars_io::pl_async::get_runtime; @@ -41,7 +41,7 @@ pub struct ParquetSource { file_options: FileScanOptions, #[allow(dead_code)] cloud_options: Option, - metadata: Option, + metadata: Option, file_info: FileInfo, hive_parts: Option>>, verbose: bool, @@ -252,7 +252,7 @@ impl ParquetSource { sources: ScanSources, options: ParquetOptions, cloud_options: Option, - metadata: Option, + metadata: Option, 
file_options: FileScanOptions, file_info: FileInfo, hive_parts: Option>>, diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 25dd61aa1eb9..9fd419f90f63 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -32,7 +32,7 @@ pub(super) fn parquet_file_info( sources: &ScanSources, file_options: &FileScanOptions, #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, -) -> PolarsResult<(FileInfo, Option)> { +) -> PolarsResult<(FileInfo, Option)> { use polars_core::error::feature_gated; let (reader_schema, num_rows, metadata) = { diff --git a/crates/polars-plan/src/plans/file_scan.rs b/crates/polars-plan/src/plans/file_scan.rs index 73ae85d93646..e868b98d2799 100644 --- a/crates/polars-plan/src/plans/file_scan.rs +++ b/crates/polars-plan/src/plans/file_scan.rs @@ -5,7 +5,7 @@ use polars_io::csv::read::CsvReadOptions; #[cfg(feature = "ipc")] use polars_io::ipc::IpcScanOptions; #[cfg(feature = "parquet")] -use polars_io::parquet::metadata::FileMetaDataRef; +use polars_io::parquet::metadata::FileMetadataRef; #[cfg(feature = "parquet")] use polars_io::parquet::read::ParquetOptions; @@ -24,7 +24,7 @@ pub enum FileScan { options: ParquetOptions, cloud_options: Option, #[cfg_attr(feature = "serde", serde(skip))] - metadata: Option, + metadata: Option, }, #[cfg(feature = "ipc")] Ipc { diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs index bf5d4262fed6..cff5b9582d3a 100644 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ b/crates/polars-stream/src/nodes/parquet_source.rs @@ -17,13 +17,13 @@ use polars_expr::prelude::PhysicalExpr; use polars_io::cloud::CloudOptions; use polars_io::predicates::PhysicalIoExpr; use polars_io::prelude::_internal::read_this_row_group; -use polars_io::prelude::{FileMetaData, ParquetOptions}; +use polars_io::prelude::{FileMetadata, ParquetOptions}; use polars_io::utils::byte_source::{ ByteSource, DynByteSource, DynByteSourceBuilder, MemSliceByteSource, }; use polars_io::utils::slice::SplitSlicePosition; use polars_io::{is_cloud_url, RowIndex}; -use polars_parquet::read::RowGroupMetaData; +use polars_parquet::read::RowGroupMetadata; use polars_plan::plans::hive::HivePartitions; use polars_plan::plans::FileInfo; use polars_plan::prelude::FileScanOptions; @@ -540,7 +540,7 @@ impl ParquetSourceNode { usize, usize, Arc, - FileMetaData, + FileMetadata, usize, )>, task_handles_ext::AbortOnDropHandle>, @@ -1007,7 +1007,7 @@ struct RowGroupData { row_offset: usize, slice: Option<(usize, usize)>, file_max_row_group_height: usize, - row_group_metadata: RowGroupMetaData, + row_group_metadata: RowGroupMetadata, shared_file_state: Arc>, } @@ -1016,7 +1016,7 @@ struct RowGroupDataFetcher { usize, usize, Arc, - FileMetaData, + FileMetadata, usize, )>, use_statistics: bool, @@ -1028,7 +1028,7 @@ struct RowGroupDataFetcher { memory_prefetch_func: fn(&[u8]) -> (), current_path_index: usize, current_byte_source: Arc, - current_row_groups: std::vec::IntoIter, + current_row_groups: std::vec::IntoIter, current_row_group_idx: usize, current_max_row_group_height: usize, current_row_offset: usize, @@ -1731,7 +1731,7 @@ async fn read_parquet_metadata_bytes( } fn get_row_group_byte_ranges( - row_group_metadata: &RowGroupMetaData, + row_group_metadata: &RowGroupMetadata, ) -> impl ExactSizeIterator> + '_ { row_group_metadata .byte_ranges_iter() @@ -1739,7 +1739,7 @@ fn 
get_row_group_byte_ranges( } fn get_row_group_byte_ranges_for_projection<'a>( - row_group_metadata: &'a RowGroupMetaData, + row_group_metadata: &'a RowGroupMetadata, columns: &'a [PlSmallStr], ) -> impl Iterator> + 'a { columns.iter().flat_map(|col_name| { @@ -1756,7 +1756,7 @@ fn get_row_group_byte_ranges_for_projection<'a>( /// dtype. There are no ordering requirements and extra columns are permitted. fn ensure_metadata_has_projected_fields( projected_fields: &[polars_core::prelude::ArrowField], - metadata: &FileMetaData, + metadata: &FileMetadata, ) -> PolarsResult<()> { let schema = polars_parquet::arrow::read::infer_schema(metadata)?; diff --git a/crates/polars/tests/it/io/parquet/read/file.rs b/crates/polars/tests/it/io/parquet/read/file.rs index 5007dcdf0755..d2be2c5402d9 100644 --- a/crates/polars/tests/it/io/parquet/read/file.rs +++ b/crates/polars/tests/it/io/parquet/read/file.rs @@ -4,7 +4,7 @@ use arrow::array::Array; use arrow::datatypes::ArrowSchema; use arrow::record_batch::RecordBatchT; use polars_error::PolarsResult; -use polars_parquet::read::{Filter, RowGroupMetaData}; +use polars_parquet::read::{Filter, RowGroupMetadata}; use super::row_group::{read_columns_many, RowGroupDeserializer}; @@ -25,7 +25,7 @@ impl FileReader { /// Returns a new [`FileReader`]. pub fn new( reader: R, - row_groups: Vec, + row_groups: Vec, schema: ArrowSchema, limit: Option, ) -> Self { @@ -104,7 +104,7 @@ impl Iterator for FileReader { pub struct RowGroupReader { reader: R, schema: ArrowSchema, - row_groups: std::vec::IntoIter, + row_groups: std::vec::IntoIter, remaining_rows: usize, } @@ -113,7 +113,7 @@ impl RowGroupReader { pub fn new( reader: R, schema: ArrowSchema, - row_groups: Vec, + row_groups: Vec, limit: Option, ) -> Self { Self { diff --git a/crates/polars/tests/it/io/parquet/read/row_group.rs b/crates/polars/tests/it/io/parquet/read/row_group.rs index f23ee779b120..6d567a120c92 100644 --- a/crates/polars/tests/it/io/parquet/read/row_group.rs +++ b/crates/polars/tests/it/io/parquet/read/row_group.rs @@ -8,7 +8,7 @@ use polars_error::PolarsResult; use polars_parquet::arrow::read::{column_iter_to_arrays, Filter}; use polars_parquet::parquet::metadata::ColumnChunkMetadata; use polars_parquet::parquet::read::{BasicDecompressor, PageReader}; -use polars_parquet::read::RowGroupMetaData; +use polars_parquet::read::RowGroupMetadata; use polars_utils::mmap::MemReader; /// An [`Iterator`] of [`RecordBatchT`] that (dynamically) adapts a vector of iterators of [`Array`] into @@ -70,7 +70,7 @@ impl Iterator for RowGroupDeserializer { /// the field (one for non-nested types) pub fn read_columns<'a, R: Read + Seek>( reader: &mut R, - row_group_metadata: &'a RowGroupMetaData, + row_group_metadata: &'a RowGroupMetadata, field_name: &'a str, ) -> PolarsResult)>> { row_group_metadata @@ -135,7 +135,7 @@ pub fn to_deserializer( /// and convert them to [`ArrayIter`] via [`to_deserializer`]. 
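// A toy sketch (plain types, not the polars-parquet API) of the
// `remaining_rows` bookkeeping that readers such as `RowGroupReader` above
// use for `limit`: each row group consumes part of the remaining budget,
// and iteration stops as soon as the budget is spent.

struct LimitedRowGroups {
    row_group_heights: std::vec::IntoIter<usize>,
    remaining_rows: usize,
}

impl Iterator for LimitedRowGroups {
    // Number of rows to materialize from the next row group.
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        if self.remaining_rows == 0 {
            return None;
        }
        let height = self.row_group_heights.next()?;
        let take = height.min(self.remaining_rows);
        self.remaining_rows -= take;
        Some(take)
    }
}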
pub fn read_columns_many( reader: &mut R, - row_group: &RowGroupMetaData, + row_group: &RowGroupMetadata, fields: &ArrowSchema, filter: Option, ) -> PolarsResult>> { From 38b376cb118926c1b9ee41513ab9739e032c10b9 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Tue, 10 Sep 2024 04:21:51 -0300 Subject: [PATCH 06/28] fix: Enable "polars-json/timezones" feature from "polars-io" (#18635) --- crates/polars-io/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index 64259f78ad09..ca3d313e08ae 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -95,6 +95,7 @@ timezones = [ "dtype-datetime", "arrow/timezones", "polars-json?/chrono-tz", + "polars-json?/timezones", ] dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"] dtype-struct = ["polars-core/dtype-struct"] From 5ccb238d62e2e3e471718462714a1c61d97c72c3 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 10 Sep 2024 17:23:08 +1000 Subject: [PATCH 07/28] fix: Scanning hive partitioned files where hive columns are partially included in the file (#18626) Co-authored-by: jinliu --- crates/polars-io/src/hive.rs | 71 ++++++++++++++----- .../src/plans/conversion/dsl_to_ir.rs | 41 ++++++----- crates/polars-plan/src/plans/hive.rs | 5 +- py-polars/tests/unit/io/test_hive.py | 36 ++++++++++ 4 files changed, 112 insertions(+), 41 deletions(-) diff --git a/crates/polars-io/src/hive.rs b/crates/polars-io/src/hive.rs index b027e6d1d054..17ace26d6be7 100644 --- a/crates/polars-io/src/hive.rs +++ b/crates/polars-io/src/hive.rs @@ -5,6 +5,8 @@ use polars_core::series::Series; /// We have a special num_rows arg, as df can be empty when a projection contains /// only hive partition columns. /// +/// The `hive_partition_columns` must be ordered by their position in the `reader_schema` +/// /// # Safety /// /// num_rows equals the height of the df when the df height is non-zero. @@ -15,27 +17,58 @@ pub(crate) fn materialize_hive_partitions( num_rows: usize, ) { if let Some(hive_columns) = hive_partition_columns { - let Some(first) = hive_columns.first() else { + // Insert these hive columns in the order they are stored in the file. + if hive_columns.is_empty() { return; - }; - - if reader_schema.index_of(first.name()).is_some() { - // Insert these hive columns in the order they are stored in the file. - for s in hive_columns { - let i = match df.get_columns().binary_search_by_key( - &reader_schema.index_of(s.name()).unwrap_or(usize::MAX), - |s| reader_schema.index_of(s.name()).unwrap_or(usize::MIN), - ) { - Ok(i) => i, - Err(i) => i, - }; - - df.insert_column(i, s.new_from_index(0, num_rows)).unwrap(); - } - } else { - for s in hive_columns { - unsafe { df.with_column_unchecked(s.new_from_index(0, num_rows)) }; + } + + let hive_columns_iter = hive_columns.iter().map(|s| s.new_from_index(0, num_rows)); + + if reader_schema.index_of(hive_columns[0].name()).is_none() || df.width() == 0 { + // Fast-path - all hive columns are at the end + unsafe { df.get_columns_mut() }.extend(hive_columns_iter); + return; + } + + let out_width: usize = df.width() + hive_columns.len(); + let df_columns = df.get_columns(); + let mut out_columns = Vec::with_capacity(out_width); + + // We have a slightly involved algorithm here because `reader_schema` may contain extra + // columns that were excluded from a projection pushdown. 
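// The `+` lines below implement this with a two-pointer merge keyed on each
// column's position in `reader_schema`. The same idea in a standalone, toy
// form (strings instead of Series, a name -> index map instead of the Arrow
// schema; note the real hunk additionally stops merging as soon as a name is
// missing from the reader schema and appends the leftovers):

use std::collections::HashMap;

fn merge_by_schema_order(
    df_columns: &[String],
    hive_columns: &[String],
    schema_index: &HashMap<String, usize>,
) -> Vec<String> {
    // Columns absent from the file schema sort last, mirroring how trailing
    // hive columns are appended after the merge.
    let key = |name: &String| schema_index.get(name).copied().unwrap_or(usize::MAX);
    let mut out = Vec::with_capacity(df_columns.len() + hive_columns.len());
    let (mut i, mut j) = (0usize, 0usize);
    while i < df_columns.len() && j < hive_columns.len() {
        if key(&df_columns[i]) < key(&hive_columns[j]) {
            out.push(df_columns[i].clone());
            i += 1;
        } else {
            out.push(hive_columns[j].clone());
            j += 1;
        }
    }
    out.extend_from_slice(&df_columns[i..]);
    out.extend_from_slice(&hive_columns[j..]);
    out
}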
+ + let hive_columns = hive_columns_iter.collect::>(); + // Safety: These are both non-empty at the start + let mut series_arr = [df_columns, hive_columns.as_slice()]; + let mut schema_idx_arr = [ + reader_schema.index_of(series_arr[0][0].name()).unwrap(), + reader_schema.index_of(series_arr[1][0].name()).unwrap(), + ]; + + loop { + let arg_min = if schema_idx_arr[0] < schema_idx_arr[1] { + 0 + } else { + 1 + }; + + out_columns.push(series_arr[arg_min][0].clone()); + series_arr[arg_min] = &series_arr[arg_min][1..]; + + if series_arr[arg_min].is_empty() { + break; } + + let Some(i) = reader_schema.index_of(series_arr[arg_min][0].name()) else { + break; + }; + + schema_idx_arr[arg_min] = i; } + + out_columns.extend_from_slice(series_arr[0]); + out_columns.extend_from_slice(series_arr[1]); + + *unsafe { df.get_columns_mut() } = out_columns; } } diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 658dc9989eb5..a908378e6f5c 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -1068,27 +1068,26 @@ pub(crate) fn maybe_init_projection_excluding_hive( // Update `with_columns` with a projection so that hive columns aren't loaded from the // file let hive_parts = hive_parts?; - let hive_schema = hive_parts.schema(); - let (first_hive_name, _) = hive_schema.get_at_index(0)?; - - // TODO: Optimize this - let names = match reader_schema { - Either::Left(ref v) => v - .contains(first_hive_name.as_str()) - .then(|| v.iter_names_cloned().collect::>()), - Either::Right(ref v) => v - .contains(first_hive_name.as_str()) - .then(|| v.iter_names_cloned().collect()), - }; - - let names = names?; - - Some( - names - .into_iter() - .filter(|x| !hive_schema.contains(x)) - .collect::>(), - ) + match &reader_schema { + Either::Left(reader_schema) => hive_schema + .iter_names() + .any(|x| reader_schema.contains(x)) + .then(|| { + reader_schema + .iter_names_cloned() + .filter(|x| !hive_schema.contains(x)) + .collect::>() + }), + Either::Right(reader_schema) => hive_schema + .iter_names() + .any(|x| reader_schema.contains(x)) + .then(|| { + reader_schema + .iter_names_cloned() + .filter(|x| !hive_schema.contains(x)) + .collect::>() + }), + } } diff --git a/crates/polars-plan/src/plans/hive.rs b/crates/polars-plan/src/plans/hive.rs index 3fc7531ea2b3..a711aeb11848 100644 --- a/crates/polars-plan/src/plans/hive.rs +++ b/crates/polars-plan/src/plans/hive.rs @@ -57,6 +57,8 @@ impl HivePartitions { } } +/// Note: Returned hive partitions are ordered by their position in the `reader_schema` +/// /// # Safety /// `hive_start_idx <= [min path length]` pub fn hive_partitions_from_paths( @@ -198,10 +200,11 @@ pub fn hive_partitions_from_paths( } let mut hive_partitions = Vec::with_capacity(paths.len()); - let buffers = buffers + let mut buffers = buffers .into_iter() .map(|x| x.into_series()) .collect::>>()?; + buffers.sort_by_key(|s| reader_schema.index_of(s.name()).unwrap_or(usize::MAX)); #[allow(clippy::needless_range_loop)] for i in 0..paths.len() { diff --git a/py-polars/tests/unit/io/test_hive.py b/py-polars/tests/unit/io/test_hive.py index ad285b82f3b3..a01a2ef6e59d 100644 --- a/py-polars/tests/unit/io/test_hive.py +++ b/py-polars/tests/unit/io/test_hive.py @@ -554,6 +554,42 @@ def assert_with_projections(lf: pl.LazyFrame, df: pl.DataFrame) -> None: ) assert_with_projections(lf, rhs) + # partial cols in file + partial_path = tmp_path / "a=1/b=2/partial_data.bin" + df = 
pl.DataFrame( + {"x": 1, "b": 2, "y": 1}, + schema={"x": pl.Int32, "b": pl.Int16, "y": pl.Int32}, + ) + write_func(df, partial_path) + + rhs = rhs.select( + pl.col("x").cast(pl.Int32), + pl.col("b").cast(pl.Int16), + pl.col("y").cast(pl.Int32), + pl.col("a").cast(pl.Int64), + ) + + lf = scan_func(partial_path, hive_partitioning=True) # type: ignore[call-arg] + assert_frame_equal(lf.collect(projection_pushdown=projection_pushdown), rhs) + assert_with_projections(lf, rhs) + + lf = scan_func( # type: ignore[call-arg] + partial_path, + hive_schema={"a": pl.String, "b": pl.String}, + hive_partitioning=True, + ) + rhs = rhs.select( + pl.col("x").cast(pl.Int32), + pl.col("b").cast(pl.String), + pl.col("y").cast(pl.Int32), + pl.col("a").cast(pl.String), + ) + assert_frame_equal( + lf.collect(projection_pushdown=projection_pushdown), + rhs, + ) + assert_with_projections(lf, rhs) + @pytest.mark.write_disk def test_hive_partition_dates(tmp_path: Path) -> None: From 832aa534c5ffe9731028d5206eb5a9b83ac879ab Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 10 Sep 2024 18:17:07 +1000 Subject: [PATCH 08/28] refactor(rust): Scan from BytesIO in new-streaming parquet source (#18643) --- crates/polars-io/src/utils/byte_source.rs | 6 ++ .../polars-plan/src/plans/ir/scan_sources.rs | 20 ++++++ .../polars-stream/src/nodes/parquet_source.rs | 65 ++++++++++--------- crates/polars-stream/src/physical_plan/fmt.rs | 8 +-- .../src/physical_plan/lower_ir.rs | 8 +-- crates/polars-stream/src/physical_plan/mod.rs | 5 +- .../src/physical_plan/to_graph.rs | 4 +- 7 files changed, 71 insertions(+), 45 deletions(-) diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs index 72cbabb3dd5c..e2dd3e876c2a 100644 --- a/crates/polars-io/src/utils/byte_source.rs +++ b/crates/polars-io/src/utils/byte_source.rs @@ -150,6 +150,12 @@ impl From for DynByteSource { } } +impl From for DynByteSource { + fn from(value: MemSlice) -> Self { + Self::MemSlice(MemSliceByteSource(value)) + } +} + #[derive(Clone, Debug)] pub enum DynByteSourceBuilder { Mmap, diff --git a/crates/polars-plan/src/plans/ir/scan_sources.rs b/crates/polars-plan/src/plans/ir/scan_sources.rs index 1bdb92fda904..08d8cad0bf49 100644 --- a/crates/polars-plan/src/plans/ir/scan_sources.rs +++ b/crates/polars-plan/src/plans/ir/scan_sources.rs @@ -3,6 +3,9 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use polars_core::error::{feature_gated, PolarsResult}; +use polars_io::cloud::CloudOptions; +#[cfg(feature = "cloud")] +use polars_io::utils::byte_source::{DynByteSource, DynByteSourceBuilder}; use polars_utils::mmap::MemSlice; use polars_utils::pl_str::PlSmallStr; @@ -237,6 +240,23 @@ impl<'a> ScanSourceRef<'a> { Self::Buffer(buff) => Ok(MemSlice::from_bytes((*buff).clone())), } } + + #[cfg(feature = "cloud")] + pub async fn to_dyn_byte_source( + &self, + builder: &DynByteSourceBuilder, + cloud_options: Option<&CloudOptions>, + ) -> PolarsResult { + match self { + Self::Path(path) => { + builder + .try_build_from_path(path.to_str().unwrap(), cloud_options) + .await + }, + Self::File(file) => Ok(DynByteSource::from(MemSlice::from_file(file)?)), + Self::Buffer(buff) => Ok(DynByteSource::from(MemSlice::from_bytes((*buff).clone()))), + } + } } impl<'a> Iterator for ScanSourceIter<'a> { diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs index cff5b9582d3a..199e8d2665bd 100644 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ 
b/crates/polars-stream/src/nodes/parquet_source.rs @@ -1,5 +1,4 @@ use std::future::Future; -use std::path::PathBuf; use std::sync::atomic::AtomicBool; use std::sync::Arc; @@ -22,10 +21,10 @@ use polars_io::utils::byte_source::{ ByteSource, DynByteSource, DynByteSourceBuilder, MemSliceByteSource, }; use polars_io::utils::slice::SplitSlicePosition; -use polars_io::{is_cloud_url, RowIndex}; +use polars_io::RowIndex; use polars_parquet::read::RowGroupMetadata; use polars_plan::plans::hive::HivePartitions; -use polars_plan::plans::FileInfo; +use polars_plan::plans::{FileInfo, ScanSources}; use polars_plan::prelude::FileScanOptions; use polars_utils::mmap::MemSlice; use polars_utils::pl_str::PlSmallStr; @@ -46,7 +45,7 @@ type AsyncTaskData = Option<( #[allow(clippy::type_complexity)] pub struct ParquetSourceNode { - paths: Arc<[PathBuf]>, + scan_sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -71,7 +70,7 @@ pub struct ParquetSourceNode { #[allow(clippy::too_many_arguments)] impl ParquetSourceNode { pub fn new( - paths: Arc<[PathBuf]>, + scan_sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option>, @@ -81,16 +80,15 @@ impl ParquetSourceNode { ) -> Self { let verbose = config::verbose(); - let byte_source_builder = - if is_cloud_url(paths[0].to_str().unwrap()) || config::force_async() { - DynByteSourceBuilder::ObjectStore - } else { - DynByteSourceBuilder::Mmap - }; + let byte_source_builder = if scan_sources.is_cloud_url() || config::force_async() { + DynByteSourceBuilder::ObjectStore + } else { + DynByteSourceBuilder::Mmap + }; let memory_prefetch_func = get_memory_prefetch_func(verbose); Self { - paths, + scan_sources, file_info, hive_parts, predicate, @@ -570,23 +568,25 @@ impl ParquetSourceNode { } let fetch_metadata_bytes_for_path_index = { - let paths = &self.paths; + let scan_sources = &self.scan_sources; let cloud_options = Arc::new(self.cloud_options.clone()); - let paths = paths.clone(); + let scan_sources = scan_sources.clone(); let cloud_options = cloud_options.clone(); let byte_source_builder = byte_source_builder.clone(); move |path_idx: usize| { - let paths = paths.clone(); + let scan_sources = scan_sources.clone(); let cloud_options = cloud_options.clone(); let byte_source_builder = byte_source_builder.clone(); let handle = io_runtime.spawn(async move { let mut byte_source = Arc::new( - byte_source_builder - .try_build_from_path( - paths[path_idx].to_str().unwrap(), + scan_sources + .get(path_idx) + .unwrap() + .to_dyn_byte_source( + &byte_source_builder, cloud_options.as_ref().as_ref(), ) .await?, @@ -681,13 +681,13 @@ impl ParquetSourceNode { .slice .map(|(offset, len)| offset as usize..offset as usize + len); - let mut metadata_stream = futures::stream::iter(0..self.paths.len()) + let mut metadata_stream = futures::stream::iter(0..self.scan_sources.len()) .map(fetch_metadata_bytes_for_path_index) .buffered(metadata_prefetch_size) .map(process_metadata_bytes) .buffered(metadata_decode_ahead_size); - let paths = self.paths.clone(); + let scan_sources = self.scan_sources.clone(); // We need to be able to both stop early as well as skip values, which is easier to do // using a custom task instead of futures::stream @@ -715,9 +715,11 @@ impl ParquetSourceNode { .map_err(|err| { err.wrap_msg(|msg| { format!( - "error at path (index: {}, path: {}): {}", + "error at path (index: {}, path: {:?}): {}", current_path_index, - paths[current_path_index].to_str().unwrap(), + scan_sources + .get(current_path_index) + .map(|x| 
PlSmallStr::from_str(x.to_include_path_name())), msg ) }) @@ -771,7 +773,7 @@ impl ParquetSourceNode { Stopped reading at file at index {} \ (remaining {} files will not be read)", current_path_index, - paths.len() - current_path_index - 1, + scan_sources.len() - current_path_index - 1, ); } break; @@ -786,7 +788,7 @@ impl ParquetSourceNode { let slice = self.file_options.slice.unwrap(); let slice_start_as_n_from_end = -slice.0 as usize; - let mut metadata_stream = futures::stream::iter((0..self.paths.len()).rev()) + let mut metadata_stream = futures::stream::iter((0..self.scan_sources.len()).rev()) .map(fetch_metadata_bytes_for_path_index) .buffered(metadata_prefetch_size) .map(process_metadata_bytes) @@ -831,7 +833,7 @@ impl ParquetSourceNode { PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) }; - let path_count = self.paths.len(); + let path_count = self.scan_sources.len(); io_runtime.spawn(async move { if start_rx.await.is_err() { @@ -935,7 +937,7 @@ impl ParquetSourceNode { ); assert_eq!(self.predicate.is_some(), self.physical_predicate.is_some()); - let paths = self.paths.clone(); + let scan_sources = self.scan_sources.clone(); let hive_partitions = self.hive_parts.clone(); let hive_partitions_width = hive_partitions .as_deref() @@ -948,7 +950,7 @@ impl ParquetSourceNode { let ideal_morsel_size = get_ideal_morsel_size(); RowGroupDecoder { - paths, + scan_sources, hive_partitions, hive_partitions_width, include_file_paths, @@ -983,7 +985,7 @@ impl ParquetSourceNode { eprintln!( "[ParquetSource]: {} columns to be projected from {} files", self.projected_arrow_fields.len(), - self.paths.len(), + self.scan_sources.len(), ); } } @@ -1355,7 +1357,7 @@ struct SharedFileState { /// Turns row group data into DataFrames. struct RowGroupDecoder { - paths: Arc<[PathBuf]>, + scan_sources: ScanSources, hive_partitions: Option>>, hive_partitions_width: usize, include_file_paths: Option, @@ -1520,7 +1522,10 @@ impl RowGroupDecoder { let file_path_series = self.include_file_paths.clone().map(|file_path_col| { StringChunked::full( file_path_col, - self.paths[path_index].to_str().unwrap(), + self.scan_sources + .get(path_index) + .unwrap() + .to_include_path_name(), row_group_data.file_max_row_group_height, ) .into_series() diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index 8a3e7a1b8ac4..7d15337389a8 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -1,7 +1,7 @@ use std::fmt::Write; use polars_plan::plans::expr_ir::ExprIR; -use polars_plan::plans::{AExpr, EscapeLabel, FileScan, PathsDisplay}; +use polars_plan::plans::{AExpr, EscapeLabel, FileScan, ScanSourcesDisplay}; use polars_utils::arena::Arena; use polars_utils::itertools::Itertools; use slotmap::{Key, SecondaryMap, SlotMap}; @@ -107,7 +107,7 @@ fn visualize_plan_rec( }, PhysNodeKind::Multiplexer { input } => ("multiplexer".to_string(), from_ref(input)), PhysNodeKind::FileScan { - paths, + scan_sources, file_info, hive_parts, output_schema: _, @@ -127,9 +127,9 @@ fn visualize_plan_rec( let mut f = EscapeLabel(&mut out); { - let paths_display = PathsDisplay(paths.as_ref()); + let disp = ScanSourcesDisplay(scan_sources); - write!(f, "\npaths: {}", paths_display).unwrap(); + write!(f, "\npaths: {}", disp).unwrap(); } { diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 5a1e44694a99..beec7a57e358 100644 --- 
a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -331,7 +331,7 @@ pub fn lower_ir( v @ IR::Scan { .. } => { let IR::Scan { - sources, + sources: scan_sources, file_info, hive_parts, output_schema, @@ -343,12 +343,8 @@ pub fn lower_ir( unreachable!(); }; - let paths = sources - .into_paths() - .unwrap_or_else(|| todo!("streaming scanning of in-memory buffers")); - PhysNodeKind::FileScan { - paths, + scan_sources, file_info, hive_parts, output_schema, diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index d22a5f968900..e4ba35ce767e 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -1,4 +1,3 @@ -use std::path::PathBuf; use std::sync::Arc; use polars_core::frame::DataFrame; @@ -6,7 +5,7 @@ use polars_core::prelude::{InitHashMaps, PlHashMap, SortMultipleOptions}; use polars_core::schema::{Schema, SchemaRef}; use polars_error::PolarsResult; use polars_plan::plans::hive::HivePartitions; -use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, IR}; +use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, ScanSources, IR}; use polars_plan::prelude::expr_ir::ExprIR; mod fmt; @@ -119,7 +118,7 @@ pub enum PhysNodeKind { }, FileScan { - paths: Arc<[PathBuf]>, + scan_sources: ScanSources, file_info: FileInfo, hive_parts: Option>>, predicate: Option, diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index d0bd342b0f65..9166acefa2e3 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -256,7 +256,7 @@ fn to_graph_rec<'a>( v @ FileScan { .. 
} => { let FileScan { - paths, + scan_sources, file_info, hive_parts, output_schema, @@ -298,7 +298,7 @@ fn to_graph_rec<'a>( if std::env::var("POLARS_DISABLE_PARQUET_SOURCE").as_deref() != Ok("1") { ctx.graph.add_node( nodes::parquet_source::ParquetSourceNode::new( - paths, + scan_sources, file_info, hive_parts, predicate, From 1ee6a8211ffa63cef68bac08ec4cc0a6c47e8ac7 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 10 Sep 2024 10:49:10 +0200 Subject: [PATCH 09/28] chore(rust): Feature gate iejoin (#18646) --- crates/polars-lazy/Cargo.toml | 1 + crates/polars-ops/Cargo.toml | 1 + crates/polars-ops/src/frame/join/args.rs | 3 +++ crates/polars-ops/src/frame/join/mod.rs | 5 +++++ crates/polars-plan/Cargo.toml | 1 + .../polars-plan/src/plans/conversion/join.rs | 22 ++++++++++++------- crates/polars-python/Cargo.toml | 1 + .../src/lazyframe/visitor/nodes.rs | 1 + crates/polars/Cargo.toml | 1 + py-polars/Cargo.toml | 2 +- 10 files changed, 29 insertions(+), 9 deletions(-) diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 03fcc0d8b2c8..a3efd659a77f 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -169,6 +169,7 @@ is_between = ["polars-plan/is_between", "polars-expr/is_between"] is_unique = ["polars-plan/is_unique"] cross_join = ["polars-plan/cross_join", "polars-pipe?/cross_join", "polars-ops/cross_join"] asof_join = ["polars-plan/asof_join", "polars-time", "polars-ops/asof_join", "polars-mem-engine/asof_join"] +iejoin = ["polars-plan/iejoin"] business = ["polars-plan/business"] concat_str = ["polars-plan/concat_str"] range = ["polars-plan/range"] diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 0782f188b1df..2f37857c9cd2 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -117,6 +117,7 @@ pivot = ["polars-core/reinterpret", "polars-core/dtype-struct"] cross_join = [] chunked_ids = [] asof_join = [] +iejoin = [] semi_anti_join = [] array_any_all = ["dtype-array"] array_count = ["dtype-array"] diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index 2f5d6504eba7..10eee5d765df 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -58,6 +58,7 @@ impl JoinCoalesce { }, #[cfg(feature = "asof_join")] AsOf(_) => matches!(self, JoinSpecific | CoalesceColumns), + #[cfg(feature = "iejoin")] IEJoin(_) => false, Cross => false, #[cfg(feature = "semi_anti_join")] @@ -121,6 +122,7 @@ pub enum JoinType { Semi, #[cfg(feature = "semi_anti_join")] Anti, + #[cfg(feature = "iejoin")] IEJoin(IEJoinOptions), } @@ -140,6 +142,7 @@ impl Display for JoinType { Full { .. 
} => "FULL", #[cfg(feature = "asof_join")] AsOf(_) => "ASOF", + #[cfg(feature = "iejoin")] IEJoin(_) => "IEJOIN", Cross => "CROSS", #[cfg(feature = "semi_anti_join")] diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 433bffd232dd..89507ac216c5 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -7,6 +7,7 @@ mod cross_join; mod dispatch_left_right; mod general; mod hash_join; +#[cfg(feature = "iejoin")] mod iejoin; #[cfg(feature = "merge_sorted")] mod merge_sorted; @@ -29,6 +30,7 @@ use general::create_chunked_index_mapping; pub use general::{_coalesce_full_join, _finish_join, _join_suffix_name}; pub use hash_join::*; use hashbrown::hash_map::{Entry, RawEntryMut}; +#[cfg(feature = "iejoin")] pub use iejoin::{IEJoinOptions, InequalityOperator}; #[cfg(feature = "merge_sorted")] pub use merge_sorted::_merge_sorted_dfs; @@ -199,6 +201,7 @@ pub trait DataFrameJoinOps: IntoDf { } } + #[cfg(feature = "iejoin")] if let JoinType::IEJoin(options) = args.how { let func = if POOL.current_num_threads() > 1 && !left_df.is_empty() && !other.is_empty() { @@ -289,6 +292,7 @@ pub trait DataFrameJoinOps: IntoDf { panic!("expected by arguments on both sides") }, }, + #[cfg(feature = "iejoin")] JoinType::IEJoin(_) => { unreachable!() }, @@ -316,6 +320,7 @@ pub trait DataFrameJoinOps: IntoDf { JoinType::AsOf(_) => polars_bail!( ComputeError: "asof join not supported for join on multiple keys" ), + #[cfg(feature = "iejoin")] JoinType::IEJoin(_) => { unreachable!() }, diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index dd33428c8398..7edc15ea8616 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -116,6 +116,7 @@ is_unique = ["polars-ops/is_unique"] is_between = ["polars-ops/is_between"] cross_join = ["polars-ops/cross_join"] asof_join = ["polars-time", "polars-ops/asof_join"] +iejoin = ["polars-ops/iejoin"] concat_str = [] business = ["polars-ops/business"] range = [] diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs index 36701f9ab5a7..53a7bbd2274a 100644 --- a/crates/polars-plan/src/plans/conversion/join.rs +++ b/crates/polars-plan/src/plans/conversion/join.rs @@ -1,8 +1,10 @@ use arrow::legacy::error::PolarsResult; use either::Either; +use polars_core::error::feature_gated; use super::*; use crate::dsl::Expr; +#[cfg(feature = "iejoin")] use crate::plans::AExpr; fn check_join_keys(keys: &[Expr]) -> PolarsResult<()> { @@ -26,14 +28,16 @@ pub fn resolve_join( ctxt: &mut DslConversionContext, ) -> PolarsResult { if !predicates.is_empty() { - debug_assert!(left_on.is_empty() && right_on.is_empty()); - return resolve_join_where( - input_left.unwrap_left(), - input_right.unwrap_left(), - predicates, - options, - ctxt, - ); + feature_gated!("iejoin", { + debug_assert!(left_on.is_empty() && right_on.is_empty()); + return resolve_join_where( + input_left.unwrap_left(), + input_right.unwrap_left(), + predicates, + options, + ctxt, + ); + }) } let owned = Arc::unwrap_or_clone; @@ -119,6 +123,7 @@ pub fn resolve_join( run_conversion(lp, ctxt, "join") } +#[cfg(feature = "iejoin")] impl From for Operator { fn from(value: InequalityOperator) -> Self { match value { @@ -130,6 +135,7 @@ impl From for Operator { } } +#[cfg(feature = "iejoin")] fn resolve_join_where( input_left: Arc, input_right: Arc, diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index b93d34a678e5..9ed35648c89f 
100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -122,6 +122,7 @@ json = ["polars/serde", "serde_json", "polars/json", "polars-utils/serde"] trigonometry = ["polars/trigonometry"] sign = ["polars/sign"] asof_join = ["polars/asof_join"] +iejoin = ["polars/iejoin"] cross_join = ["polars/cross_join"] pct_change = ["polars/pct_change"] repeat_by = ["polars/repeat_by"] diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index 4e9344a61d15..d8dbb71281bc 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -481,6 +481,7 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult { JoinType::Cross => "cross", JoinType::Semi => "leftsemi", JoinType::Anti => "leftanti", + #[cfg(feature = "iejoin")] JoinType::IEJoin(_) => return Err(PyNotImplementedError::new_err("IEJoin")), }, options.args.join_nulls, diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index a27907484369..b858dbc36678 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -130,6 +130,7 @@ approx_unique = ["polars-lazy?/approx_unique", "polars-ops/approx_unique"] arg_where = ["polars-lazy?/arg_where"] array_any_all = ["polars-lazy?/array_any_all", "dtype-array"] asof_join = ["polars-lazy?/asof_join", "polars-ops/asof_join"] +iejoin = ["polars-lazy?/iejoin"] binary_encoding = ["polars-ops/binary_encoding", "polars-lazy?/binary_encoding", "polars-sql?/binary_encoding"] business = ["polars-lazy?/business", "polars-ops/business"] checked_arithmetic = ["polars-core/checked_arithmetic"] diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index a42c643516ea..1147cbdde89a 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -9,7 +9,7 @@ crate-type = ["cdylib"] [dependencies] libc = { workspace = true } -polars-python = { workspace = true, features = ["pymethods"] } +polars-python = { workspace = true, features = ["pymethods", "iejoin"] } pyo3 = { workspace = true, features = ["abi3-py38", "chrono", "extension-module", "multiple-pymethods"] } [build-dependencies] From 5658e650ce7be79be678aa4e60cb969e954f7416 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Tue, 10 Sep 2024 13:25:55 +0200 Subject: [PATCH 10/28] chore: Check predicates in join_where (#18648) --- crates/polars-plan/src/plans/conversion/join.rs | 1 + py-polars/tests/unit/operations/test_inequality_join.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs index 53a7bbd2274a..0a073934b5d3 100644 --- a/crates/polars-plan/src/plans/conversion/join.rs +++ b/crates/polars-plan/src/plans/conversion/join.rs @@ -44,6 +44,7 @@ pub fn resolve_join( if matches!(options.args.how, JoinType::Cross) { polars_ensure!(left_on.len() + right_on.len() == 0, InvalidOperation: "a 'cross' join doesn't expect any join keys"); } else { + polars_ensure!(left_on.len() + right_on.len() > 0, InvalidOperation: "expected join keys/predicates"); check_join_keys(&left_on)?; check_join_keys(&right_on)?; diff --git a/py-polars/tests/unit/operations/test_inequality_join.py b/py-polars/tests/unit/operations/test_inequality_join.py index 242a0e1f5e8e..94839c39fb4f 100644 --- a/py-polars/tests/unit/operations/test_inequality_join.py +++ b/py-polars/tests/unit/operations/test_inequality_join.py @@ -460,3 +460,9 @@ def test_raise_on_multiple_binary_comparisons() -> None: df.join_where( df, 
(pl.col("id") < pl.col("id")) & (pl.col("id") >= pl.col("id")) ) + + +def test_raise_invalid_input_join_where() -> None: + df = pl.DataFrame({"id": [1, 2]}) + with pytest.raises(pl.exceptions.InvalidOperationError): + df.join_where(df) From fe043908556a8013b55488fd292109a34ce6a1ec Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 10 Sep 2024 21:26:11 +1000 Subject: [PATCH 11/28] refactor(rust): Split `parquet_source.rs` in new-streaming (#18649) --- .../polars-stream/src/nodes/parquet_source.rs | 1867 ----------------- .../src/nodes/parquet_source/init.rs | 737 +++++++ .../parquet_source/mem_prefetch_funcs.rs | 71 + .../nodes/parquet_source/metadata_utils.rs | 156 ++ .../src/nodes/parquet_source/mod.rs | 262 +++ .../parquet_source/row_group_data_fetch.rs | 390 ++++ .../nodes/parquet_source/row_group_decode.rs | 287 +++ 7 files changed, 1903 insertions(+), 1867 deletions(-) delete mode 100644 crates/polars-stream/src/nodes/parquet_source.rs create mode 100644 crates/polars-stream/src/nodes/parquet_source/init.rs create mode 100644 crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs create mode 100644 crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs create mode 100644 crates/polars-stream/src/nodes/parquet_source/mod.rs create mode 100644 crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs create mode 100644 crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs deleted file mode 100644 index 199e8d2665bd..000000000000 --- a/crates/polars-stream/src/nodes/parquet_source.rs +++ /dev/null @@ -1,1867 +0,0 @@ -use std::future::Future; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; - -use futures::stream::FuturesUnordered; -use futures::StreamExt; -use polars_core::config; -use polars_core::frame::DataFrame; -use polars_core::prelude::{ - ArrowSchema, ChunkFull, DataType, IdxCa, InitHashMaps, PlHashMap, StringChunked, -}; -use polars_core::series::{IntoSeries, IsSorted, Series}; -use polars_core::utils::operation_exceeded_idxsize_msg; -use polars_error::{polars_bail, polars_err, PolarsResult}; -use polars_expr::prelude::PhysicalExpr; -use polars_io::cloud::CloudOptions; -use polars_io::predicates::PhysicalIoExpr; -use polars_io::prelude::_internal::read_this_row_group; -use polars_io::prelude::{FileMetadata, ParquetOptions}; -use polars_io::utils::byte_source::{ - ByteSource, DynByteSource, DynByteSourceBuilder, MemSliceByteSource, -}; -use polars_io::utils::slice::SplitSlicePosition; -use polars_io::RowIndex; -use polars_parquet::read::RowGroupMetadata; -use polars_plan::plans::hive::HivePartitions; -use polars_plan::plans::{FileInfo, ScanSources}; -use polars_plan::prelude::FileScanOptions; -use polars_utils::mmap::MemSlice; -use polars_utils::pl_str::PlSmallStr; -use polars_utils::slice::GetSaferUnchecked; -use polars_utils::IdxSize; - -use super::{MorselSeq, TaskPriority}; -use crate::async_executor::{self}; -use crate::async_primitives::connector::connector; -use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; -use crate::morsel::get_ideal_morsel_size; -use crate::utils::task_handles_ext; - -type AsyncTaskData = Option<( - Vec>, - async_executor::AbortOnDropHandle>, -)>; - -#[allow(clippy::type_complexity)] -pub struct ParquetSourceNode { - scan_sources: ScanSources, - file_info: FileInfo, - hive_parts: Option>>, - predicate: Option>, - options: ParquetOptions, - cloud_options: Option, - 
file_options: FileScanOptions, - // Run-time vars - config: Config, - verbose: bool, - physical_predicate: Option>, - projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, - byte_source_builder: DynByteSourceBuilder, - memory_prefetch_func: fn(&[u8]) -> (), - // This permit blocks execution until the first morsel is requested. - morsel_stream_starter: Option>, - // This is behind a Mutex so that we can call `shutdown()` asynchronously. - async_task_data: Arc>, - row_group_decoder: Option>, - is_finished: Arc, -} - -#[allow(clippy::too_many_arguments)] -impl ParquetSourceNode { - pub fn new( - scan_sources: ScanSources, - file_info: FileInfo, - hive_parts: Option>>, - predicate: Option>, - options: ParquetOptions, - cloud_options: Option, - file_options: FileScanOptions, - ) -> Self { - let verbose = config::verbose(); - - let byte_source_builder = if scan_sources.is_cloud_url() || config::force_async() { - DynByteSourceBuilder::ObjectStore - } else { - DynByteSourceBuilder::Mmap - }; - let memory_prefetch_func = get_memory_prefetch_func(verbose); - - Self { - scan_sources, - file_info, - hive_parts, - predicate, - options, - cloud_options, - file_options, - - config: Config { - // Initialized later - num_pipelines: 0, - metadata_prefetch_size: 0, - metadata_decode_ahead_size: 0, - row_group_prefetch_size: 0, - }, - verbose, - physical_predicate: None, - projected_arrow_fields: Arc::new([]), - byte_source_builder, - memory_prefetch_func, - - morsel_stream_starter: None, - async_task_data: Arc::new(tokio::sync::Mutex::new(None)), - row_group_decoder: None, - is_finished: Arc::new(AtomicBool::new(false)), - } - } -} - -mod compute_node_impl { - - use std::sync::Arc; - - use polars_expr::prelude::phys_expr_to_io_expr; - - use super::super::compute_node_prelude::*; - use super::{Config, ParquetSourceNode}; - use crate::morsel::SourceToken; - - impl ComputeNode for ParquetSourceNode { - fn name(&self) -> &str { - "parquet_source" - } - - fn initialize(&mut self, num_pipelines: usize) { - self.config = { - let metadata_prefetch_size = polars_core::config::get_file_prefetch_size(); - // Limit metadata decode to the number of threads. - let metadata_decode_ahead_size = - (metadata_prefetch_size / 2).min(1 + num_pipelines).max(1); - let row_group_prefetch_size = polars_core::config::get_rg_prefetch_size(); - - Config { - num_pipelines, - metadata_prefetch_size, - metadata_decode_ahead_size, - row_group_prefetch_size, - } - }; - - if self.verbose { - eprintln!("[ParquetSource]: {:?}", &self.config); - } - - self.init_projected_arrow_fields(); - self.physical_predicate = self.predicate.clone().map(phys_expr_to_io_expr); - - let (raw_morsel_receivers, morsel_stream_task_handle) = self.init_raw_morsel_stream(); - - self.async_task_data - .try_lock() - .unwrap() - .replace((raw_morsel_receivers, morsel_stream_task_handle)); - - let row_group_decoder = self.init_row_group_decoder(); - self.row_group_decoder = Some(Arc::new(row_group_decoder)); - } - - fn update_state( - &mut self, - recv: &mut [PortState], - send: &mut [PortState], - ) -> PolarsResult<()> { - use std::sync::atomic::Ordering; - - assert!(recv.is_empty()); - assert_eq!(send.len(), 1); - - if self.is_finished.load(Ordering::Relaxed) { - send[0] = PortState::Done; - assert!( - self.async_task_data.try_lock().unwrap().is_none(), - "should have already been shut down" - ); - } else if send[0] == PortState::Done { - { - // Early shutdown - our port state was set to `Done` by the downstream nodes. 
- self.shutdown_in_background(); - }; - self.is_finished.store(true, Ordering::Relaxed); - } else { - send[0] = PortState::Ready - } - - Ok(()) - } - - fn spawn<'env, 's>( - &'env mut self, - scope: &'s TaskScope<'s, 'env>, - recv: &mut [Option>], - send: &mut [Option>], - _state: &'s ExecutionState, - join_handles: &mut Vec>>, - ) { - use std::sync::atomic::Ordering; - - assert!(recv.is_empty()); - assert_eq!(send.len(), 1); - assert!(!self.is_finished.load(Ordering::Relaxed)); - - let morsel_senders = send[0].take().unwrap().parallel(); - - let mut async_task_data_guard = self.async_task_data.try_lock().unwrap(); - let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); - - assert_eq!(raw_morsel_receivers.len(), morsel_senders.len()); - - if let Some(v) = self.morsel_stream_starter.take() { - v.send(()).unwrap(); - } - let is_finished = self.is_finished.clone(); - - let task_handles = raw_morsel_receivers - .drain(..) - .zip(morsel_senders) - .map(|(mut raw_morsel_rx, mut morsel_tx)| { - let is_finished = is_finished.clone(); - - scope.spawn_task(TaskPriority::Low, async move { - let source_token = SourceToken::new(); - loop { - let Ok((df, morsel_seq, wait_token)) = raw_morsel_rx.recv().await - else { - is_finished.store(true, Ordering::Relaxed); - break; - }; - - let mut morsel = Morsel::new(df, morsel_seq, source_token.clone()); - morsel.set_consume_token(wait_token); - - if morsel_tx.send(morsel).await.is_err() { - break; - } - - if source_token.stop_requested() { - break; - } - } - - raw_morsel_rx - }) - }) - .collect::>(); - - drop(async_task_data_guard); - - let async_task_data = self.async_task_data.clone(); - - join_handles.push(scope.spawn_task(TaskPriority::Low, async move { - { - let mut async_task_data_guard = async_task_data.try_lock().unwrap(); - let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); - - for handle in task_handles { - raw_morsel_receivers.push(handle.await); - } - } - - if self.is_finished.load(Ordering::Relaxed) { - self.shutdown().await?; - } - - Ok(()) - })) - } - } -} - -impl ParquetSourceNode { - /// # Panics - /// Panics if called more than once. - async fn shutdown_impl( - async_task_data: Arc>, - verbose: bool, - ) -> PolarsResult<()> { - if verbose { - eprintln!("[ParquetSource]: Shutting down"); - } - - let (mut raw_morsel_receivers, morsel_stream_task_handle) = - async_task_data.try_lock().unwrap().take().unwrap(); - - raw_morsel_receivers.clear(); - // Join on the producer handle to catch errors/panics. - // Safety - // * We dropped the receivers on the line above - // * This function is only called once. - morsel_stream_task_handle.await - } - - fn shutdown(&self) -> impl Future> { - if self.verbose { - eprintln!("[ParquetSource]: Shutdown via `shutdown()`"); - } - Self::shutdown_impl(self.async_task_data.clone(), self.verbose) - } - - /// Spawns a task to shut down the source node to avoid blocking the current thread. This is - /// usually called when data is no longer needed from the source node, as such it does not - /// propagate any (non-critical) errors. If on the other hand the source node does not provide - /// more data when requested, then it is more suitable to call [`Self::shutdown`], as it returns - /// a result that can be used to distinguish between whether the data stream stopped due to an - /// error or EOF. 
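// The shutdown sequence above follows a classic two-step pattern: close the
// consuming side first, then join the producer so that any error or panic it
// hit is surfaced rather than lost. A toy, std-only sketch of that pattern
// (threads and mpsc standing in for the async runtime used by this node):

fn shutdown_pattern_sketch() -> Result<(), String> {
    use std::sync::mpsc;
    use std::thread;

    let (tx, rx) = mpsc::sync_channel::<u32>(1);
    let producer = thread::spawn(move || -> Result<(), String> {
        for v in 0.. {
            if tx.send(v).is_err() {
                // All receivers are gone; stop producing without error.
                return Ok(());
            }
        }
        Ok(())
    });

    drop(rx); // 1) drop the receivers
    // 2) join the producer handle to catch errors/panics
    producer.join().map_err(|_| "producer panicked".to_string())?
}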
- fn shutdown_in_background(&self) { - if self.verbose { - eprintln!("[ParquetSource]: Shutdown via `shutdown_in_background()`"); - } - let async_task_data = self.async_task_data.clone(); - polars_io::pl_async::get_runtime() - .spawn(Self::shutdown_impl(async_task_data, self.verbose)); - } - - /// Constructs the task that provides a morsel stream. - #[allow(clippy::type_complexity)] - fn init_raw_morsel_stream( - &mut self, - ) -> ( - Vec>, - async_executor::AbortOnDropHandle>, - ) { - let verbose = self.verbose; - - let use_statistics = self.options.use_statistics; - - let (mut raw_morsel_senders, raw_morsel_receivers): (Vec<_>, Vec<_>) = - (0..self.config.num_pipelines).map(|_| connector()).unzip(); - - if let Some((_, 0)) = self.file_options.slice { - return ( - raw_morsel_receivers, - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - std::future::ready(Ok(())), - )), - ); - } - - let reader_schema = self - .file_info - .reader_schema - .as_ref() - .unwrap() - .as_ref() - .unwrap_left() - .clone(); - - let (normalized_slice_oneshot_rx, metadata_rx, metadata_task_handle) = - self.init_metadata_fetcher(); - - let num_pipelines = self.config.num_pipelines; - let row_group_prefetch_size = self.config.row_group_prefetch_size; - let projection = self.file_options.with_columns.clone(); - assert_eq!(self.physical_predicate.is_some(), self.predicate.is_some()); - let predicate = self.physical_predicate.clone(); - let memory_prefetch_func = self.memory_prefetch_func; - - let mut row_group_data_fetcher = RowGroupDataFetcher { - metadata_rx, - use_statistics, - verbose, - reader_schema, - projection, - predicate, - slice_range: None, // Initialized later - memory_prefetch_func, - current_path_index: 0, - current_byte_source: Default::default(), - current_row_groups: Default::default(), - current_row_group_idx: 0, - current_max_row_group_height: 0, - current_row_offset: 0, - current_shared_file_state: Default::default(), - }; - - let row_group_decoder = self.init_row_group_decoder(); - let row_group_decoder = Arc::new(row_group_decoder); - - // Processes row group metadata and spawns I/O tasks to fetch row group data. This is - // currently spawned onto the CPU runtime as it does not directly make any async I/O calls, - // but instead it potentially performs predicate/slice evaluation on metadata. If we observe - // that under heavy CPU load scenarios the I/O throughput drops due to this task not being - // scheduled we can change it to be a high priority task. - let morsel_stream_task_handle = async_executor::spawn(TaskPriority::Low, async move { - let slice_range = { - let Ok(slice) = normalized_slice_oneshot_rx.await else { - // If we are here then the producer probably errored. - drop(row_group_data_fetcher); - return metadata_task_handle.await.unwrap(); - }; - - slice.map(|(offset, len)| offset..offset + len) - }; - - row_group_data_fetcher.slice_range = slice_range; - - // Pins a wait group to a channel index. - struct IndexedWaitGroup { - index: usize, - wait_group: WaitGroup, - } - - impl IndexedWaitGroup { - async fn wait(self) -> Self { - self.wait_group.wait().await; - self - } - } - - // Ensure proper backpressure by only polling the buffered iterator when a wait group - // is free. 
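// The backpressure primitive in miniature: each in-flight morsel carries a
// token, and the producer waits until every token for a lane has been dropped
// before producing for that lane again, as the loop below does with
// `IndexedWaitGroup`. A toy, synchronous wait-group with the same
// `token()`/`wait()` shape (the node itself uses an async variant):

use std::sync::{Arc, Condvar, Mutex};

#[derive(Default)]
struct WaitGroupSketch {
    state: Arc<(Mutex<usize>, Condvar)>,
}

struct TokenSketch(Arc<(Mutex<usize>, Condvar)>);

impl WaitGroupSketch {
    fn token(&self) -> TokenSketch {
        *self.state.0.lock().unwrap() += 1;
        TokenSketch(Arc::clone(&self.state))
    }

    fn wait(&self) {
        let (lock, cvar) = &*self.state;
        let mut outstanding = lock.lock().unwrap();
        while *outstanding > 0 {
            outstanding = cvar.wait(outstanding).unwrap();
        }
    }
}

impl Drop for TokenSketch {
    fn drop(&mut self) {
        let (lock, cvar) = &*self.0;
        *lock.lock().unwrap() -= 1;
        cvar.notify_all();
    }
}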
- let mut wait_groups = (0..num_pipelines) - .map(|index| { - let wait_group = WaitGroup::default(); - { - let _prime_this_wait_group = wait_group.token(); - } - IndexedWaitGroup { - index, - wait_group: WaitGroup::default(), - } - .wait() - }) - .collect::>(); - - let mut df_stream = row_group_data_fetcher - .into_stream() - .map(|x| async { - match x { - Ok(handle) => handle.await, - Err(e) => Err(e), - } - }) - .buffered(row_group_prefetch_size) - .map(|x| async { - let row_group_decoder = row_group_decoder.clone(); - - match x { - Ok(row_group_data) => { - async_executor::spawn(TaskPriority::Low, async move { - row_group_decoder.row_group_data_to_df(row_group_data).await - }) - .await - }, - Err(e) => Err(e), - } - }) - .buffered( - // Because we are using an ordered buffer, we may suffer from head-of-line blocking, - // so we add a small amount of buffer. - num_pipelines + 4, - ); - - let morsel_seq_ref = &mut MorselSeq::default(); - let mut dfs = vec![].into_iter(); - - 'main: loop { - let Some(mut indexed_wait_group) = wait_groups.next().await else { - break; - }; - - if dfs.len() == 0 { - let Some(v) = df_stream.next().await else { - break; - }; - - let v = v?; - assert!(!v.is_empty()); - - dfs = v.into_iter(); - } - - let mut df = dfs.next().unwrap(); - let morsel_seq = *morsel_seq_ref; - *morsel_seq_ref = morsel_seq.successor(); - - loop { - use crate::async_primitives::connector::SendError; - - let channel_index = indexed_wait_group.index; - let wait_token = indexed_wait_group.wait_group.token(); - - match raw_morsel_senders[channel_index].try_send((df, morsel_seq, wait_token)) { - Ok(_) => { - wait_groups.push(indexed_wait_group.wait()); - break; - }, - Err(SendError::Closed(v)) => { - // The channel assigned to this wait group has been closed, so we will not - // add it back to the list of wait groups, and we will try to send this - // across another channel. - df = v.0 - }, - Err(SendError::Full(_)) => unreachable!(), - } - - let Some(v) = wait_groups.next().await else { - // All channels have closed - break 'main; - }; - - indexed_wait_group = v; - } - } - - // Join on the producer handle to catch errors/panics. - drop(df_stream); - metadata_task_handle.await.unwrap() - }); - - let morsel_stream_task_handle = - async_executor::AbortOnDropHandle::new(morsel_stream_task_handle); - - (raw_morsel_receivers, morsel_stream_task_handle) - } - - /// Constructs the task that fetches file metadata. - /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized. - /// - /// TODO: During IR conversion the metadata of the first file is already downloaded - see if - /// we can find a way to re-use it. 
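// The distribution loop above retries on another lane when the channel
// assigned to a wait group reports `SendError::Closed`. The same fallback in
// a toy, synchronous form (std mpsc standing in for the async connector;
// `send_with_fallback` and its shape are illustrative only):

use std::sync::mpsc::{SyncSender, TrySendError};

fn send_with_fallback<T>(
    lanes: &mut Vec<SyncSender<T>>,
    preferred: usize,
    mut item: T,
) -> Result<(), T> {
    let mut idx = preferred;
    loop {
        if lanes.is_empty() {
            return Err(item); // every consumer hung up
        }
        idx = idx.min(lanes.len() - 1);
        match lanes[idx].try_send(item) {
            Ok(()) => return Ok(()),
            // Lane closed: forget it and try another one.
            Err(TrySendError::Disconnected(v)) => {
                item = v;
                lanes.remove(idx);
            },
            // Lane full: in the real node the wait-group token provides the
            // backpressure; a toy sketch can simply yield and retry.
            Err(TrySendError::Full(v)) => {
                item = v;
                std::thread::yield_now();
            },
        }
    }
}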
- #[allow(clippy::type_complexity)] - fn init_metadata_fetcher( - &mut self, - ) -> ( - tokio::sync::oneshot::Receiver>, - crate::async_primitives::connector::Receiver<( - usize, - usize, - Arc, - FileMetadata, - usize, - )>, - task_handles_ext::AbortOnDropHandle>, - ) { - let verbose = self.verbose; - let io_runtime = polars_io::pl_async::get_runtime(); - - assert!( - !self.projected_arrow_fields.is_empty() - || self.file_options.with_columns.as_deref() == Some(&[]) - ); - let projected_arrow_fields = self.projected_arrow_fields.clone(); - let needs_max_row_group_height_calc = - self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); - - let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = - tokio::sync::oneshot::channel(); - let (mut metadata_tx, metadata_rx) = connector(); - - let byte_source_builder = self.byte_source_builder.clone(); - - if self.verbose { - eprintln!( - "[ParquetSource]: Byte source builder: {:?}", - &byte_source_builder - ); - } - - let fetch_metadata_bytes_for_path_index = { - let scan_sources = &self.scan_sources; - let cloud_options = Arc::new(self.cloud_options.clone()); - - let scan_sources = scan_sources.clone(); - let cloud_options = cloud_options.clone(); - let byte_source_builder = byte_source_builder.clone(); - - move |path_idx: usize| { - let scan_sources = scan_sources.clone(); - let cloud_options = cloud_options.clone(); - let byte_source_builder = byte_source_builder.clone(); - - let handle = io_runtime.spawn(async move { - let mut byte_source = Arc::new( - scan_sources - .get(path_idx) - .unwrap() - .to_dyn_byte_source( - &byte_source_builder, - cloud_options.as_ref().as_ref(), - ) - .await?, - ); - let (metadata_bytes, maybe_full_bytes) = - read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?; - - if let Some(v) = maybe_full_bytes { - if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) { - if verbose { - eprintln!( - "[ParquetSource]: Parquet file was fully fetched during \ - metadata read ({} bytes).", - v.len(), - ); - } - - byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v))) - } - } - - PolarsResult::Ok((path_idx, byte_source, metadata_bytes)) - }); - - let handle = task_handles_ext::AbortOnDropHandle(handle); - - std::future::ready(handle) - } - }; - - let process_metadata_bytes = { - move |handle: task_handles_ext::AbortOnDropHandle< - PolarsResult<(usize, Arc, MemSlice)>, - >| { - let projected_arrow_fields = projected_arrow_fields.clone(); - // Run on CPU runtime - metadata deserialization is expensive, especially - // for very wide tables. 
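// The slice pushdown in the fetch loop further below classifies each file
// against the requested row range via `SplitSlicePosition::split_slice_at_file`:
// skip it (`Before`), read part of it (`Overlapping`), or stop reading
// entirely (`After`). Its semantics in a toy form (hypothetical enum carrying
// a `Range`, not the real polars-io type):

use std::ops::Range;

enum SlicePosSketch {
    Before,
    Overlapping(Range<usize>), // the in-file row range to read
    After,
}

fn split_slice_at_file_sketch(
    file_offset: usize,
    num_rows: usize,
    slice: Range<usize>,
) -> SlicePosSketch {
    let file_end = file_offset + num_rows;
    if file_end <= slice.start {
        SlicePosSketch::Before // file lies entirely before the slice
    } else if file_offset >= slice.end {
        SlicePosSketch::After // slice ended before this file
    } else {
        // Translate the global slice into an in-file row range.
        let start = slice.start.max(file_offset) - file_offset;
        let end = slice.end.min(file_end) - file_offset;
        SlicePosSketch::Overlapping(start..end)
    }
}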
- let handle = async_executor::spawn(TaskPriority::Low, async move { - let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?; - - let metadata = polars_parquet::parquet::read::deserialize_metadata( - metadata_bytes.as_ref(), - metadata_bytes.len() * 2 + 1024, - )?; - - ensure_metadata_has_projected_fields( - projected_arrow_fields.as_ref(), - &metadata, - )?; - - let file_max_row_group_height = if needs_max_row_group_height_calc { - metadata - .row_groups - .iter() - .map(|x| x.num_rows()) - .max() - .unwrap_or(0) - } else { - 0 - }; - - PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) - }); - - async_executor::AbortOnDropHandle::new(handle) - } - }; - - let metadata_prefetch_size = self.config.metadata_prefetch_size; - let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size; - - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - self.morsel_stream_starter = Some(start_tx); - - let metadata_task_handle = if self - .file_options - .slice - .map(|(offset, _)| offset >= 0) - .unwrap_or(true) - { - normalized_slice_oneshot_tx - .send( - self.file_options - .slice - .map(|(offset, len)| (offset as usize, len)), - ) - .unwrap(); - - // Safety: `offset + len` does not overflow. - let slice_range = self - .file_options - .slice - .map(|(offset, len)| offset as usize..offset as usize + len); - - let mut metadata_stream = futures::stream::iter(0..self.scan_sources.len()) - .map(fetch_metadata_bytes_for_path_index) - .buffered(metadata_prefetch_size) - .map(process_metadata_bytes) - .buffered(metadata_decode_ahead_size); - - let scan_sources = self.scan_sources.clone(); - - // We need to be able to both stop early as well as skip values, which is easier to do - // using a custom task instead of futures::stream - io_runtime.spawn(async move { - let current_row_offset_ref = &mut 0usize; - let current_path_index_ref = &mut 0usize; - - if start_rx.await.is_err() { - return Ok(()); - } - - if verbose { - eprintln!("[ParquetSource]: Starting data fetch") - } - - loop { - let current_path_index = *current_path_index_ref; - *current_path_index_ref += 1; - - let Some(v) = metadata_stream.next().await else { - break; - }; - - let (path_index, byte_source, metadata, file_max_row_group_height) = v - .map_err(|err| { - err.wrap_msg(|msg| { - format!( - "error at path (index: {}, path: {:?}): {}", - current_path_index, - scan_sources - .get(current_path_index) - .map(|x| PlSmallStr::from_str(x.to_include_path_name())), - msg - ) - }) - })?; - - assert_eq!(path_index, current_path_index); - - let current_row_offset = *current_row_offset_ref; - *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); - - if let Some(slice_range) = slice_range.clone() { - match SplitSlicePosition::split_slice_at_file( - current_row_offset, - metadata.num_rows, - slice_range, - ) { - SplitSlicePosition::Before => { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Skipped file at index {} ({} rows)", - current_path_index, metadata.num_rows - ); - } - continue; - }, - SplitSlicePosition::After => unreachable!(), - SplitSlicePosition::Overlapping(..) 
=> {}, - }; - }; - - if metadata_tx - .send(( - path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) - .await - .is_err() - { - break; - } - - if let Some(slice_range) = slice_range.as_ref() { - if *current_row_offset_ref >= slice_range.end { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stopped reading at file at index {} \ - (remaining {} files will not be read)", - current_path_index, - scan_sources.len() - current_path_index - 1, - ); - } - break; - } - }; - } - - Ok(()) - }) - } else { - // Walk the files in reverse to translate the slice into a positive offset. - let slice = self.file_options.slice.unwrap(); - let slice_start_as_n_from_end = -slice.0 as usize; - - let mut metadata_stream = futures::stream::iter((0..self.scan_sources.len()).rev()) - .map(fetch_metadata_bytes_for_path_index) - .buffered(metadata_prefetch_size) - .map(process_metadata_bytes) - .buffered(metadata_decode_ahead_size); - - // Note: - // * We want to wait until the first morsel is requested before starting this - let init_negative_slice_and_metadata = async move { - let mut processed_metadata_rev = vec![]; - let mut cum_rows = 0; - - while let Some(v) = metadata_stream.next().await { - let v = v?; - let (_, _, metadata, _) = &v; - cum_rows += metadata.num_rows; - processed_metadata_rev.push(v); - - if cum_rows >= slice_start_as_n_from_end { - break; - } - } - - let (start, len) = if slice_start_as_n_from_end > cum_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. - let first_file_position = slice_start_as_n_from_end - cum_rows; - (0, slice.1.saturating_sub(first_file_position)) - } else { - (cum_rows - slice_start_as_n_from_end, slice.1) - }; - - if len == 0 { - processed_metadata_rev.clear(); - } - - normalized_slice_oneshot_tx - .send(Some((start, len))) - .unwrap(); - - let slice_range = start..(start + len); - - PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) - }; - - let path_count = self.scan_sources.len(); - - io_runtime.spawn(async move { - if start_rx.await.is_err() { - return Ok(()); - } - - if verbose { - eprintln!("[ParquetSource]: Starting data fetch (negative slice)") - } - - let (slice_range, processed_metadata_rev, cum_rows) = - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - init_negative_slice_and_metadata, - )) - .await?; - - if verbose { - if let Some((path_index, ..)) = processed_metadata_rev.last() { - eprintln!( - "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ - begins at file index {}, translated to {:?}", - slice, path_index, slice_range - ); - } else { - eprintln!( - "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ - skipped all files ({} files containing {} rows)", - slice, path_count, cum_rows - ) - } - } - - let metadata_iter = processed_metadata_rev.into_iter().rev(); - let current_row_offset_ref = &mut 0usize; - - for (current_path_index, byte_source, metadata, file_max_row_group_height) in - metadata_iter - { - let current_row_offset = *current_row_offset_ref; - *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); - - assert!(matches!( - SplitSlicePosition::split_slice_at_file( - current_row_offset, - metadata.num_rows, - slice_range.clone(), - ), - SplitSlicePosition::Overlapping(..) 
- )); - - if metadata_tx - .send(( - current_path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) - .await - .is_err() - { - break; - } - - if *current_row_offset_ref >= slice_range.end { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stopped reading at file at index {} \ - (remaining {} files will not be read)", - current_path_index, - path_count - current_path_index - 1, - ); - } - break; - } - } - - Ok(()) - }) - }; - - let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); - - ( - normalized_slice_oneshot_rx, - metadata_rx, - metadata_task_handle, - ) - } - - /// Creates a `RowGroupDecoder` that turns `RowGroupData` into DataFrames. - /// This must be called AFTER the following have been initialized: - /// * `self.projected_arrow_fields` - /// * `self.physical_predicate` - fn init_row_group_decoder(&self) -> RowGroupDecoder { - assert!( - !self.projected_arrow_fields.is_empty() - || self.file_options.with_columns.as_deref() == Some(&[]) - ); - assert_eq!(self.predicate.is_some(), self.physical_predicate.is_some()); - - let scan_sources = self.scan_sources.clone(); - let hive_partitions = self.hive_parts.clone(); - let hive_partitions_width = hive_partitions - .as_deref() - .map(|x| x[0].get_statistics().column_stats().len()) - .unwrap_or(0); - let include_file_paths = self.file_options.include_file_paths.clone(); - let projected_arrow_fields = self.projected_arrow_fields.clone(); - let row_index = self.file_options.row_index.clone(); - let physical_predicate = self.physical_predicate.clone(); - let ideal_morsel_size = get_ideal_morsel_size(); - - RowGroupDecoder { - scan_sources, - hive_partitions, - hive_partitions_width, - include_file_paths, - projected_arrow_fields, - row_index, - physical_predicate, - ideal_morsel_size, - } - } - - fn init_projected_arrow_fields(&mut self) { - let reader_schema = self - .file_info - .reader_schema - .as_ref() - .unwrap() - .as_ref() - .unwrap_left() - .clone(); - - self.projected_arrow_fields = - if let Some(columns) = self.file_options.with_columns.as_deref() { - columns - .iter() - .map(|x| reader_schema.get(x).unwrap().clone()) - .collect() - } else { - reader_schema.iter_values().cloned().collect() - }; - - if self.verbose { - eprintln!( - "[ParquetSource]: {} columns to be projected from {} files", - self.projected_arrow_fields.len(), - self.scan_sources.len(), - ); - } - } -} - -#[derive(Debug)] -struct Config { - num_pipelines: usize, - /// Number of files to pre-fetch metadata for concurrently - metadata_prefetch_size: usize, - /// Number of files to decode metadata for in parallel in advance - metadata_decode_ahead_size: usize, - /// Number of row groups to pre-fetch concurrently, this can be across files - row_group_prefetch_size: usize, -} - -/// Represents byte-data that can be transformed into a DataFrame after some computation. 
-struct RowGroupData { - byte_source: FetchedBytes, - path_index: usize, - row_offset: usize, - slice: Option<(usize, usize)>, - file_max_row_group_height: usize, - row_group_metadata: RowGroupMetadata, - shared_file_state: Arc>, -} - -struct RowGroupDataFetcher { - metadata_rx: crate::async_primitives::connector::Receiver<( - usize, - usize, - Arc, - FileMetadata, - usize, - )>, - use_statistics: bool, - verbose: bool, - reader_schema: Arc, - projection: Option>, - predicate: Option>, - slice_range: Option>, - memory_prefetch_func: fn(&[u8]) -> (), - current_path_index: usize, - current_byte_source: Arc, - current_row_groups: std::vec::IntoIter, - current_row_group_idx: usize, - current_max_row_group_height: usize, - current_row_offset: usize, - current_shared_file_state: Arc>, -} - -impl RowGroupDataFetcher { - fn into_stream(self) -> RowGroupDataStream { - RowGroupDataStream::new(self) - } - - async fn init_next_file_state(&mut self) -> bool { - let Ok((path_index, row_offset, byte_source, metadata, file_max_row_group_height)) = - self.metadata_rx.recv().await - else { - return false; - }; - - self.current_path_index = path_index; - self.current_byte_source = byte_source; - self.current_max_row_group_height = file_max_row_group_height; - // The metadata task also sends a row offset to start counting from as it may skip files - // during slice pushdown. - self.current_row_offset = row_offset; - self.current_row_group_idx = 0; - self.current_row_groups = metadata.row_groups.into_iter(); - self.current_shared_file_state = Default::default(); - - true - } - - async fn next( - &mut self, - ) -> Option>>> { - 'main: loop { - for row_group_metadata in self.current_row_groups.by_ref() { - let current_row_offset = self.current_row_offset; - let current_row_group_idx = self.current_row_group_idx; - - let num_rows = row_group_metadata.num_rows(); - - self.current_row_offset = current_row_offset.saturating_add(num_rows); - self.current_row_group_idx += 1; - - if self.use_statistics - && !match read_this_row_group( - self.predicate.as_deref(), - &row_group_metadata, - self.reader_schema.as_ref(), - ) { - Ok(v) => v, - Err(e) => return Some(Err(e)), - } - { - if self.verbose { - eprintln!( - "[ParquetSource]: Predicate pushdown: \ - Skipped row group {} in file {} ({} rows)", - current_row_group_idx, self.current_path_index, num_rows - ); - } - continue; - } - - if num_rows > IdxSize::MAX as usize { - let msg = operation_exceeded_idxsize_msg( - format!("number of rows in row group ({})", num_rows).as_str(), - ); - return Some(Err(polars_err!(ComputeError: msg))); - } - - let slice = if let Some(slice_range) = self.slice_range.clone() { - let (offset, len) = match SplitSlicePosition::split_slice_at_file( - current_row_offset, - num_rows, - slice_range, - ) { - SplitSlicePosition::Before => { - if self.verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Skipped row group {} in file {} ({} rows)", - current_row_group_idx, self.current_path_index, num_rows - ); - } - continue; - }, - SplitSlicePosition::After => { - if self.verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stop at row group {} in file {} \ - (remaining {} row groups will not be read)", - current_row_group_idx, - self.current_path_index, - self.current_row_groups.len(), - ); - }; - break 'main; - }, - SplitSlicePosition::Overlapping(offset, len) => (offset, len), - }; - - Some((offset, len)) - } else { - None - }; - - let current_byte_source = self.current_byte_source.clone(); - let projection = 
self.projection.clone(); - let current_shared_file_state = self.current_shared_file_state.clone(); - let memory_prefetch_func = self.memory_prefetch_func; - let io_runtime = polars_io::pl_async::get_runtime(); - let current_path_index = self.current_path_index; - let current_max_row_group_height = self.current_max_row_group_height; - - // Push calculation of byte ranges to a task to run in parallel, as it can be - // expensive for very wide tables and projections. - let handle = async_executor::spawn(TaskPriority::Low, async move { - let byte_source = if let DynByteSource::MemSlice(mem_slice) = - current_byte_source.as_ref() - { - // Skip byte range calculation for `no_prefetch`. - if memory_prefetch_func as usize != mem_prefetch_funcs::no_prefetch as usize - { - let slice = mem_slice.0.as_ref(); - - if let Some(columns) = projection.as_ref() { - for range in get_row_group_byte_ranges_for_projection( - &row_group_metadata, - columns.as_ref(), - ) { - memory_prefetch_func(unsafe { - slice.get_unchecked_release(range) - }) - } - } else { - let mut iter = get_row_group_byte_ranges(&row_group_metadata); - let first = iter.next().unwrap(); - let range = - iter.fold(first, |l, r| l.start.min(r.start)..l.end.max(r.end)); - - memory_prefetch_func(unsafe { slice.get_unchecked_release(range) }) - }; - } - - // We have a mmapped or in-memory slice representing the entire - // file that can be sliced directly, so we can skip the byte-range - // calculations and HashMap allocation. - let mem_slice = mem_slice.0.clone(); - FetchedBytes::MemSlice { - offset: 0, - mem_slice, - } - } else if let Some(columns) = projection.as_ref() { - let ranges = get_row_group_byte_ranges_for_projection( - &row_group_metadata, - columns.as_ref(), - ) - .collect::>(); - - let bytes = { - let ranges_2 = ranges.clone(); - task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { - current_byte_source.get_ranges(ranges_2.as_ref()).await - })) - .await - .unwrap()? - }; - - assert_eq!(bytes.len(), ranges.len()); - - let mut bytes_map = PlHashMap::with_capacity(ranges.len()); - - for (range, bytes) in ranges.iter().zip(bytes) { - memory_prefetch_func(bytes.as_ref()); - let v = bytes_map.insert(range.start, bytes); - debug_assert!(v.is_none(), "duplicate range start {}", range.start); - } - - FetchedBytes::BytesMap(bytes_map) - } else { - // We have a dedicated code-path for a full projection that performs a - // single range request for the entire row group. During testing this - // provided much higher throughput from cloud than making multiple range - // request with `get_ranges()`. - let mut iter = get_row_group_byte_ranges(&row_group_metadata); - let mut ranges = Vec::with_capacity(iter.len()); - let first = iter.next().unwrap(); - ranges.push(first.clone()); - let full_range = iter.fold(first, |l, r| { - ranges.push(r.clone()); - l.start.min(r.start)..l.end.max(r.end) - }); - - let mem_slice = { - let full_range_2 = full_range.clone(); - task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { - current_byte_source.get_range(full_range_2).await - })) - .await - .unwrap()? 
-                        };
-
-                        FetchedBytes::MemSlice {
-                            offset: full_range.start,
-                            mem_slice,
-                        }
-                    };
-
-                    PolarsResult::Ok(RowGroupData {
-                        byte_source,
-                        path_index: current_path_index,
-                        row_offset: current_row_offset,
-                        slice,
-                        file_max_row_group_height: current_max_row_group_height,
-                        row_group_metadata,
-                        shared_file_state: current_shared_file_state.clone(),
-                    })
-                });
-
-                let handle = async_executor::AbortOnDropHandle::new(handle);
-                return Some(Ok(handle));
-            }
-
-            // Initialize state to the next file.
-            if !self.init_next_file_state().await {
-                break;
-            }
-        }
-
-        None
-    }
-}
-
-enum FetchedBytes {
-    MemSlice { mem_slice: MemSlice, offset: usize },
-    BytesMap(PlHashMap<usize, MemSlice>),
-}
-
-impl FetchedBytes {
-    fn get_range(&self, range: std::ops::Range<usize>) -> MemSlice {
-        match self {
-            Self::MemSlice { mem_slice, offset } => {
-                let offset = *offset;
-                debug_assert!(range.start >= offset);
-                mem_slice.slice(range.start - offset..range.end - offset)
-            },
-            Self::BytesMap(v) => {
-                let v = v.get(&range.start).unwrap();
-                debug_assert_eq!(v.len(), range.len());
-                v.clone()
-            },
-        }
-    }
-}
-
-#[rustfmt::skip]
-type RowGroupDataStreamFut = std::pin::Pin<Box<
-    dyn Future<
-        Output =
-            (
-                Box<RowGroupDataFetcher>,
-                Option<
-                    PolarsResult<
-                        async_executor::AbortOnDropHandle<
-                            PolarsResult<
-                                RowGroupData>>>>
-            )
-    > + Send
->>;
-
-struct RowGroupDataStream {
-    current_future: RowGroupDataStreamFut,
-}
-
-impl RowGroupDataStream {
-    fn new(row_group_data_fetcher: RowGroupDataFetcher) -> Self {
-        // [`RowGroupDataFetcher`] is a big struct, so we Box it once here to avoid boxing it on
-        // every `next()` call.
-        let current_future = Self::call_next_owned(Box::new(row_group_data_fetcher));
-        Self { current_future }
-    }
-
-    fn call_next_owned(
-        mut row_group_data_fetcher: Box<RowGroupDataFetcher>,
-    ) -> RowGroupDataStreamFut {
-        Box::pin(async move {
-            let out = row_group_data_fetcher.next().await;
-            (row_group_data_fetcher, out)
-        })
-    }
-}
-
-impl futures::stream::Stream for RowGroupDataStream {
-    type Item = PolarsResult<async_executor::AbortOnDropHandle<PolarsResult<RowGroupData>>>;
-
-    fn poll_next(
-        mut self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> std::task::Poll<Option<Self::Item>> {
-        use std::pin::Pin;
-        use std::task::Poll;
-
-        match Pin::new(&mut self.current_future.as_mut()).poll(cx) {
-            Poll::Ready((row_group_data_fetcher, out)) => {
-                if out.is_some() {
-                    self.current_future = Self::call_next_owned(row_group_data_fetcher);
-                }
-
-                Poll::Ready(out)
-            },
-            Poll::Pending => Poll::Pending,
-        }
-    }
-}
-
-/// State shared across row groups for a single file.
-struct SharedFileState {
-    path_index: usize,
-    hive_series: Vec<Series>,
-    file_path_series: Option<Series>,
-}
-
-/// Turns row group data into DataFrames.
-struct RowGroupDecoder {
-    scan_sources: ScanSources,
-    hive_partitions: Option<Arc<Vec<HivePartitions>>>,
-    hive_partitions_width: usize,
-    include_file_paths: Option<PlSmallStr>,
-    projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>,
-    row_index: Option<RowIndex>,
-    physical_predicate: Option<Arc<dyn PhysicalIoExpr>>,
-    ideal_morsel_size: usize,
-}
-
-impl RowGroupDecoder {
-    async fn row_group_data_to_df(
-        &self,
-        row_group_data: RowGroupData,
-    ) -> PolarsResult<Vec<DataFrame>> {
-        let row_group_data = Arc::new(row_group_data);
-
-        let out_width = self.row_index.is_some() as usize
-            + self.projected_arrow_fields.len()
-            + self.hive_partitions_width
-            + self.include_file_paths.is_some() as usize;
-
-        let mut out_columns = Vec::with_capacity(out_width);
-
-        if self.row_index.is_some() {
-            // Add a placeholder so that we don't have to shift the entire vec
-            // later.
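        // A minimal sketch (not part of this diff) of why a placeholder beats a
        // front-insert; `decoded` and `row_index_col` are hypothetical names:
        //
        //     let mut cols = Vec::with_capacity(out_width);
        //     cols.push(Series::default()); // placeholder, O(1) to overwrite
        //     cols.extend(decoded);         // decoded columns land at index 1..
        //     cols[0] = row_index_col;      // vs. cols.insert(0, ..), which shifts everything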
- out_columns.push(Series::default()); - } - - let slice_range = row_group_data - .slice - .map(|(offset, len)| offset..offset + len) - .unwrap_or(0..row_group_data.row_group_metadata.num_rows()); - - let projected_arrow_fields = &self.projected_arrow_fields; - let projected_arrow_fields = projected_arrow_fields.clone(); - - let row_group_data_2 = row_group_data.clone(); - let slice_range_2 = slice_range.clone(); - - // Minimum number of values to amortize the overhead of spawning tasks. - // This value is arbitrarily chosen. - const VALUES_PER_THREAD: usize = 16_777_216; - let n_rows = row_group_data.row_group_metadata.num_rows(); - let cols_per_task = 1 + VALUES_PER_THREAD / n_rows; - - let decode_fut_iter = (0..self.projected_arrow_fields.len()) - .step_by(cols_per_task) - .map(move |offset| { - let row_group_data = row_group_data_2.clone(); - let slice_range = slice_range_2.clone(); - let projected_arrow_fields = projected_arrow_fields.clone(); - - async move { - (offset - ..offset - .saturating_add(cols_per_task) - .min(projected_arrow_fields.len())) - .map(|i| { - let arrow_field = projected_arrow_fields[i].clone(); - - let columns_to_deserialize = row_group_data - .row_group_metadata - .columns_under_root_iter(&arrow_field.name) - .map(|col_md| { - let byte_range = col_md.byte_range(); - - ( - col_md, - row_group_data.byte_source.get_range( - byte_range.start as usize..byte_range.end as usize, - ), - ) - }) - .collect::>(); - - assert!( - slice_range.end <= row_group_data.row_group_metadata.num_rows() - ); - - let array = polars_io::prelude::_internal::to_deserializer( - columns_to_deserialize, - arrow_field.clone(), - Some(polars_parquet::read::Filter::Range(slice_range.clone())), - )?; - - let series = Series::try_from((&arrow_field, array))?; - - // TODO: Also load in the metadata. - - PolarsResult::Ok(series) - }) - .collect::>>() - } - }); - - if decode_fut_iter.len() > 1 { - for handle in decode_fut_iter.map(|fut| { - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - fut, - )) - }) { - out_columns.extend(handle.await?); - } - } else { - for fut in decode_fut_iter { - out_columns.extend(fut.await?); - } - } - - let projection_height = if self.projected_arrow_fields.is_empty() { - slice_range.len() - } else { - debug_assert!(out_columns.len() > self.row_index.is_some() as usize); - out_columns.last().unwrap().len() - }; - - if let Some(RowIndex { name, offset }) = self.row_index.as_ref() { - let Some(offset) = (|| { - let offset = offset - .checked_add((row_group_data.row_offset + slice_range.start) as IdxSize)?; - offset.checked_add(projection_height as IdxSize)?; - - Some(offset) - })() else { - let msg = format!( - "adding a row index column with offset {} overflows at {} rows", - offset, - row_group_data.row_offset + slice_range.end - ); - polars_bail!(ComputeError: msg) - }; - - // The DataFrame can be empty at this point if no columns were projected from the file, - // so we create the row index column manually instead of using `df.with_row_index` to - // ensure it has the correct number of rows. 
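            // An illustrative example (not part of this diff), assuming IdxSize = u32:
            // with offset = 100 and projection_height = 3 the column built below is
            // [100, 101, 102]; the checked_add calls above reject e.g.
            // offset = 4_294_967_290 with 10 rows, which would wrap past u32::MAX.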
- let mut ca = IdxCa::from_vec( - name.clone(), - (offset..offset + projection_height as IdxSize).collect(), - ); - ca.set_sorted_flag(IsSorted::Ascending); - - out_columns[0] = ca.into_series(); - } - - let shared_file_state = row_group_data - .shared_file_state - .get_or_init(|| async { - let path_index = row_group_data.path_index; - - let hive_series = if let Some(hp) = self.hive_partitions.as_deref() { - let mut v = hp[path_index].materialize_partition_columns(); - for s in v.iter_mut() { - *s = s.new_from_index(0, row_group_data.file_max_row_group_height); - } - v - } else { - vec![] - }; - - let file_path_series = self.include_file_paths.clone().map(|file_path_col| { - StringChunked::full( - file_path_col, - self.scan_sources - .get(path_index) - .unwrap() - .to_include_path_name(), - row_group_data.file_max_row_group_height, - ) - .into_series() - }); - - SharedFileState { - path_index, - hive_series, - file_path_series, - } - }) - .await; - - assert_eq!(shared_file_state.path_index, row_group_data.path_index); - - for s in &shared_file_state.hive_series { - debug_assert!(s.len() >= projection_height); - out_columns.push(s.slice(0, projection_height)); - } - - if let Some(file_path_series) = &shared_file_state.file_path_series { - debug_assert!(file_path_series.len() >= projection_height); - out_columns.push(file_path_series.slice(0, projection_height)); - } - - let df = unsafe { DataFrame::new_no_checks(out_columns) }; - - // Re-calculate: A slice may have been applied. - let cols_per_task = 1 + VALUES_PER_THREAD / df.height(); - - let df = if let Some(predicate) = self.physical_predicate.as_deref() { - let mask = predicate.evaluate_io(&df)?; - let mask = mask.bool().unwrap(); - - if cols_per_task <= df.width() { - df._filter_seq(mask)? - } else { - let mask = mask.clone(); - let cols = Arc::new(df.take_columns()); - let mut out_cols = Vec::with_capacity(cols.len()); - - for handle in (0..cols.len()) - .step_by(cols_per_task) - .map(move |offset| { - let cols = cols.clone(); - let mask = mask.clone(); - async move { - cols[offset..offset.saturating_add(cols_per_task).min(cols.len())] - .iter() - .map(|s| s.filter(&mask)) - .collect::>>() - } - }) - .map(|fut| { - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - fut, - )) - }) - { - out_cols.extend(handle.await?); - } - - unsafe { DataFrame::new_no_checks(out_cols) } - } - } else { - df - }; - - assert_eq!(df.width(), out_width); - - let n_morsels = if df.height() > 3 * self.ideal_morsel_size / 2 { - // num_rows > (1.5 * ideal_morsel_size) - (df.height() / self.ideal_morsel_size).max(2) - } else { - 1 - } as u64; - - if n_morsels == 1 { - return Ok(vec![df]); - } - - let rows_per_morsel = 1 + df.height() / n_morsels as usize; - - let out = (0..i64::try_from(df.height()).unwrap()) - .step_by(rows_per_morsel) - .map(|offset| df.slice(offset, rows_per_morsel)) - .collect::>(); - - Ok(out) - } -} - -/// Read the metadata bytes of a parquet file, does not decode the bytes. If during metadata fetch -/// the bytes of the entire file are loaded, it is returned in the second return value. 
-async fn read_parquet_metadata_bytes( - byte_source: &DynByteSource, - verbose: bool, -) -> PolarsResult<(MemSlice, Option)> { - use polars_parquet::parquet::error::ParquetError; - use polars_parquet::parquet::PARQUET_MAGIC; - - const FOOTER_HEADER_SIZE: usize = polars_parquet::parquet::FOOTER_SIZE as usize; - - let file_size = byte_source.get_size().await?; - - if file_size < FOOTER_HEADER_SIZE { - return Err(ParquetError::OutOfSpec(format!( - "file size ({}) is less than minimum size required to store parquet footer ({})", - file_size, FOOTER_HEADER_SIZE - )) - .into()); - } - - let estimated_metadata_size = if let DynByteSource::MemSlice(_) = byte_source { - // Mmapped or in-memory, reads are free. - file_size - } else { - (file_size / 2048).clamp(16_384, 131_072).min(file_size) - }; - - let bytes = byte_source - .get_range((file_size - estimated_metadata_size)..file_size) - .await?; - - let footer_header_bytes = bytes.slice((bytes.len() - FOOTER_HEADER_SIZE)..bytes.len()); - - let (v, remaining) = footer_header_bytes.split_at(4); - let footer_size = i32::from_le_bytes(v.try_into().unwrap()); - - if remaining != PARQUET_MAGIC { - return Err(ParquetError::OutOfSpec(format!( - r#"expected parquet magic bytes "{}" in footer, got "{}" instead"#, - std::str::from_utf8(&PARQUET_MAGIC).unwrap(), - String::from_utf8_lossy(remaining) - )) - .into()); - } - - if footer_size < 0 { - return Err(ParquetError::OutOfSpec(format!( - "expected positive footer size, got {} instead", - footer_size - )) - .into()); - } - - let footer_size = footer_size as usize + FOOTER_HEADER_SIZE; - - if file_size < footer_size { - return Err(ParquetError::OutOfSpec(format!( - "file size ({}) is less than the indicated footer size ({})", - file_size, footer_size - )) - .into()); - } - - if bytes.len() < footer_size { - debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); - if verbose { - eprintln!( - "[ParquetSource]: Extra {} bytes need to be fetched for metadata \ - (initial estimate = {}, actual size = {})", - footer_size - estimated_metadata_size, - bytes.len(), - footer_size, - ); - } - - let mut out = Vec::with_capacity(footer_size); - let offset = file_size - footer_size; - let len = footer_size - bytes.len(); - let delta_bytes = byte_source.get_range(offset..(offset + len)).await?; - - debug_assert!(out.capacity() >= delta_bytes.len() + bytes.len()); - - out.extend_from_slice(&delta_bytes); - out.extend_from_slice(&bytes); - - Ok((MemSlice::from_vec(out), None)) - } else { - if verbose && !matches!(byte_source, DynByteSource::MemSlice(_)) { - eprintln!( - "[ParquetSource]: Fetched all bytes for metadata on first try \ - (initial estimate = {}, actual size = {}, excess = {})", - bytes.len(), - footer_size, - estimated_metadata_size - footer_size, - ); - } - - let metadata_bytes = bytes.slice((bytes.len() - footer_size)..bytes.len()); - - if bytes.len() == file_size { - Ok((metadata_bytes, Some(bytes))) - } else { - debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); - let metadata_bytes = if bytes.len() - footer_size >= bytes.len() { - // Re-allocate to drop the excess bytes - MemSlice::from_vec(metadata_bytes.to_vec()) - } else { - metadata_bytes - }; - - Ok((metadata_bytes, None)) - } - } -} - -fn get_row_group_byte_ranges( - row_group_metadata: &RowGroupMetadata, -) -> impl ExactSizeIterator> + '_ { - row_group_metadata - .byte_ranges_iter() - .map(|byte_range| byte_range.start as usize..byte_range.end as usize) -} - -fn get_row_group_byte_ranges_for_projection<'a>( - 
row_group_metadata: &'a RowGroupMetadata, - columns: &'a [PlSmallStr], -) -> impl Iterator> + 'a { - columns.iter().flat_map(|col_name| { - row_group_metadata - .columns_under_root_iter(col_name) - .map(|col| { - let byte_range = col.byte_range(); - byte_range.start as usize..byte_range.end as usize - }) - }) -} - -/// Ensures that a parquet file has all the necessary columns for a projection with the correct -/// dtype. There are no ordering requirements and extra columns are permitted. -fn ensure_metadata_has_projected_fields( - projected_fields: &[polars_core::prelude::ArrowField], - metadata: &FileMetadata, -) -> PolarsResult<()> { - let schema = polars_parquet::arrow::read::infer_schema(metadata)?; - - // Note: We convert to Polars-native dtypes for timezone normalization. - let mut schema = schema - .into_iter_values() - .map(|x| { - let dtype = DataType::from_arrow(&x.dtype, true); - (x.name, dtype) - }) - .collect::>(); - - for field in projected_fields { - let Some(dtype) = schema.remove(&field.name) else { - polars_bail!(SchemaMismatch: "did not find column: {}", field.name) - }; - - let expected_dtype = DataType::from_arrow(&field.dtype, true); - - if dtype != expected_dtype { - polars_bail!(SchemaMismatch: "data type mismatch for column {}: found: {}, expected: {}", - &field.name, dtype, expected_dtype - ) - } - } - - Ok(()) -} - -fn get_memory_prefetch_func(verbose: bool) -> fn(&[u8]) -> () { - let memory_prefetch_func = match std::env::var("POLARS_MEMORY_PREFETCH").ok().as_deref() { - None => { - // Sequential advice was observed to provide speedups on Linux. - // ref https://github.com/pola-rs/polars/pull/18152#discussion_r1721701965 - #[cfg(target_os = "linux")] - { - mem_prefetch_funcs::madvise_sequential - } - #[cfg(not(target_os = "linux"))] - { - mem_prefetch_funcs::no_prefetch - } - }, - Some("no_prefetch") => mem_prefetch_funcs::no_prefetch, - Some("prefetch_l2") => mem_prefetch_funcs::prefetch_l2, - Some("madvise_sequential") => { - #[cfg(target_family = "unix")] - { - mem_prefetch_funcs::madvise_sequential - } - #[cfg(not(target_family = "unix"))] - { - panic!("POLARS_MEMORY_PREFETCH=madvise_sequential is not supported by this system"); - } - }, - Some("madvise_willneed") => { - #[cfg(target_family = "unix")] - { - mem_prefetch_funcs::madvise_willneed - } - #[cfg(not(target_family = "unix"))] - { - panic!("POLARS_MEMORY_PREFETCH=madvise_willneed is not supported by this system"); - } - }, - Some("madvise_populate_read") => { - #[cfg(target_os = "linux")] - { - mem_prefetch_funcs::madvise_populate_read - } - #[cfg(not(target_os = "linux"))] - { - panic!( - "POLARS_MEMORY_PREFETCH=madvise_populate_read is not supported by this system" - ); - } - }, - Some(v) => panic!("invalid value for POLARS_MEMORY_PREFETCH: {}", v), - }; - - if verbose { - let func_name = match memory_prefetch_func as usize { - v if v == mem_prefetch_funcs::no_prefetch as usize => "no_prefetch", - v if v == mem_prefetch_funcs::prefetch_l2 as usize => "prefetch_l2", - v if v == mem_prefetch_funcs::madvise_sequential as usize => "madvise_sequential", - v if v == mem_prefetch_funcs::madvise_willneed as usize => "madvise_willneed", - v if v == mem_prefetch_funcs::madvise_populate_read as usize => "madvise_populate_read", - _ => unreachable!(), - }; - - eprintln!("[ParquetSource] Memory prefetch function: {}", func_name); - } - - memory_prefetch_func -} - -mod mem_prefetch_funcs { - pub use polars_utils::mem::{ - madvise_populate_read, madvise_sequential, madvise_willneed, prefetch_l2, - }; - - pub fn 
no_prefetch(_: &[u8]) {} -} diff --git a/crates/polars-stream/src/nodes/parquet_source/init.rs b/crates/polars-stream/src/nodes/parquet_source/init.rs new file mode 100644 index 000000000000..2aba9642fb04 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/init.rs @@ -0,0 +1,737 @@ +use std::future::Future; +use std::sync::Arc; + +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use polars_core::frame::DataFrame; +use polars_error::PolarsResult; +use polars_io::prelude::FileMetadata; +use polars_io::utils::byte_source::{DynByteSource, MemSliceByteSource}; +use polars_io::utils::slice::SplitSlicePosition; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; + +use super::metadata_utils::{ensure_metadata_has_projected_fields, read_parquet_metadata_bytes}; +use super::row_group_data_fetch::RowGroupDataFetcher; +use super::row_group_decode::RowGroupDecoder; +use super::{AsyncTaskData, ParquetSourceNode}; +use crate::async_executor; +use crate::async_primitives::connector::connector; +use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; +use crate::morsel::get_ideal_morsel_size; +use crate::nodes::{MorselSeq, TaskPriority}; +use crate::utils::task_handles_ext; + +impl ParquetSourceNode { + /// # Panics + /// Panics if called more than once. + async fn shutdown_impl( + async_task_data: Arc>, + verbose: bool, + ) -> PolarsResult<()> { + if verbose { + eprintln!("[ParquetSource]: Shutting down"); + } + + let (mut raw_morsel_receivers, morsel_stream_task_handle) = + async_task_data.try_lock().unwrap().take().unwrap(); + + raw_morsel_receivers.clear(); + // Join on the producer handle to catch errors/panics. + // Safety + // * We dropped the receivers on the line above + // * This function is only called once. + morsel_stream_task_handle.await + } + + pub(super) fn shutdown(&self) -> impl Future> { + if self.verbose { + eprintln!("[ParquetSource]: Shutdown via `shutdown()`"); + } + Self::shutdown_impl(self.async_task_data.clone(), self.verbose) + } + + /// Spawns a task to shut down the source node to avoid blocking the current thread. This is + /// usually called when data is no longer needed from the source node, as such it does not + /// propagate any (non-critical) errors. If on the other hand the source node does not provide + /// more data when requested, then it is more suitable to call [`Self::shutdown`], as it returns + /// a result that can be used to distinguish between whether the data stream stopped due to an + /// error or EOF. + pub(super) fn shutdown_in_background(&self) { + if self.verbose { + eprintln!("[ParquetSource]: Shutdown via `shutdown_in_background()`"); + } + let async_task_data = self.async_task_data.clone(); + polars_io::pl_async::get_runtime() + .spawn(Self::shutdown_impl(async_task_data, self.verbose)); + } + + /// Constructs the task that provides a morsel stream. 
+ #[allow(clippy::type_complexity)] + pub(super) fn init_raw_morsel_stream( + &mut self, + ) -> ( + Vec>, + async_executor::AbortOnDropHandle>, + ) { + let verbose = self.verbose; + + let use_statistics = self.options.use_statistics; + + let (mut raw_morsel_senders, raw_morsel_receivers): (Vec<_>, Vec<_>) = + (0..self.config.num_pipelines).map(|_| connector()).unzip(); + + if let Some((_, 0)) = self.file_options.slice { + return ( + raw_morsel_receivers, + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + std::future::ready(Ok(())), + )), + ); + } + + let reader_schema = self + .file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left() + .clone(); + + let (normalized_slice_oneshot_rx, metadata_rx, metadata_task_handle) = + self.init_metadata_fetcher(); + + let num_pipelines = self.config.num_pipelines; + let row_group_prefetch_size = self.config.row_group_prefetch_size; + let projection = self.file_options.with_columns.clone(); + assert_eq!(self.physical_predicate.is_some(), self.predicate.is_some()); + let predicate = self.physical_predicate.clone(); + let memory_prefetch_func = self.memory_prefetch_func; + + let mut row_group_data_fetcher = RowGroupDataFetcher { + metadata_rx, + use_statistics, + verbose, + reader_schema, + projection, + predicate, + slice_range: None, // Initialized later + memory_prefetch_func, + current_path_index: 0, + current_byte_source: Default::default(), + current_row_groups: Default::default(), + current_row_group_idx: 0, + current_max_row_group_height: 0, + current_row_offset: 0, + current_shared_file_state: Default::default(), + }; + + let row_group_decoder = self.init_row_group_decoder(); + let row_group_decoder = Arc::new(row_group_decoder); + + // Processes row group metadata and spawns I/O tasks to fetch row group data. This is + // currently spawned onto the CPU runtime as it does not directly make any async I/O calls, + // but instead it potentially performs predicate/slice evaluation on metadata. If we observe + // that under heavy CPU load scenarios the I/O throughput drops due to this task not being + // scheduled we can change it to be a high priority task. + let morsel_stream_task_handle = async_executor::spawn(TaskPriority::Low, async move { + let slice_range = { + let Ok(slice) = normalized_slice_oneshot_rx.await else { + // If we are here then the producer probably errored. + drop(row_group_data_fetcher); + return metadata_task_handle.await.unwrap(); + }; + + slice.map(|(offset, len)| offset..offset + len) + }; + + row_group_data_fetcher.slice_range = slice_range; + + // Pins a wait group to a channel index. + struct IndexedWaitGroup { + index: usize, + wait_group: WaitGroup, + } + + impl IndexedWaitGroup { + async fn wait(self) -> Self { + self.wait_group.wait().await; + self + } + } + + // Ensure proper backpressure by only polling the buffered iterator when a wait group + // is free. 
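            // A minimal sketch of the token protocol (the `WaitGroup` API is
            // internal to this crate; usage inferred from this function):
            //
            //     let wait_group = WaitGroup::default();
            //     let token = wait_group.token();   // travels with the morsel
            //     send((df, seq, token)).await;
            //     wait_group.wait().await;          // resolves once the consumer drops the token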
+            let mut wait_groups = (0..num_pipelines)
+                .map(|index| {
+                    let wait_group = WaitGroup::default();
+                    {
+                        let _prime_this_wait_group = wait_group.token();
+                    }
+                    IndexedWaitGroup {
+                        index,
+                        wait_group,
+                    }
+                    .wait()
+                })
+                .collect::<FuturesUnordered<_>>();
+
+            let mut df_stream = row_group_data_fetcher
+                .into_stream()
+                .map(|x| async {
+                    match x {
+                        Ok(handle) => handle.await,
+                        Err(e) => Err(e),
+                    }
+                })
+                .buffered(row_group_prefetch_size)
+                .map(|x| async {
+                    let row_group_decoder = row_group_decoder.clone();
+
+                    match x {
+                        Ok(row_group_data) => {
+                            async_executor::spawn(TaskPriority::Low, async move {
+                                row_group_decoder.row_group_data_to_df(row_group_data).await
+                            })
+                            .await
+                        },
+                        Err(e) => Err(e),
+                    }
+                })
+                .buffered(
+                    // Because we are using an ordered buffer, we may suffer from head-of-line
+                    // blocking, so we add a small amount of extra buffer capacity.
+                    num_pipelines + 4,
+                );
+
+            let morsel_seq_ref = &mut MorselSeq::default();
+            let mut dfs = vec![].into_iter();
+
+            'main: loop {
+                let Some(mut indexed_wait_group) = wait_groups.next().await else {
+                    break;
+                };
+
+                if dfs.len() == 0 {
+                    let Some(v) = df_stream.next().await else {
+                        break;
+                    };
+
+                    let v = v?;
+                    assert!(!v.is_empty());
+
+                    dfs = v.into_iter();
+                }
+
+                let mut df = dfs.next().unwrap();
+                let morsel_seq = *morsel_seq_ref;
+                *morsel_seq_ref = morsel_seq.successor();
+
+                loop {
+                    use crate::async_primitives::connector::SendError;
+
+                    let channel_index = indexed_wait_group.index;
+                    let wait_token = indexed_wait_group.wait_group.token();
+
+                    match raw_morsel_senders[channel_index].try_send((df, morsel_seq, wait_token)) {
+                        Ok(_) => {
+                            wait_groups.push(indexed_wait_group.wait());
+                            break;
+                        },
+                        Err(SendError::Closed(v)) => {
+                            // The channel assigned to this wait group has been closed, so we will
+                            // not add it back to the list of wait groups, and we will try to send
+                            // this across another channel.
+                            df = v.0
+                        },
+                        Err(SendError::Full(_)) => unreachable!(),
+                    }
+
+                    let Some(v) = wait_groups.next().await else {
+                        // All channels have closed.
+                        break 'main;
+                    };
+
+                    indexed_wait_group = v;
+                }
+            }
+
+            // Join on the producer handle to catch errors/panics.
+            drop(df_stream);
+            metadata_task_handle.await.unwrap()
+        });
+
+        let morsel_stream_task_handle =
+            async_executor::AbortOnDropHandle::new(morsel_stream_task_handle);
+
+        (raw_morsel_receivers, morsel_stream_task_handle)
+    }
+
+    /// Constructs the task that fetches file metadata.
+    /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized.
+    ///
+    /// TODO: During IR conversion the metadata of the first file is already downloaded - see if
+    /// we can find a way to re-use it.
+ #[allow(clippy::type_complexity)] + fn init_metadata_fetcher( + &mut self, + ) -> ( + tokio::sync::oneshot::Receiver>, + crate::async_primitives::connector::Receiver<( + usize, + usize, + Arc, + FileMetadata, + usize, + )>, + task_handles_ext::AbortOnDropHandle>, + ) { + let verbose = self.verbose; + let io_runtime = polars_io::pl_async::get_runtime(); + + assert!( + !self.projected_arrow_fields.is_empty() + || self.file_options.with_columns.as_deref() == Some(&[]) + ); + let projected_arrow_fields = self.projected_arrow_fields.clone(); + let needs_max_row_group_height_calc = + self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); + + let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = + tokio::sync::oneshot::channel(); + let (mut metadata_tx, metadata_rx) = connector(); + + let byte_source_builder = self.byte_source_builder.clone(); + + if self.verbose { + eprintln!( + "[ParquetSource]: Byte source builder: {:?}", + &byte_source_builder + ); + } + + let fetch_metadata_bytes_for_path_index = { + let scan_sources = &self.scan_sources; + let cloud_options = Arc::new(self.cloud_options.clone()); + + let scan_sources = scan_sources.clone(); + let cloud_options = cloud_options.clone(); + let byte_source_builder = byte_source_builder.clone(); + + move |path_idx: usize| { + let scan_sources = scan_sources.clone(); + let cloud_options = cloud_options.clone(); + let byte_source_builder = byte_source_builder.clone(); + + let handle = io_runtime.spawn(async move { + let mut byte_source = Arc::new( + scan_sources + .get(path_idx) + .unwrap() + .to_dyn_byte_source( + &byte_source_builder, + cloud_options.as_ref().as_ref(), + ) + .await?, + ); + let (metadata_bytes, maybe_full_bytes) = + read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?; + + if let Some(v) = maybe_full_bytes { + if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) { + if verbose { + eprintln!( + "[ParquetSource]: Parquet file was fully fetched during \ + metadata read ({} bytes).", + v.len(), + ); + } + + byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v))) + } + } + + PolarsResult::Ok((path_idx, byte_source, metadata_bytes)) + }); + + let handle = task_handles_ext::AbortOnDropHandle(handle); + + std::future::ready(handle) + } + }; + + let process_metadata_bytes = { + move |handle: task_handles_ext::AbortOnDropHandle< + PolarsResult<(usize, Arc, MemSlice)>, + >| { + let projected_arrow_fields = projected_arrow_fields.clone(); + // Run on CPU runtime - metadata deserialization is expensive, especially + // for very wide tables. 
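                // The `metadata_bytes.len() * 2 + 1024` passed to `deserialize_metadata`
                // below appears to act as an allocation guard for the thrift decoder (an
                // assumption based on how it is used here, not verified against its docs).
                // Worked example: a 64 KiB footer yields 64 * 1024 * 2 + 1024 = 132_096
                // bytes of headroom.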
+ let handle = async_executor::spawn(TaskPriority::Low, async move { + let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?; + + let metadata = polars_parquet::parquet::read::deserialize_metadata( + metadata_bytes.as_ref(), + metadata_bytes.len() * 2 + 1024, + )?; + + ensure_metadata_has_projected_fields( + projected_arrow_fields.as_ref(), + &metadata, + )?; + + let file_max_row_group_height = if needs_max_row_group_height_calc { + metadata + .row_groups + .iter() + .map(|x| x.num_rows()) + .max() + .unwrap_or(0) + } else { + 0 + }; + + PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) + }); + + async_executor::AbortOnDropHandle::new(handle) + } + }; + + let metadata_prefetch_size = self.config.metadata_prefetch_size; + let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size; + + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + self.morsel_stream_starter = Some(start_tx); + + let metadata_task_handle = if self + .file_options + .slice + .map(|(offset, _)| offset >= 0) + .unwrap_or(true) + { + normalized_slice_oneshot_tx + .send( + self.file_options + .slice + .map(|(offset, len)| (offset as usize, len)), + ) + .unwrap(); + + // Safety: `offset + len` does not overflow. + let slice_range = self + .file_options + .slice + .map(|(offset, len)| offset as usize..offset as usize + len); + + let mut metadata_stream = futures::stream::iter(0..self.scan_sources.len()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + let scan_sources = self.scan_sources.clone(); + + // We need to be able to both stop early as well as skip values, which is easier to do + // using a custom task instead of futures::stream + io_runtime.spawn(async move { + let current_row_offset_ref = &mut 0usize; + let current_path_index_ref = &mut 0usize; + + if start_rx.await.is_err() { + return Ok(()); + } + + if verbose { + eprintln!("[ParquetSource]: Starting data fetch") + } + + loop { + let current_path_index = *current_path_index_ref; + *current_path_index_ref += 1; + + let Some(v) = metadata_stream.next().await else { + break; + }; + + let (path_index, byte_source, metadata, file_max_row_group_height) = v + .map_err(|err| { + err.wrap_msg(|msg| { + format!( + "error at path (index: {}, path: {:?}): {}", + current_path_index, + scan_sources + .get(current_path_index) + .map(|x| PlSmallStr::from_str(x.to_include_path_name())), + msg + ) + }) + })?; + + assert_eq!(path_index, current_path_index); + + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + if let Some(slice_range) = slice_range.clone() { + match SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range, + ) { + SplitSlicePosition::Before => { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Skipped file at index {} ({} rows)", + current_path_index, metadata.num_rows + ); + } + continue; + }, + SplitSlicePosition::After => unreachable!(), + SplitSlicePosition::Overlapping(..) 
=> {}, + }; + }; + + if metadata_tx + .send(( + path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) + .await + .is_err() + { + break; + } + + if let Some(slice_range) = slice_range.as_ref() { + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + scan_sources.len() - current_path_index - 1, + ); + } + break; + } + }; + } + + Ok(()) + }) + } else { + // Walk the files in reverse to translate the slice into a positive offset. + let slice = self.file_options.slice.unwrap(); + let slice_start_as_n_from_end = -slice.0 as usize; + + let mut metadata_stream = futures::stream::iter((0..self.scan_sources.len()).rev()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + // Note: + // * We want to wait until the first morsel is requested before starting this + let init_negative_slice_and_metadata = async move { + let mut processed_metadata_rev = vec![]; + let mut cum_rows = 0; + + while let Some(v) = metadata_stream.next().await { + let v = v?; + let (_, _, metadata, _) = &v; + cum_rows += metadata.num_rows; + processed_metadata_rev.push(v); + + if cum_rows >= slice_start_as_n_from_end { + break; + } + } + + let (start, len) = if slice_start_as_n_from_end > cum_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. + let first_file_position = slice_start_as_n_from_end - cum_rows; + (0, slice.1.saturating_sub(first_file_position)) + } else { + (cum_rows - slice_start_as_n_from_end, slice.1) + }; + + if len == 0 { + processed_metadata_rev.clear(); + } + + normalized_slice_oneshot_tx + .send(Some((start, len))) + .unwrap(); + + let slice_range = start..(start + len); + + PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) + }; + + let path_count = self.scan_sources.len(); + + io_runtime.spawn(async move { + if start_rx.await.is_err() { + return Ok(()); + } + + if verbose { + eprintln!("[ParquetSource]: Starting data fetch (negative slice)") + } + + let (slice_range, processed_metadata_rev, cum_rows) = + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + init_negative_slice_and_metadata, + )) + .await?; + + if verbose { + if let Some((path_index, ..)) = processed_metadata_rev.last() { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + begins at file index {}, translated to {:?}", + slice, path_index, slice_range + ); + } else { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + skipped all files ({} files containing {} rows)", + slice, path_count, cum_rows + ) + } + } + + let metadata_iter = processed_metadata_rev.into_iter().rev(); + let current_row_offset_ref = &mut 0usize; + + for (current_path_index, byte_source, metadata, file_max_row_group_height) in + metadata_iter + { + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + assert!(matches!( + SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range.clone(), + ), + SplitSlicePosition::Overlapping(..) 
+ )); + + if metadata_tx + .send(( + current_path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) + .await + .is_err() + { + break; + } + + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + path_count - current_path_index - 1, + ); + } + break; + } + } + + Ok(()) + }) + }; + + let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); + + ( + normalized_slice_oneshot_rx, + metadata_rx, + metadata_task_handle, + ) + } + + /// Creates a `RowGroupDecoder` that turns `RowGroupData` into DataFrames. + /// This must be called AFTER the following have been initialized: + /// * `self.projected_arrow_fields` + /// * `self.physical_predicate` + pub(super) fn init_row_group_decoder(&self) -> RowGroupDecoder { + assert!( + !self.projected_arrow_fields.is_empty() + || self.file_options.with_columns.as_deref() == Some(&[]) + ); + assert_eq!(self.predicate.is_some(), self.physical_predicate.is_some()); + + let scan_sources = self.scan_sources.clone(); + let hive_partitions = self.hive_parts.clone(); + let hive_partitions_width = hive_partitions + .as_deref() + .map(|x| x[0].get_statistics().column_stats().len()) + .unwrap_or(0); + let include_file_paths = self.file_options.include_file_paths.clone(); + let projected_arrow_fields = self.projected_arrow_fields.clone(); + let row_index = self.file_options.row_index.clone(); + let physical_predicate = self.physical_predicate.clone(); + let ideal_morsel_size = get_ideal_morsel_size(); + + RowGroupDecoder { + scan_sources, + hive_partitions, + hive_partitions_width, + include_file_paths, + projected_arrow_fields, + row_index, + physical_predicate, + ideal_morsel_size, + } + } + + pub(super) fn init_projected_arrow_fields(&mut self) { + let reader_schema = self + .file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left() + .clone(); + + self.projected_arrow_fields = + if let Some(columns) = self.file_options.with_columns.as_deref() { + columns + .iter() + .map(|x| reader_schema.get(x).unwrap().clone()) + .collect() + } else { + reader_schema.iter_values().cloned().collect() + }; + + if self.verbose { + eprintln!( + "[ParquetSource]: {} columns to be projected from {} files", + self.projected_arrow_fields.len(), + self.scan_sources.len(), + ); + } + } +} diff --git a/crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs b/crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs new file mode 100644 index 000000000000..a8a356551ff6 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/mem_prefetch_funcs.rs @@ -0,0 +1,71 @@ +pub(super) use polars_utils::mem::{ + madvise_populate_read, madvise_sequential, madvise_willneed, prefetch_l2, +}; +pub(super) fn no_prefetch(_: &[u8]) {} + +pub(super) fn get_memory_prefetch_func(verbose: bool) -> fn(&[u8]) -> () { + let memory_prefetch_func = match std::env::var("POLARS_MEMORY_PREFETCH").ok().as_deref() { + None => { + // Sequential advice was observed to provide speedups on Linux. 
+ // ref https://github.com/pola-rs/polars/pull/18152#discussion_r1721701965 + #[cfg(target_os = "linux")] + { + madvise_sequential + } + #[cfg(not(target_os = "linux"))] + { + no_prefetch + } + }, + Some("no_prefetch") => no_prefetch, + Some("prefetch_l2") => prefetch_l2, + Some("madvise_sequential") => { + #[cfg(target_family = "unix")] + { + madvise_sequential + } + #[cfg(not(target_family = "unix"))] + { + panic!("POLARS_MEMORY_PREFETCH=madvise_sequential is not supported by this system"); + } + }, + Some("madvise_willneed") => { + #[cfg(target_family = "unix")] + { + madvise_willneed + } + #[cfg(not(target_family = "unix"))] + { + panic!("POLARS_MEMORY_PREFETCH=madvise_willneed is not supported by this system"); + } + }, + Some("madvise_populate_read") => { + #[cfg(target_os = "linux")] + { + madvise_populate_read + } + #[cfg(not(target_os = "linux"))] + { + panic!( + "POLARS_MEMORY_PREFETCH=madvise_populate_read is not supported by this system" + ); + } + }, + Some(v) => panic!("invalid value for POLARS_MEMORY_PREFETCH: {}", v), + }; + + if verbose { + let func_name = match memory_prefetch_func as usize { + v if v == no_prefetch as usize => "no_prefetch", + v if v == prefetch_l2 as usize => "prefetch_l2", + v if v == madvise_sequential as usize => "madvise_sequential", + v if v == madvise_willneed as usize => "madvise_willneed", + v if v == madvise_populate_read as usize => "madvise_populate_read", + _ => unreachable!(), + }; + + eprintln!("[ParquetSource] Memory prefetch function: {}", func_name); + } + + memory_prefetch_func +} diff --git a/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs b/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs new file mode 100644 index 000000000000..7c848b07b750 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source/metadata_utils.rs @@ -0,0 +1,156 @@ +use polars_core::prelude::{DataType, PlHashMap}; +use polars_error::{polars_bail, PolarsResult}; +use polars_io::prelude::FileMetadata; +use polars_io::utils::byte_source::{ByteSource, DynByteSource}; +use polars_utils::mmap::MemSlice; +use polars_utils::pl_str::PlSmallStr; + +/// Read the metadata bytes of a parquet file, does not decode the bytes. If during metadata fetch +/// the bytes of the entire file are loaded, it is returned in the second return value. +pub(super) async fn read_parquet_metadata_bytes( + byte_source: &DynByteSource, + verbose: bool, +) -> PolarsResult<(MemSlice, Option)> { + use polars_parquet::parquet::error::ParquetError; + use polars_parquet::parquet::PARQUET_MAGIC; + + const FOOTER_HEADER_SIZE: usize = polars_parquet::parquet::FOOTER_SIZE as usize; + + let file_size = byte_source.get_size().await?; + + if file_size < FOOTER_HEADER_SIZE { + return Err(ParquetError::OutOfSpec(format!( + "file size ({}) is less than minimum size required to store parquet footer ({})", + file_size, FOOTER_HEADER_SIZE + )) + .into()); + } + + let estimated_metadata_size = if let DynByteSource::MemSlice(_) = byte_source { + // Mmapped or in-memory, reads are free. 
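        // For non-memory sources the estimate below is file_size / 2048, clamped to
        // [16_384, 131_072] bytes. Worked examples (illustrative only): a 10 MiB file
        // gives 10_485_760 / 2048 = 5_120, clamped up to 16_384; a 1 GiB file gives
        // 1_073_741_824 / 2048 = 524_288, clamped down to 131_072.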
+ file_size + } else { + (file_size / 2048).clamp(16_384, 131_072).min(file_size) + }; + + let bytes = byte_source + .get_range((file_size - estimated_metadata_size)..file_size) + .await?; + + let footer_header_bytes = bytes.slice((bytes.len() - FOOTER_HEADER_SIZE)..bytes.len()); + + let (v, remaining) = footer_header_bytes.split_at(4); + let footer_size = i32::from_le_bytes(v.try_into().unwrap()); + + if remaining != PARQUET_MAGIC { + return Err(ParquetError::OutOfSpec(format!( + r#"expected parquet magic bytes "{}" in footer, got "{}" instead"#, + std::str::from_utf8(&PARQUET_MAGIC).unwrap(), + String::from_utf8_lossy(remaining) + )) + .into()); + } + + if footer_size < 0 { + return Err(ParquetError::OutOfSpec(format!( + "expected positive footer size, got {} instead", + footer_size + )) + .into()); + } + + let footer_size = footer_size as usize + FOOTER_HEADER_SIZE; + + if file_size < footer_size { + return Err(ParquetError::OutOfSpec(format!( + "file size ({}) is less than the indicated footer size ({})", + file_size, footer_size + )) + .into()); + } + + if bytes.len() < footer_size { + debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); + if verbose { + eprintln!( + "[ParquetSource]: Extra {} bytes need to be fetched for metadata \ + (initial estimate = {}, actual size = {})", + footer_size - estimated_metadata_size, + bytes.len(), + footer_size, + ); + } + + let mut out = Vec::with_capacity(footer_size); + let offset = file_size - footer_size; + let len = footer_size - bytes.len(); + let delta_bytes = byte_source.get_range(offset..(offset + len)).await?; + + debug_assert!(out.capacity() >= delta_bytes.len() + bytes.len()); + + out.extend_from_slice(&delta_bytes); + out.extend_from_slice(&bytes); + + Ok((MemSlice::from_vec(out), None)) + } else { + if verbose && !matches!(byte_source, DynByteSource::MemSlice(_)) { + eprintln!( + "[ParquetSource]: Fetched all bytes for metadata on first try \ + (initial estimate = {}, actual size = {}, excess = {})", + bytes.len(), + footer_size, + estimated_metadata_size - footer_size, + ); + } + + let metadata_bytes = bytes.slice((bytes.len() - footer_size)..bytes.len()); + + if bytes.len() == file_size { + Ok((metadata_bytes, Some(bytes))) + } else { + debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); + let metadata_bytes = if bytes.len() - footer_size >= bytes.len() { + // Re-allocate to drop the excess bytes + MemSlice::from_vec(metadata_bytes.to_vec()) + } else { + metadata_bytes + }; + + Ok((metadata_bytes, None)) + } + } +} + +/// Ensures that a parquet file has all the necessary columns for a projection with the correct +/// dtype. There are no ordering requirements and extra columns are permitted. +pub(super) fn ensure_metadata_has_projected_fields( + projected_fields: &[polars_core::prelude::ArrowField], + metadata: &FileMetadata, +) -> PolarsResult<()> { + let schema = polars_parquet::arrow::read::infer_schema(metadata)?; + + // Note: We convert to Polars-native dtypes for timezone normalization. 
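    // A hedged example of the kind of mismatch this avoids (the exact normalization
    // rules live in `DataType::from_arrow`): an Arrow timestamp with timezone
    // "+00:00" in the file and "UTC" in the projected schema may compare unequal as
    // Arrow dtypes while mapping to the same Polars Datetime dtype.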
+    let mut schema = schema
+        .into_iter_values()
+        .map(|x| {
+            let dtype = DataType::from_arrow(&x.dtype, true);
+            (x.name, dtype)
+        })
+        .collect::<PlHashMap<PlSmallStr, DataType>>();
+
+    for field in projected_fields {
+        let Some(dtype) = schema.remove(&field.name) else {
+            polars_bail!(SchemaMismatch: "did not find column: {}", field.name)
+        };
+
+        let expected_dtype = DataType::from_arrow(&field.dtype, true);
+
+        if dtype != expected_dtype {
+            polars_bail!(SchemaMismatch: "data type mismatch for column {}: found: {}, expected: {}",
+                &field.name, dtype, expected_dtype
+            )
+        }
+    }
+
+    Ok(())
+}
diff --git a/crates/polars-stream/src/nodes/parquet_source/mod.rs b/crates/polars-stream/src/nodes/parquet_source/mod.rs
new file mode 100644
index 000000000000..a9344aa35c21
--- /dev/null
+++ b/crates/polars-stream/src/nodes/parquet_source/mod.rs
@@ -0,0 +1,262 @@
+use std::sync::atomic::AtomicBool;
+use std::sync::Arc;
+
+use mem_prefetch_funcs::get_memory_prefetch_func;
+use polars_core::config;
+use polars_core::frame::DataFrame;
+use polars_error::PolarsResult;
+use polars_expr::prelude::{phys_expr_to_io_expr, PhysicalExpr};
+use polars_io::cloud::CloudOptions;
+use polars_io::predicates::PhysicalIoExpr;
+use polars_io::prelude::ParquetOptions;
+use polars_io::utils::byte_source::DynByteSourceBuilder;
+use polars_plan::plans::hive::HivePartitions;
+use polars_plan::plans::{FileInfo, ScanSources};
+use polars_plan::prelude::FileScanOptions;
+use row_group_decode::RowGroupDecoder;
+
+use super::compute_node_prelude::*;
+use super::{MorselSeq, TaskPriority};
+use crate::async_executor::{self};
+use crate::async_primitives::wait_group::WaitToken;
+use crate::morsel::SourceToken;
+
+mod init;
+mod mem_prefetch_funcs;
+mod metadata_utils;
+mod row_group_data_fetch;
+mod row_group_decode;
+
+type AsyncTaskData = Option<(
+    Vec<crate::async_primitives::connector::Receiver<(DataFrame, MorselSeq, WaitToken)>>,
+    async_executor::AbortOnDropHandle<PolarsResult<()>>,
+)>;
+
+#[allow(clippy::type_complexity)]
+pub struct ParquetSourceNode {
+    scan_sources: ScanSources,
+    file_info: FileInfo,
+    hive_parts: Option<Arc<Vec<HivePartitions>>>,
+    predicate: Option<Arc<dyn PhysicalExpr>>,
+    options: ParquetOptions,
+    cloud_options: Option<CloudOptions>,
+    file_options: FileScanOptions,
+    // Run-time vars
+    config: Config,
+    verbose: bool,
+    physical_predicate: Option<Arc<dyn PhysicalIoExpr>>,
+    projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>,
+    byte_source_builder: DynByteSourceBuilder,
+    memory_prefetch_func: fn(&[u8]) -> (),
+    // This permit blocks execution until the first morsel is requested.
+    morsel_stream_starter: Option<tokio::sync::oneshot::Sender<()>>,
+    // This is behind a Mutex so that we can call `shutdown()` asynchronously.
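    // (Concretely an `Arc<tokio::sync::Mutex<AsyncTaskData>>`. Every access in this
    // module goes through `try_lock`; the async Mutex is used so the state can be
    // taken and awaited inside `shutdown_impl` without blocking the runtime.)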
+    async_task_data: Arc<tokio::sync::Mutex<AsyncTaskData>>,
+    row_group_decoder: Option<Arc<RowGroupDecoder>>,
+    is_finished: Arc<AtomicBool>,
+}
+
+#[derive(Debug)]
+struct Config {
+    num_pipelines: usize,
+    /// Number of files to pre-fetch metadata for concurrently
+    metadata_prefetch_size: usize,
+    /// Number of files to decode metadata for in parallel in advance
+    metadata_decode_ahead_size: usize,
+    /// Number of row groups to pre-fetch concurrently, this can be across files
+    row_group_prefetch_size: usize,
+}
+
+#[allow(clippy::too_many_arguments)]
+impl ParquetSourceNode {
+    pub fn new(
+        scan_sources: ScanSources,
+        file_info: FileInfo,
+        hive_parts: Option<Arc<Vec<HivePartitions>>>,
+        predicate: Option<Arc<dyn PhysicalExpr>>,
+        options: ParquetOptions,
+        cloud_options: Option<CloudOptions>,
+        file_options: FileScanOptions,
+    ) -> Self {
+        let verbose = config::verbose();
+
+        let byte_source_builder = if scan_sources.is_cloud_url() || config::force_async() {
+            DynByteSourceBuilder::ObjectStore
+        } else {
+            DynByteSourceBuilder::Mmap
+        };
+        let memory_prefetch_func = get_memory_prefetch_func(verbose);
+
+        Self {
+            scan_sources,
+            file_info,
+            hive_parts,
+            predicate,
+            options,
+            cloud_options,
+            file_options,
+
+            config: Config {
+                // Initialized later
+                num_pipelines: 0,
+                metadata_prefetch_size: 0,
+                metadata_decode_ahead_size: 0,
+                row_group_prefetch_size: 0,
+            },
+            verbose,
+            physical_predicate: None,
+            projected_arrow_fields: Arc::new([]),
+            byte_source_builder,
+            memory_prefetch_func,
+
+            morsel_stream_starter: None,
+            async_task_data: Arc::new(tokio::sync::Mutex::new(None)),
+            row_group_decoder: None,
+            is_finished: Arc::new(AtomicBool::new(false)),
+        }
+    }
+}
+
+impl ComputeNode for ParquetSourceNode {
+    fn name(&self) -> &str {
+        "parquet_source"
+    }
+
+    fn initialize(&mut self, num_pipelines: usize) {
+        self.config = {
+            let metadata_prefetch_size = polars_core::config::get_file_prefetch_size();
+            // Limit metadata decode to the number of threads.
+            let metadata_decode_ahead_size =
+                (metadata_prefetch_size / 2).min(1 + num_pipelines).max(1);
+            let row_group_prefetch_size = polars_core::config::get_rg_prefetch_size();
+
+            Config {
+                num_pipelines,
+                metadata_prefetch_size,
+                metadata_decode_ahead_size,
+                row_group_prefetch_size,
+            }
+        };
+
+        if self.verbose {
+            eprintln!("[ParquetSource]: {:?}", &self.config);
+        }
+
+        self.init_projected_arrow_fields();
+        self.physical_predicate = self.predicate.clone().map(phys_expr_to_io_expr);
+
+        let (raw_morsel_receivers, morsel_stream_task_handle) = self.init_raw_morsel_stream();
+
+        self.async_task_data
+            .try_lock()
+            .unwrap()
+            .replace((raw_morsel_receivers, morsel_stream_task_handle));
+
+        let row_group_decoder = self.init_row_group_decoder();
+        self.row_group_decoder = Some(Arc::new(row_group_decoder));
+    }
+
+    fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> {
+        use std::sync::atomic::Ordering;
+
+        assert!(recv.is_empty());
+        assert_eq!(send.len(), 1);
+
+        if self.is_finished.load(Ordering::Relaxed) {
+            send[0] = PortState::Done;
+            assert!(
+                self.async_task_data.try_lock().unwrap().is_none(),
+                "should have already been shut down"
+            );
+        } else if send[0] == PortState::Done {
+            {
+                // Early shutdown - our port state was set to `Done` by the downstream nodes.
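+                // (This can happen when downstream nodes no longer need our
+                // output, e.g. because a limit has already been satisfied.)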
+                self.shutdown_in_background();
+            };
+            self.is_finished.store(true, Ordering::Relaxed);
+        } else {
+            send[0] = PortState::Ready
+        }
+
+        Ok(())
+    }
+
+    fn spawn<'env, 's>(
+        &'env mut self,
+        scope: &'s TaskScope<'s, 'env>,
+        recv: &mut [Option<RecvPort<'_>>],
+        send: &mut [Option<SendPort<'_>>],
+        _state: &'s ExecutionState,
+        join_handles: &mut Vec<JoinHandle<PolarsResult<()>>>,
+    ) {
+        use std::sync::atomic::Ordering;
+
+        assert!(recv.is_empty());
+        assert_eq!(send.len(), 1);
+        assert!(!self.is_finished.load(Ordering::Relaxed));
+
+        let morsel_senders = send[0].take().unwrap().parallel();
+
+        let mut async_task_data_guard = self.async_task_data.try_lock().unwrap();
+        let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap();
+
+        assert_eq!(raw_morsel_receivers.len(), morsel_senders.len());
+
+        if let Some(v) = self.morsel_stream_starter.take() {
+            v.send(()).unwrap();
+        }
+        let is_finished = self.is_finished.clone();
+
+        let task_handles = raw_morsel_receivers
+            .drain(..)
+            .zip(morsel_senders)
+            .map(|(mut raw_morsel_rx, mut morsel_tx)| {
+                let is_finished = is_finished.clone();
+
+                scope.spawn_task(TaskPriority::Low, async move {
+                    let source_token = SourceToken::new();
+                    loop {
+                        let Ok((df, morsel_seq, wait_token)) = raw_morsel_rx.recv().await else {
+                            is_finished.store(true, Ordering::Relaxed);
+                            break;
+                        };
+
+                        let mut morsel = Morsel::new(df, morsel_seq, source_token.clone());
+                        morsel.set_consume_token(wait_token);
+
+                        if morsel_tx.send(morsel).await.is_err() {
+                            break;
+                        }
+
+                        if source_token.stop_requested() {
+                            break;
+                        }
+                    }
+
+                    raw_morsel_rx
+                })
+            })
+            .collect::<Vec<_>>();
+
+        drop(async_task_data_guard);
+
+        let async_task_data = self.async_task_data.clone();
+
+        join_handles.push(scope.spawn_task(TaskPriority::Low, async move {
+            {
+                let mut async_task_data_guard = async_task_data.try_lock().unwrap();
+                let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap();
+
+                for handle in task_handles {
+                    raw_morsel_receivers.push(handle.await);
+                }
+            }
+
+            if self.is_finished.load(Ordering::Relaxed) {
+                self.shutdown().await?;
+            }
+
+            Ok(())
+        }))
+    }
+}
diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs
new file mode 100644
index 000000000000..761131ecf1d5
--- /dev/null
+++ b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs
@@ -0,0 +1,390 @@
+use std::future::Future;
+use std::sync::Arc;
+
+use polars_core::prelude::{ArrowSchema, InitHashMaps, PlHashMap};
+use polars_core::utils::operation_exceeded_idxsize_msg;
+use polars_error::{polars_err, PolarsResult};
+use polars_io::predicates::PhysicalIoExpr;
+use polars_io::prelude::FileMetadata;
+use polars_io::prelude::_internal::read_this_row_group;
+use polars_io::utils::byte_source::{ByteSource, DynByteSource};
+use polars_io::utils::slice::SplitSlicePosition;
+use polars_parquet::read::RowGroupMetadata;
+use polars_utils::mmap::MemSlice;
+use polars_utils::pl_str::PlSmallStr;
+use polars_utils::slice::GetSaferUnchecked;
+use polars_utils::IdxSize;
+
+use super::mem_prefetch_funcs;
+use super::row_group_decode::SharedFileState;
+use crate::async_executor;
+use crate::nodes::TaskPriority;
+use crate::utils::task_handles_ext;
+
+/// Represents byte-data that can be transformed into a DataFrame after some computation.
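+///
+/// A sketch of the flow through this module (names as defined below):
+///
+/// ```text
+/// metadata_rx -> RowGroupDataFetcher -> RowGroupData -> RowGroupDecoder -> DataFrame
+/// ```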
+pub(super) struct RowGroupData {
+    pub(super) byte_source: FetchedBytes,
+    pub(super) path_index: usize,
+    pub(super) row_offset: usize,
+    pub(super) slice: Option<(usize, usize)>,
+    pub(super) file_max_row_group_height: usize,
+    pub(super) row_group_metadata: RowGroupMetadata,
+    pub(super) shared_file_state: Arc<tokio::sync::OnceCell<SharedFileState>>,
+}
+
+pub(super) struct RowGroupDataFetcher {
+    pub(super) metadata_rx: crate::async_primitives::connector::Receiver<(
+        usize,
+        usize,
+        Arc<DynByteSource>,
+        FileMetadata,
+        usize,
+    )>,
+    pub(super) use_statistics: bool,
+    pub(super) verbose: bool,
+    pub(super) reader_schema: Arc<ArrowSchema>,
+    pub(super) projection: Option<Arc<[PlSmallStr]>>,
+    pub(super) predicate: Option<Arc<dyn PhysicalIoExpr>>,
+    pub(super) slice_range: Option<std::ops::Range<usize>>,
+    pub(super) memory_prefetch_func: fn(&[u8]) -> (),
+    pub(super) current_path_index: usize,
+    pub(super) current_byte_source: Arc<DynByteSource>,
+    pub(super) current_row_groups: std::vec::IntoIter<RowGroupMetadata>,
+    pub(super) current_row_group_idx: usize,
+    pub(super) current_max_row_group_height: usize,
+    pub(super) current_row_offset: usize,
+    pub(super) current_shared_file_state: Arc<tokio::sync::OnceCell<SharedFileState>>,
+}
+
+impl RowGroupDataFetcher {
+    pub(super) fn into_stream(self) -> RowGroupDataStream {
+        RowGroupDataStream::new(self)
+    }
+
+    pub(super) async fn init_next_file_state(&mut self) -> bool {
+        let Ok((path_index, row_offset, byte_source, metadata, file_max_row_group_height)) =
+            self.metadata_rx.recv().await
+        else {
+            return false;
+        };
+
+        self.current_path_index = path_index;
+        self.current_byte_source = byte_source;
+        self.current_max_row_group_height = file_max_row_group_height;
+        // The metadata task also sends a row offset to start counting from as it may skip files
+        // during slice pushdown.
+        self.current_row_offset = row_offset;
+        self.current_row_group_idx = 0;
+        self.current_row_groups = metadata.row_groups.into_iter();
+        self.current_shared_file_state = Default::default();
+
+        true
+    }
+
+    pub(super) async fn next(
+        &mut self,
+    ) -> Option<PolarsResult<async_executor::AbortOnDropHandle<PolarsResult<RowGroupData>>>> {
+        'main: loop {
+            for row_group_metadata in self.current_row_groups.by_ref() {
+                let current_row_offset = self.current_row_offset;
+                let current_row_group_idx = self.current_row_group_idx;
+
+                let num_rows = row_group_metadata.num_rows();
+
+                self.current_row_offset = current_row_offset.saturating_add(num_rows);
+                self.current_row_group_idx += 1;
+
+                if self.use_statistics
+                    && !match read_this_row_group(
+                        self.predicate.as_deref(),
+                        &row_group_metadata,
+                        self.reader_schema.as_ref(),
+                    ) {
+                        Ok(v) => v,
+                        Err(e) => return Some(Err(e)),
+                    }
+                {
+                    if self.verbose {
+                        eprintln!(
+                            "[ParquetSource]: Predicate pushdown: \
+                            Skipped row group {} in file {} ({} rows)",
+                            current_row_group_idx, self.current_path_index, num_rows
+                        );
+                    }
+                    continue;
+                }
+
+                if num_rows > IdxSize::MAX as usize {
+                    let msg = operation_exceeded_idxsize_msg(
+                        format!("number of rows in row group ({})", num_rows).as_str(),
+                    );
+                    return Some(Err(polars_err!(ComputeError: msg)));
+                }
+
+                let slice = if let Some(slice_range) = self.slice_range.clone() {
+                    let (offset, len) = match SplitSlicePosition::split_slice_at_file(
+                        current_row_offset,
+                        num_rows,
+                        slice_range,
+                    ) {
+                        SplitSlicePosition::Before => {
+                            if self.verbose {
+                                eprintln!(
+                                    "[ParquetSource]: Slice pushdown: \
+                                    Skipped row group {} in file {} ({} rows)",
+                                    current_row_group_idx, self.current_path_index, num_rows
+                                );
+                            }
+                            continue;
+                        },
+                        SplitSlicePosition::After => {
+                            if self.verbose {
+                                eprintln!(
+                                    "[ParquetSource]: Slice pushdown: \
+                                    Stop at row group {} in file {} \
+                                    (remaining {} row groups will not be read)",
+                                    current_row_group_idx,
+                                    self.current_path_index,
+                                    self.current_row_groups.len(),
+                                );
+                            };
+                            break 'main;
+                        },
+                        SplitSlicePosition::Overlapping(offset, len) => (offset, len),
+                    };
+
+                    Some((offset, len))
+                } else {
+                    None
+                };
+
+                let current_byte_source = self.current_byte_source.clone();
+                let projection = self.projection.clone();
+                let current_shared_file_state = self.current_shared_file_state.clone();
+                let memory_prefetch_func = self.memory_prefetch_func;
+                let io_runtime = polars_io::pl_async::get_runtime();
+                let current_path_index = self.current_path_index;
+                let current_max_row_group_height = self.current_max_row_group_height;
+
+                // Push calculation of byte ranges to a task to run in parallel, as it can be
+                // expensive for very wide tables and projections.
+                let handle = async_executor::spawn(TaskPriority::Low, async move {
+                    let byte_source = if let DynByteSource::MemSlice(mem_slice) =
+                        current_byte_source.as_ref()
+                    {
+                        // Skip byte range calculation for `no_prefetch`.
+                        if memory_prefetch_func as usize != mem_prefetch_funcs::no_prefetch as usize
+                        {
+                            let slice = mem_slice.0.as_ref();
+
+                            if let Some(columns) = projection.as_ref() {
+                                for range in get_row_group_byte_ranges_for_projection(
+                                    &row_group_metadata,
+                                    columns.as_ref(),
+                                ) {
+                                    memory_prefetch_func(unsafe {
+                                        slice.get_unchecked_release(range)
+                                    })
+                                }
+                            } else {
+                                let mut iter = get_row_group_byte_ranges(&row_group_metadata);
+                                let first = iter.next().unwrap();
+                                let range =
+                                    iter.fold(first, |l, r| l.start.min(r.start)..l.end.max(r.end));
+
+                                memory_prefetch_func(unsafe { slice.get_unchecked_release(range) })
+                            };
+                        }
+
+                        // We have a mmapped or in-memory slice representing the entire
+                        // file that can be sliced directly, so we can skip the byte-range
+                        // calculations and HashMap allocation.
+                        let mem_slice = mem_slice.0.clone();
+                        FetchedBytes::MemSlice {
+                            offset: 0,
+                            mem_slice,
+                        }
+                    } else if let Some(columns) = projection.as_ref() {
+                        let ranges = get_row_group_byte_ranges_for_projection(
+                            &row_group_metadata,
+                            columns.as_ref(),
+                        )
+                        .collect::<Vec<_>>();
+
+                        let bytes = {
+                            let ranges_2 = ranges.clone();
+                            task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move {
+                                current_byte_source.get_ranges(ranges_2.as_ref()).await
+                            }))
+                            .await
+                            .unwrap()?
+                        };
+
+                        assert_eq!(bytes.len(), ranges.len());
+
+                        let mut bytes_map = PlHashMap::with_capacity(ranges.len());
+
+                        for (range, bytes) in ranges.iter().zip(bytes) {
+                            memory_prefetch_func(bytes.as_ref());
+                            let v = bytes_map.insert(range.start, bytes);
+                            debug_assert!(v.is_none(), "duplicate range start {}", range.start);
+                        }
+
+                        FetchedBytes::BytesMap(bytes_map)
+                    } else {
+                        // We have a dedicated code-path for a full projection that performs a
+                        // single range request for the entire row group. During testing this
+                        // provided much higher throughput from cloud than making multiple range
+                        // request with `get_ranges()`.
+                        let mut iter = get_row_group_byte_ranges(&row_group_metadata);
+                        let mut ranges = Vec::with_capacity(iter.len());
+                        let first = iter.next().unwrap();
+                        ranges.push(first.clone());
+                        let full_range = iter.fold(first, |l, r| {
+                            ranges.push(r.clone());
+                            l.start.min(r.start)..l.end.max(r.end)
+                        });
+
+                        let mem_slice = {
+                            let full_range_2 = full_range.clone();
+                            task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move {
+                                current_byte_source.get_range(full_range_2).await
+                            }))
+                            .await
+                            .unwrap()?
+                        };
+
+                        FetchedBytes::MemSlice {
+                            offset: full_range.start,
+                            mem_slice,
+                        }
+                    };
+
+                    PolarsResult::Ok(RowGroupData {
+                        byte_source,
+                        path_index: current_path_index,
+                        row_offset: current_row_offset,
+                        slice,
+                        file_max_row_group_height: current_max_row_group_height,
+                        row_group_metadata,
+                        shared_file_state: current_shared_file_state.clone(),
+                    })
+                });
+
+                let handle = async_executor::AbortOnDropHandle::new(handle);
+                return Some(Ok(handle));
+            }
+
+            // Initialize state to the next file.
+            if !self.init_next_file_state().await {
+                break;
+            }
+        }
+
+        None
+    }
+}
+
+pub(super) enum FetchedBytes {
+    MemSlice { mem_slice: MemSlice, offset: usize },
+    BytesMap(PlHashMap<usize, MemSlice>),
+}
+
+impl FetchedBytes {
+    pub(super) fn get_range(&self, range: std::ops::Range<usize>) -> MemSlice {
+        match self {
+            Self::MemSlice { mem_slice, offset } => {
+                let offset = *offset;
+                debug_assert!(range.start >= offset);
+                mem_slice.slice(range.start - offset..range.end - offset)
+            },
+            Self::BytesMap(v) => {
+                let v = v.get(&range.start).unwrap();
+                debug_assert_eq!(v.len(), range.len());
+                v.clone()
+            },
+        }
+    }
+}
+
+#[rustfmt::skip]
+type RowGroupDataStreamFut = std::pin::Pin<Box<
+    dyn Future<
+        Output =
+            (
+                Box<RowGroupDataFetcher>,
+                Option<
+                    PolarsResult<
+                        async_executor::AbortOnDropHandle<
+                            PolarsResult<
+                                RowGroupData>>>>
+            )
+    > + Send
+>>;
+
+pub(super) struct RowGroupDataStream {
+    current_future: RowGroupDataStreamFut,
+}
+
+impl RowGroupDataStream {
+    fn new(row_group_data_fetcher: RowGroupDataFetcher) -> Self {
+        // [`RowGroupDataFetcher`] is a big struct, so we Box it once here to avoid boxing it on
+        // every `next()` call.
+        let current_future = Self::call_next_owned(Box::new(row_group_data_fetcher));
+        Self { current_future }
+    }
+
+    fn call_next_owned(
+        mut row_group_data_fetcher: Box<RowGroupDataFetcher>,
+    ) -> RowGroupDataStreamFut {
+        Box::pin(async move {
+            let out = row_group_data_fetcher.next().await;
+            (row_group_data_fetcher, out)
+        })
+    }
+}
+
+impl futures::stream::Stream for RowGroupDataStream {
+    type Item = PolarsResult<async_executor::AbortOnDropHandle<PolarsResult<RowGroupData>>>;
+
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        use std::pin::Pin;
+        use std::task::Poll;
+
+        match Pin::new(&mut self.current_future.as_mut()).poll(cx) {
+            Poll::Ready((row_group_data_fetcher, out)) => {
+                if out.is_some() {
+                    self.current_future = Self::call_next_owned(row_group_data_fetcher);
+                }
+
+                Poll::Ready(out)
+            },
+            Poll::Pending => Poll::Pending,
+        }
+    }
+}
+
+fn get_row_group_byte_ranges(
+    row_group_metadata: &RowGroupMetadata,
+) -> impl ExactSizeIterator<Item = std::ops::Range<usize>> + '_ {
+    row_group_metadata
+        .byte_ranges_iter()
+        .map(|byte_range| byte_range.start as usize..byte_range.end as usize)
+}
+
+fn get_row_group_byte_ranges_for_projection<'a>(
+    row_group_metadata: &'a RowGroupMetadata,
+    columns: &'a [PlSmallStr],
+) -> impl Iterator<Item = std::ops::Range<usize>> + 'a {
+    columns.iter().flat_map(|col_name| {
+        row_group_metadata
+            .columns_under_root_iter(col_name)
+            .map(|col| {
+                let byte_range = col.byte_range();
+                byte_range.start as usize..byte_range.end as usize
+            })
+    })
+}
diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs
new file mode 100644
index 000000000000..b3249e60057c
--- /dev/null
+++ b/crates/polars-stream/src/nodes/parquet_source/row_group_decode.rs
@@ -0,0 +1,287 @@
+use std::sync::Arc;
+
+use polars_core::frame::DataFrame;
+use polars_core::prelude::{ChunkFull, IdxCa, StringChunked};
+use polars_core::series::{IntoSeries, IsSorted, Series};
+use polars_error::{polars_bail, PolarsResult};
+use polars_io::predicates::PhysicalIoExpr;
+use polars_io::RowIndex;
+use polars_plan::plans::hive::HivePartitions;
+use polars_plan::plans::ScanSources;
+use polars_utils::pl_str::PlSmallStr;
+use polars_utils::IdxSize;
+
+use super::row_group_data_fetch::RowGroupData;
+use crate::async_executor;
+use crate::nodes::TaskPriority;
+
+/// Turns row group data into DataFrames.
+pub(super) struct RowGroupDecoder {
+    pub(super) scan_sources: ScanSources,
+    pub(super) hive_partitions: Option<Arc<Vec<HivePartitions>>>,
+    pub(super) hive_partitions_width: usize,
+    pub(super) include_file_paths: Option<PlSmallStr>,
+    pub(super) projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>,
+    pub(super) row_index: Option<RowIndex>,
+    pub(super) physical_predicate: Option<Arc<dyn PhysicalIoExpr>>,
+    pub(super) ideal_morsel_size: usize,
+}
+
+impl RowGroupDecoder {
+    pub(super) async fn row_group_data_to_df(
+        &self,
+        row_group_data: RowGroupData,
+    ) -> PolarsResult<Vec<DataFrame>> {
+        let row_group_data = Arc::new(row_group_data);
+
+        let out_width = self.row_index.is_some() as usize
+            + self.projected_arrow_fields.len()
+            + self.hive_partitions_width
+            + self.include_file_paths.is_some() as usize;
+
+        let mut out_columns = Vec::with_capacity(out_width);
+
+        if self.row_index.is_some() {
+            // Add a placeholder so that we don't have to shift the entire vec
+            // later.
+            out_columns.push(Series::default());
+        }
+
+        let slice_range = row_group_data
+            .slice
+            .map(|(offset, len)| offset..offset + len)
+            .unwrap_or(0..row_group_data.row_group_metadata.num_rows());
+
+        let projected_arrow_fields = &self.projected_arrow_fields;
+        let projected_arrow_fields = projected_arrow_fields.clone();
+
+        let row_group_data_2 = row_group_data.clone();
+        let slice_range_2 = slice_range.clone();
+
+        // Minimum number of values to amortize the overhead of spawning tasks.
+        // This value is arbitrarily chosen.
+        const VALUES_PER_THREAD: usize = 16_777_216;
+        let n_rows = row_group_data.row_group_metadata.num_rows();
+        let cols_per_task = 1 + VALUES_PER_THREAD / n_rows;
+
+        let decode_fut_iter = (0..self.projected_arrow_fields.len())
+            .step_by(cols_per_task)
+            .map(move |offset| {
+                let row_group_data = row_group_data_2.clone();
+                let slice_range = slice_range_2.clone();
+                let projected_arrow_fields = projected_arrow_fields.clone();
+
+                async move {
+                    (offset
+                        ..offset
+                            .saturating_add(cols_per_task)
+                            .min(projected_arrow_fields.len()))
+                        .map(|i| {
+                            let arrow_field = projected_arrow_fields[i].clone();
+
+                            let columns_to_deserialize = row_group_data
+                                .row_group_metadata
+                                .columns_under_root_iter(&arrow_field.name)
+                                .map(|col_md| {
+                                    let byte_range = col_md.byte_range();
+
+                                    (
+                                        col_md,
+                                        row_group_data.byte_source.get_range(
+                                            byte_range.start as usize..byte_range.end as usize,
+                                        ),
+                                    )
+                                })
+                                .collect::<Vec<_>>();
+
+                            assert!(
+                                slice_range.end <= row_group_data.row_group_metadata.num_rows()
+                            );
+
+                            let array = polars_io::prelude::_internal::to_deserializer(
+                                columns_to_deserialize,
+                                arrow_field.clone(),
+                                Some(polars_parquet::read::Filter::Range(slice_range.clone())),
+                            )?;
+
+                            let series = Series::try_from((&arrow_field, array))?;
+
+                            // TODO: Also load in the metadata.
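+                            // (For scale: with VALUES_PER_THREAD = 16_777_216
+                            // and a row group of 1_048_576 rows, cols_per_task
+                            // above is 1 + 16 = 17, so each spawned task
+                            // decodes up to 17 of the projected columns.)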
+
+                            PolarsResult::Ok(series)
+                        })
+                        .collect::<PolarsResult<Vec<_>>>()
+                }
+            });
+
+        if decode_fut_iter.len() > 1 {
+            for handle in decode_fut_iter.map(|fut| {
+                async_executor::AbortOnDropHandle::new(async_executor::spawn(
+                    TaskPriority::Low,
+                    fut,
+                ))
+            }) {
+                out_columns.extend(handle.await?);
+            }
+        } else {
+            for fut in decode_fut_iter {
+                out_columns.extend(fut.await?);
+            }
+        }
+
+        let projection_height = if self.projected_arrow_fields.is_empty() {
+            slice_range.len()
+        } else {
+            debug_assert!(out_columns.len() > self.row_index.is_some() as usize);
+            out_columns.last().unwrap().len()
+        };
+
+        if let Some(RowIndex { name, offset }) = self.row_index.as_ref() {
+            let Some(offset) = (|| {
+                let offset = offset
+                    .checked_add((row_group_data.row_offset + slice_range.start) as IdxSize)?;
+                offset.checked_add(projection_height as IdxSize)?;
+
+                Some(offset)
+            })() else {
+                let msg = format!(
+                    "adding a row index column with offset {} overflows at {} rows",
+                    offset,
+                    row_group_data.row_offset + slice_range.end
+                );
+                polars_bail!(ComputeError: msg)
+            };
+
+            // The DataFrame can be empty at this point if no columns were projected from the file,
+            // so we create the row index column manually instead of using `df.with_row_index` to
+            // ensure it has the correct number of rows.
+            let mut ca = IdxCa::from_vec(
+                name.clone(),
+                (offset..offset + projection_height as IdxSize).collect(),
+            );
+            ca.set_sorted_flag(IsSorted::Ascending);
+
+            out_columns[0] = ca.into_series();
+        }
+
+        let shared_file_state = row_group_data
+            .shared_file_state
+            .get_or_init(|| async {
+                let path_index = row_group_data.path_index;
+
+                let hive_series = if let Some(hp) = self.hive_partitions.as_deref() {
+                    let mut v = hp[path_index].materialize_partition_columns();
+                    for s in v.iter_mut() {
+                        *s = s.new_from_index(0, row_group_data.file_max_row_group_height);
+                    }
+                    v
+                } else {
+                    vec![]
+                };
+
+                let file_path_series = self.include_file_paths.clone().map(|file_path_col| {
+                    StringChunked::full(
+                        file_path_col,
+                        self.scan_sources
+                            .get(path_index)
+                            .unwrap()
+                            .to_include_path_name(),
+                        row_group_data.file_max_row_group_height,
+                    )
+                    .into_series()
+                });
+
+                SharedFileState {
+                    path_index,
+                    hive_series,
+                    file_path_series,
+                }
+            })
+            .await;
+
+        assert_eq!(shared_file_state.path_index, row_group_data.path_index);
+
+        for s in &shared_file_state.hive_series {
+            debug_assert!(s.len() >= projection_height);
+            out_columns.push(s.slice(0, projection_height));
+        }
+
+        if let Some(file_path_series) = &shared_file_state.file_path_series {
+            debug_assert!(file_path_series.len() >= projection_height);
+            out_columns.push(file_path_series.slice(0, projection_height));
+        }
+
+        let df = unsafe { DataFrame::new_no_checks(out_columns) };
+
+        // Re-calculate: A slice may have been applied.
+        let cols_per_task = 1 + VALUES_PER_THREAD / df.height();
+
+        let df = if let Some(predicate) = self.physical_predicate.as_deref() {
+            let mask = predicate.evaluate_io(&df)?;
+            let mask = mask.bool().unwrap();
+
+            // Filter sequentially when a single task would cover every column.
+            if df.width() <= cols_per_task {
+                df._filter_seq(mask)?
+            } else {
+                let mask = mask.clone();
+                let cols = Arc::new(df.take_columns());
+                let mut out_cols = Vec::with_capacity(cols.len());
+
+                for handle in (0..cols.len())
+                    .step_by(cols_per_task)
+                    .map(move |offset| {
+                        let cols = cols.clone();
+                        let mask = mask.clone();
+                        async move {
+                            cols[offset..offset.saturating_add(cols_per_task).min(cols.len())]
+                                .iter()
+                                .map(|s| s.filter(&mask))
+                                .collect::<PolarsResult<Vec<_>>>()
+                        }
+                    })
+                    .map(|fut| {
+                        async_executor::AbortOnDropHandle::new(async_executor::spawn(
+                            TaskPriority::Low,
+                            fut,
+                        ))
+                    })
+                {
+                    out_cols.extend(handle.await?);
+                }
+
+                unsafe { DataFrame::new_no_checks(out_cols) }
+            }
+        } else {
+            df
+        };
+
+        assert_eq!(df.width(), out_width);
+
+        let n_morsels = if df.height() > 3 * self.ideal_morsel_size / 2 {
+            // num_rows > (1.5 * ideal_morsel_size)
+            (df.height() / self.ideal_morsel_size).max(2)
+        } else {
+            1
+        } as u64;
+
+        if n_morsels == 1 {
+            return Ok(vec![df]);
+        }
+
+        let rows_per_morsel = 1 + df.height() / n_morsels as usize;
+
+        let out = (0..i64::try_from(df.height()).unwrap())
+            .step_by(rows_per_morsel)
+            .map(|offset| df.slice(offset, rows_per_morsel))
+            .collect::<Vec<_>>();
+
+        Ok(out)
+    }
+}
+
+/// State shared across row groups for a single file.
+pub(super) struct SharedFileState {
+    path_index: usize,
+    hive_series: Vec<Series>,
+    file_path_series: Option<Series>,
+}

From 760ab20367a39529e9bcb37e24f595d0f71d2828 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Tue, 10 Sep 2024 14:15:07 +0200
Subject: [PATCH 12/28] chore: Don't raise on multiple same names in ie_join (#18658)

---
 .../polars-plan/src/plans/conversion/join.rs  | 21 ++++++++++++-------
 .../unit/operations/test_inequality_join.py   | 16 ++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/crates/polars-plan/src/plans/conversion/join.rs b/crates/polars-plan/src/plans/conversion/join.rs
index 0a073934b5d3..e7199b2c13d2 100644
--- a/crates/polars-plan/src/plans/conversion/join.rs
+++ b/crates/polars-plan/src/plans/conversion/join.rs
@@ -90,13 +90,20 @@ pub fn resolve_join(
     let left_on = to_expr_irs_ignore_alias(left_on, ctxt.expr_arena)?;
     let right_on = to_expr_irs_ignore_alias(right_on, ctxt.expr_arena)?;
     let mut joined_on = PlHashSet::new();
-    for (l, r) in left_on.iter().zip(right_on.iter()) {
-        polars_ensure!(
-            joined_on.insert((l.output_name(), r.output_name())),
-            InvalidOperation: "joining with repeated key names; already joined on {} and {}",
-            l.output_name(),
-            r.output_name()
-        )
+
+    #[cfg(feature = "iejoin")]
+    let check = !matches!(options.args.how, JoinType::IEJoin(_));
+    #[cfg(not(feature = "iejoin"))]
+    let check = true;
+    if check {
+        for (l, r) in left_on.iter().zip(right_on.iter()) {
+            polars_ensure!(
+                joined_on.insert((l.output_name(), r.output_name())),
+                InvalidOperation: "joining with repeated key names; already joined on {} and {}",
+                l.output_name(),
+                r.output_name()
+            )
+        }
     }
     drop(joined_on);
diff --git a/py-polars/tests/unit/operations/test_inequality_join.py b/py-polars/tests/unit/operations/test_inequality_join.py
index 94839c39fb4f..cc4ea5c6bb02 100644
--- a/py-polars/tests/unit/operations/test_inequality_join.py
+++ b/py-polars/tests/unit/operations/test_inequality_join.py
@@ -466,3 +466,19 @@ def test_raise_invalid_input_join_where() -> None:
     df = pl.DataFrame({"id": [1, 2]})
     with pytest.raises(pl.exceptions.InvalidOperationError):
         df.join_where(df)
+
+
+def test_ie_join_use_keys_multiple() -> None:
+    a = pl.LazyFrame({"a": [1, 2, 3], "x": [7, 2, 1]})
+    b = pl.LazyFrame({"b": [2, 2, 2], "x": [7, 1, 3]})
+
+    assert a.join_where(
+        b,
+        pl.col.a >= pl.col.b,
+        pl.col.a <= pl.col.b,
+    ).collect().sort("x_right").to_dict(as_series=False) == {
+        "a": [2, 2, 2],
+        "x": [2, 2, 2],
+        "b": [2, 2, 2],
+        "x_right": [1, 3, 7],
+    }

From 655c78110d535f76059c89b72af880842ec7a80a Mon Sep 17 00:00:00 2001
From: nameexhaustion
Date: Tue, 10 Sep 2024 22:25:40 +1000
Subject: [PATCH 13/28] refactor(rust): Rename `MemSlice::from_slice` -> `MemSlice::from_static` (#18657)

---
 crates/polars-io/src/mmap.rs    |  2 +-
 crates/polars-utils/src/mmap.rs | 21 ++++++++++++-------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/crates/polars-io/src/mmap.rs b/crates/polars-io/src/mmap.rs
index 498c73da1a9d..df91f32942f9 100644
--- a/crates/polars-io/src/mmap.rs
+++ b/crates/polars-io/src/mmap.rs
@@ -87,7 +87,7 @@ impl std::ops::Deref for ReaderBytes<'_> {
 impl ReaderBytes<'static> {
     pub fn into_mem_slice(self) -> MemSlice {
         match self {
-            ReaderBytes::Borrowed(v) => MemSlice::from_slice(v),
+            ReaderBytes::Borrowed(v) => MemSlice::from_static(v),
             ReaderBytes::Owned(v) => MemSlice::from_vec(v),
            ReaderBytes::Mapped(v, _) => MemSlice::from_mmap(Arc::new(v)),
         }
diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs
index 9e946b3dac52..1a9e20191b1e 100644
--- a/crates/polars-utils/src/mmap.rs
+++ b/crates/polars-utils/src/mmap.rs
@@ -62,6 +62,10 @@ mod private {
     }

     impl MemSlice {
+        pub const fn empty() -> Self {
+            Self::from_static(&[])
+        }
+
         /// Copy the contents into a new owned `Vec`
         #[inline(always)]
         pub fn to_vec(self) -> Vec<u8> {
@@ -101,8 +105,9 @@ mod private {

         /// Construct a `MemSlice` that simply wraps around a `&[u8]`.
         #[inline]
-        pub fn from_slice(slice: &'static [u8]) -> Self {
-            Self::from_bytes(bytes::Bytes::from_static(slice))
+        pub const fn from_static(slice: &'static [u8]) -> Self {
+            let inner = MemSliceInner::Bytes(bytes::Bytes::from_static(slice));
+            Self { slice, inner }
         }

         /// Attempt to prefetch the memory belonging to this [`MemSlice`]
@@ -170,7 +175,7 @@ impl MemReader {
     /// slice outlives the returned `MemSlice`.
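     ///
     /// A minimal usage sketch (hypothetical caller; any `'static` bytes work):
     ///
     /// ```ignore
     /// static DATA: &[u8] = &[1, 2, 3];
     /// let reader = MemReader::from_slice(DATA);
     /// ```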
#[inline] pub fn from_slice(slice: &'static [u8]) -> Self { - Self::new(MemSlice::from_slice(slice)) + Self::new(MemSlice::from_static(slice)) } #[inline(always)] @@ -377,8 +382,9 @@ mod tests { let slice = vec.as_slice(); let ptr = slice.as_ptr(); - let mem_slice = - MemSlice::from_slice(unsafe { std::mem::transmute::<&[u8], &'static [u8]>(slice) }); + let mem_slice = MemSlice::from_static(unsafe { + std::mem::transmute::<&[u8], &'static [u8]>(slice) + }); let ptr_out = mem_slice.as_ptr(); assert_eq!(ptr_out, ptr); @@ -393,8 +399,9 @@ mod tests { let vec = vec![1u8, 2, 3, 4, 5]; let slice = vec.as_slice(); - let mem_slice = - MemSlice::from_slice(unsafe { std::mem::transmute::<&[u8], &'static [u8]>(slice) }); + let mem_slice = MemSlice::from_static(unsafe { + std::mem::transmute::<&[u8], &'static [u8]>(slice) + }); let out = &*mem_slice.slice(3..5); assert_eq!(out, &slice[3..5]); From 2e92f0ab3630e9e0ed5d6898d271a0bdcfb09ee0 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Tue, 10 Sep 2024 14:59:57 +0200 Subject: [PATCH 14/28] refactor: Fix a bunch of tests for new-streaming (#18659) --- crates/polars-stream/src/nodes/reduce.rs | 3 +- py-polars/tests/unit/datatypes/test_list.py | 18 ++++----- py-polars/tests/unit/datatypes/test_struct.py | 1 + .../tests/unit/datatypes/test_temporal.py | 4 +- .../namespaces/temporal/test_datetime.py | 40 +++++++++++-------- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index 2ce9ee2c9464..f6de3bd1124a 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -59,9 +59,8 @@ impl ReduceNode { scope.spawn_task(TaskPriority::High, async move { while let Ok(morsel) = recv.recv().await { for (reducer, selector) in local_reducers.iter_mut().zip(selectors) { - // TODO: don't convert to physical representation here. let input = selector.evaluate(morsel.df(), state).await?; - reducer.update(&input.to_physical_repr())?; + reducer.update(&input)?; } } diff --git a/py-polars/tests/unit/datatypes/test_list.py b/py-polars/tests/unit/datatypes/test_list.py index 4607cfa89426..8c5502d698fd 100644 --- a/py-polars/tests/unit/datatypes/test_list.py +++ b/py-polars/tests/unit/datatypes/test_list.py @@ -114,20 +114,16 @@ def test_cast_inner() -> None: def test_list_empty_group_by_result_3521() -> None: - # Create a left relation where the join column contains a null value - left = pl.DataFrame().with_columns( - pl.lit(1).alias("group_by_column"), - pl.lit(None).cast(pl.Int32).alias("join_column"), + # Create a left relation where the join column contains a null value. + left = pl.DataFrame( + {"group_by_column": [1], "join_column": [None]}, + schema_overrides={"join_column": pl.Int64}, ) - # Create a right relation where there is a column to count distinct on - right = pl.DataFrame().with_columns( - pl.lit(1).alias("join_column"), - pl.lit(1).alias("n_unique_column"), - ) + # Create a right relation where there is a column to count distinct on. + right = pl.DataFrame({"join_column": [1], "n_unique_column": [1]}) - # Calculate n_unique after dropping nulls - # This will panic on polars version 0.13.38 and 0.13.39 + # Calculate n_unique after dropping nulls. 
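+    # An unmatched left row produces a null in `n_unique_column`; the
+    # aggregation below is expected to drop those nulls before counting.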
result = ( left.join(right, on="join_column", how="left") .group_by("group_by_column") diff --git a/py-polars/tests/unit/datatypes/test_struct.py b/py-polars/tests/unit/datatypes/test_struct.py index 49a223f76fd4..6489a83e5a6b 100644 --- a/py-polars/tests/unit/datatypes/test_struct.py +++ b/py-polars/tests/unit/datatypes/test_struct.py @@ -265,6 +265,7 @@ def test_from_dicts_struct() -> None: ] +@pytest.mark.may_fail_auto_streaming def test_list_to_struct() -> None: df = pl.DataFrame({"a": [[1, 2, 3], [1, 2]]}) assert df.select([pl.col("a").list.to_struct()]).to_series().to_list() == [ diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index ea1798fe7114..e0c9f6498c65 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1399,12 +1399,12 @@ def test_replace_time_zone_sortedness_expressions( from_tz: str | None, expected_sortedness: bool, ambiguous: str ) -> None: df = ( - pl.Series("ts", [1603584000000000, 1603587600000000]) + pl.Series("ts", [1603584000000000, 1603584060000000, 1603587600000000]) .cast(pl.Datetime("us", from_tz)) .sort() .to_frame() ) - df = df.with_columns(ambiguous=pl.Series([ambiguous] * 2)) + df = df.with_columns(ambiguous=pl.Series([ambiguous] * 3)) assert df["ts"].flags["SORTED_ASC"] result = df.select( pl.col("ts").dt.replace_time_zone("UTC", ambiguous=pl.col("ambiguous")) diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py index fb4ddee68146..a4fcfde344cc 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py @@ -138,15 +138,13 @@ def test_local_date_sortedness(time_zone: str | None, expected: bool) -> None: ser = (pl.Series([datetime(2022, 1, 1, 23)]).dt.replace_time_zone(time_zone)).sort() result = ser.dt.date() assert result.flags["SORTED_ASC"] - assert result.flags["SORTED_DESC"] is False # 2 elements - depends on time zone ser = ( pl.Series([datetime(2022, 1, 1, 23)] * 2).dt.replace_time_zone(time_zone) ).sort() result = ser.dt.date() - assert result.flags["SORTED_ASC"] == expected - assert result.flags["SORTED_DESC"] is False + assert result.flags["SORTED_ASC"] >= expected @pytest.mark.parametrize("time_zone", [None, "Asia/Kathmandu", "UTC"]) @@ -155,11 +153,16 @@ def test_local_time_sortedness(time_zone: str | None) -> None: ser = (pl.Series([datetime(2022, 1, 1, 23)]).dt.replace_time_zone(time_zone)).sort() result = ser.dt.time() assert result.flags["SORTED_ASC"] - assert not result.flags["SORTED_DESC"] - # two elements - not sorted + # three elements - not sorted ser = ( - pl.Series([datetime(2022, 1, 1, 23)] * 2).dt.replace_time_zone(time_zone) + pl.Series( + [ + datetime(2022, 1, 1, 23), + datetime(2022, 1, 2, 21), + datetime(2022, 1, 3, 22), + ] + ).dt.replace_time_zone(time_zone) ).sort() result = ser.dt.time() assert not result.flags["SORTED_ASC"] @@ -180,31 +183,34 @@ def test_local_time_before_epoch(time_unit: TimeUnit) -> None: ("time_zone", "offset", "expected"), [ (None, "1d", True), - ("Asia/Kathmandu", "1d", False), + ("Europe/London", "1d", False), ("UTC", "1d", True), (None, "1mo", True), - ("Asia/Kathmandu", "1mo", False), + ("Europe/London", "1mo", False), ("UTC", "1mo", True), (None, "1w", True), - ("Asia/Kathmandu", "1w", False), + ("Europe/London", "1w", False), ("UTC", "1w", True), (None, "1h", True), - 
("Asia/Kathmandu", "1h", True), + ("Europe/London", "1h", True), ("UTC", "1h", True), ], ) def test_offset_by_sortedness( time_zone: str | None, offset: str, expected: bool ) -> None: - # create 2 values, as a single value is always sorted - ser = ( - pl.Series( - [datetime(2022, 1, 1, 22), datetime(2022, 1, 1, 22)] - ).dt.replace_time_zone(time_zone) + s = pl.datetime_range( + datetime(2020, 10, 25), + datetime(2020, 10, 25, 3), + "30m", + time_zone=time_zone, + eager=True, ).sort() - result = ser.dt.offset_by(offset) + assert s.flags["SORTED_ASC"] + assert not s.flags["SORTED_DESC"] + result = s.dt.offset_by(offset) assert result.flags["SORTED_ASC"] == expected - assert result.flags["SORTED_DESC"] is False + assert not result.flags["SORTED_DESC"] def test_dt_datetime_date_time_invalid() -> None: From ad9a1d8c7f96b6e69ecbff16b22d8087263eeacf Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 10 Sep 2024 23:00:22 +1000 Subject: [PATCH 15/28] refactor(rust): Remove duplicate byte range calc from new parquet source (#18655) --- .../parquet_source/row_group_data_fetch.rs | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs index 761131ecf1d5..773a5a9e3625 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs @@ -187,10 +187,8 @@ impl RowGroupDataFetcher { }) } } else { - let mut iter = get_row_group_byte_ranges(&row_group_metadata); - let first = iter.next().unwrap(); - let range = - iter.fold(first, |l, r| l.start.min(r.start)..l.end.max(r.end)); + let range = row_group_metadata.full_byte_range(); + let range = range.start as usize..range.end as usize; memory_prefetch_func(unsafe { slice.get_unchecked_release(range) }) }; @@ -236,14 +234,8 @@ impl RowGroupDataFetcher { // single range request for the entire row group. During testing this // provided much higher throughput from cloud than making multiple range // request with `get_ranges()`. 
-                        let mut iter = get_row_group_byte_ranges(&row_group_metadata);
-                        let mut ranges = Vec::with_capacity(iter.len());
-                        let first = iter.next().unwrap();
-                        ranges.push(first.clone());
-                        let full_range = iter.fold(first, |l, r| {
-                            ranges.push(r.clone());
-                            l.start.min(r.start)..l.end.max(r.end)
-                        });
+                        let full_range = row_group_metadata.full_byte_range();
+                        let full_range = full_range.start as usize..full_range.end as usize;

                         let mem_slice = {
                             let full_range_2 = full_range.clone();
@@ -367,14 +359,6 @@ impl futures::stream::Stream for RowGroupDataStream {
     }
 }

-fn get_row_group_byte_ranges(
-    row_group_metadata: &RowGroupMetadata,
-) -> impl ExactSizeIterator<Item = std::ops::Range<usize>> + '_ {
-    row_group_metadata
-        .byte_ranges_iter()
-        .map(|byte_range| byte_range.start as usize..byte_range.end as usize)
-}
-
 fn get_row_group_byte_ranges_for_projection<'a>(
     row_group_metadata: &'a RowGroupMetadata,
     columns: &'a [PlSmallStr],

From 5c4e7e97a66007d3052042543486c42d23cfac74 Mon Sep 17 00:00:00 2001
From: nameexhaustion
Date: Tue, 10 Sep 2024 23:54:39 +1000
Subject: [PATCH 16/28] refactor(rust): Re-use already decoded metadata for first path (new-parquet-source) (#18656)

---
 .../src/parquet/metadata/file_metadata.rs     |   2 +-
 .../src/nodes/parquet_source/init.rs          | 409 -----------------
 .../nodes/parquet_source/metadata_fetch.rs    | 430 ++++++++++++++++++
 .../src/nodes/parquet_source/mod.rs           |   6 +-
 .../src/physical_plan/to_graph.rs             |   3 +-
 crates/polars-utils/src/mmap.rs               |   4 +-
 6 files changed, 439 insertions(+), 415 deletions(-)
 create mode 100644 crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs

diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
index 492d283f64ed..47c9f160781d 100644
--- a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
+++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs
@@ -10,7 +10,7 @@ pub use crate::parquet::thrift_format::KeyValue;
 /// Metadata for a Parquet file.
 // This is almost equal to [`parquet_format_safe::FileMetaData`] but contains the descriptors,
 // which are crucial to deserialize pages.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct FileMetadata {
     /// version of this file.
pub version: i32, diff --git a/crates/polars-stream/src/nodes/parquet_source/init.rs b/crates/polars-stream/src/nodes/parquet_source/init.rs index 2aba9642fb04..661ea4b84825 100644 --- a/crates/polars-stream/src/nodes/parquet_source/init.rs +++ b/crates/polars-stream/src/nodes/parquet_source/init.rs @@ -5,13 +5,7 @@ use futures::stream::FuturesUnordered; use futures::StreamExt; use polars_core::frame::DataFrame; use polars_error::PolarsResult; -use polars_io::prelude::FileMetadata; -use polars_io::utils::byte_source::{DynByteSource, MemSliceByteSource}; -use polars_io::utils::slice::SplitSlicePosition; -use polars_utils::mmap::MemSlice; -use polars_utils::pl_str::PlSmallStr; -use super::metadata_utils::{ensure_metadata_has_projected_fields, read_parquet_metadata_bytes}; use super::row_group_data_fetch::RowGroupDataFetcher; use super::row_group_decode::RowGroupDecoder; use super::{AsyncTaskData, ParquetSourceNode}; @@ -20,7 +14,6 @@ use crate::async_primitives::connector::connector; use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; use crate::morsel::get_ideal_morsel_size; use crate::nodes::{MorselSeq, TaskPriority}; -use crate::utils::task_handles_ext; impl ParquetSourceNode { /// # Panics @@ -269,408 +262,6 @@ impl ParquetSourceNode { (raw_morsel_receivers, morsel_stream_task_handle) } - /// Constructs the task that fetches file metadata. - /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized. - /// - /// TODO: During IR conversion the metadata of the first file is already downloaded - see if - /// we can find a way to re-use it. - #[allow(clippy::type_complexity)] - fn init_metadata_fetcher( - &mut self, - ) -> ( - tokio::sync::oneshot::Receiver>, - crate::async_primitives::connector::Receiver<( - usize, - usize, - Arc, - FileMetadata, - usize, - )>, - task_handles_ext::AbortOnDropHandle>, - ) { - let verbose = self.verbose; - let io_runtime = polars_io::pl_async::get_runtime(); - - assert!( - !self.projected_arrow_fields.is_empty() - || self.file_options.with_columns.as_deref() == Some(&[]) - ); - let projected_arrow_fields = self.projected_arrow_fields.clone(); - let needs_max_row_group_height_calc = - self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); - - let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = - tokio::sync::oneshot::channel(); - let (mut metadata_tx, metadata_rx) = connector(); - - let byte_source_builder = self.byte_source_builder.clone(); - - if self.verbose { - eprintln!( - "[ParquetSource]: Byte source builder: {:?}", - &byte_source_builder - ); - } - - let fetch_metadata_bytes_for_path_index = { - let scan_sources = &self.scan_sources; - let cloud_options = Arc::new(self.cloud_options.clone()); - - let scan_sources = scan_sources.clone(); - let cloud_options = cloud_options.clone(); - let byte_source_builder = byte_source_builder.clone(); - - move |path_idx: usize| { - let scan_sources = scan_sources.clone(); - let cloud_options = cloud_options.clone(); - let byte_source_builder = byte_source_builder.clone(); - - let handle = io_runtime.spawn(async move { - let mut byte_source = Arc::new( - scan_sources - .get(path_idx) - .unwrap() - .to_dyn_byte_source( - &byte_source_builder, - cloud_options.as_ref().as_ref(), - ) - .await?, - ); - let (metadata_bytes, maybe_full_bytes) = - read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?; - - if let Some(v) = maybe_full_bytes { - if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) { - if verbose { - eprintln!( - 
"[ParquetSource]: Parquet file was fully fetched during \ - metadata read ({} bytes).", - v.len(), - ); - } - - byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v))) - } - } - - PolarsResult::Ok((path_idx, byte_source, metadata_bytes)) - }); - - let handle = task_handles_ext::AbortOnDropHandle(handle); - - std::future::ready(handle) - } - }; - - let process_metadata_bytes = { - move |handle: task_handles_ext::AbortOnDropHandle< - PolarsResult<(usize, Arc, MemSlice)>, - >| { - let projected_arrow_fields = projected_arrow_fields.clone(); - // Run on CPU runtime - metadata deserialization is expensive, especially - // for very wide tables. - let handle = async_executor::spawn(TaskPriority::Low, async move { - let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?; - - let metadata = polars_parquet::parquet::read::deserialize_metadata( - metadata_bytes.as_ref(), - metadata_bytes.len() * 2 + 1024, - )?; - - ensure_metadata_has_projected_fields( - projected_arrow_fields.as_ref(), - &metadata, - )?; - - let file_max_row_group_height = if needs_max_row_group_height_calc { - metadata - .row_groups - .iter() - .map(|x| x.num_rows()) - .max() - .unwrap_or(0) - } else { - 0 - }; - - PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) - }); - - async_executor::AbortOnDropHandle::new(handle) - } - }; - - let metadata_prefetch_size = self.config.metadata_prefetch_size; - let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size; - - let (start_tx, start_rx) = tokio::sync::oneshot::channel(); - self.morsel_stream_starter = Some(start_tx); - - let metadata_task_handle = if self - .file_options - .slice - .map(|(offset, _)| offset >= 0) - .unwrap_or(true) - { - normalized_slice_oneshot_tx - .send( - self.file_options - .slice - .map(|(offset, len)| (offset as usize, len)), - ) - .unwrap(); - - // Safety: `offset + len` does not overflow. 
- let slice_range = self - .file_options - .slice - .map(|(offset, len)| offset as usize..offset as usize + len); - - let mut metadata_stream = futures::stream::iter(0..self.scan_sources.len()) - .map(fetch_metadata_bytes_for_path_index) - .buffered(metadata_prefetch_size) - .map(process_metadata_bytes) - .buffered(metadata_decode_ahead_size); - - let scan_sources = self.scan_sources.clone(); - - // We need to be able to both stop early as well as skip values, which is easier to do - // using a custom task instead of futures::stream - io_runtime.spawn(async move { - let current_row_offset_ref = &mut 0usize; - let current_path_index_ref = &mut 0usize; - - if start_rx.await.is_err() { - return Ok(()); - } - - if verbose { - eprintln!("[ParquetSource]: Starting data fetch") - } - - loop { - let current_path_index = *current_path_index_ref; - *current_path_index_ref += 1; - - let Some(v) = metadata_stream.next().await else { - break; - }; - - let (path_index, byte_source, metadata, file_max_row_group_height) = v - .map_err(|err| { - err.wrap_msg(|msg| { - format!( - "error at path (index: {}, path: {:?}): {}", - current_path_index, - scan_sources - .get(current_path_index) - .map(|x| PlSmallStr::from_str(x.to_include_path_name())), - msg - ) - }) - })?; - - assert_eq!(path_index, current_path_index); - - let current_row_offset = *current_row_offset_ref; - *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); - - if let Some(slice_range) = slice_range.clone() { - match SplitSlicePosition::split_slice_at_file( - current_row_offset, - metadata.num_rows, - slice_range, - ) { - SplitSlicePosition::Before => { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Skipped file at index {} ({} rows)", - current_path_index, metadata.num_rows - ); - } - continue; - }, - SplitSlicePosition::After => unreachable!(), - SplitSlicePosition::Overlapping(..) => {}, - }; - }; - - if metadata_tx - .send(( - path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) - .await - .is_err() - { - break; - } - - if let Some(slice_range) = slice_range.as_ref() { - if *current_row_offset_ref >= slice_range.end { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stopped reading at file at index {} \ - (remaining {} files will not be read)", - current_path_index, - scan_sources.len() - current_path_index - 1, - ); - } - break; - } - }; - } - - Ok(()) - }) - } else { - // Walk the files in reverse to translate the slice into a positive offset. - let slice = self.file_options.slice.unwrap(); - let slice_start_as_n_from_end = -slice.0 as usize; - - let mut metadata_stream = futures::stream::iter((0..self.scan_sources.len()).rev()) - .map(fetch_metadata_bytes_for_path_index) - .buffered(metadata_prefetch_size) - .map(process_metadata_bytes) - .buffered(metadata_decode_ahead_size); - - // Note: - // * We want to wait until the first morsel is requested before starting this - let init_negative_slice_and_metadata = async move { - let mut processed_metadata_rev = vec![]; - let mut cum_rows = 0; - - while let Some(v) = metadata_stream.next().await { - let v = v?; - let (_, _, metadata, _) = &v; - cum_rows += metadata.num_rows; - processed_metadata_rev.push(v); - - if cum_rows >= slice_start_as_n_from_end { - break; - } - } - - let (start, len) = if slice_start_as_n_from_end > cum_rows { - // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 - // rows should only give the first 25 rows. 
- let first_file_position = slice_start_as_n_from_end - cum_rows; - (0, slice.1.saturating_sub(first_file_position)) - } else { - (cum_rows - slice_start_as_n_from_end, slice.1) - }; - - if len == 0 { - processed_metadata_rev.clear(); - } - - normalized_slice_oneshot_tx - .send(Some((start, len))) - .unwrap(); - - let slice_range = start..(start + len); - - PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) - }; - - let path_count = self.scan_sources.len(); - - io_runtime.spawn(async move { - if start_rx.await.is_err() { - return Ok(()); - } - - if verbose { - eprintln!("[ParquetSource]: Starting data fetch (negative slice)") - } - - let (slice_range, processed_metadata_rev, cum_rows) = - async_executor::AbortOnDropHandle::new(async_executor::spawn( - TaskPriority::Low, - init_negative_slice_and_metadata, - )) - .await?; - - if verbose { - if let Some((path_index, ..)) = processed_metadata_rev.last() { - eprintln!( - "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ - begins at file index {}, translated to {:?}", - slice, path_index, slice_range - ); - } else { - eprintln!( - "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ - skipped all files ({} files containing {} rows)", - slice, path_count, cum_rows - ) - } - } - - let metadata_iter = processed_metadata_rev.into_iter().rev(); - let current_row_offset_ref = &mut 0usize; - - for (current_path_index, byte_source, metadata, file_max_row_group_height) in - metadata_iter - { - let current_row_offset = *current_row_offset_ref; - *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); - - assert!(matches!( - SplitSlicePosition::split_slice_at_file( - current_row_offset, - metadata.num_rows, - slice_range.clone(), - ), - SplitSlicePosition::Overlapping(..) - )); - - if metadata_tx - .send(( - current_path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) - .await - .is_err() - { - break; - } - - if *current_row_offset_ref >= slice_range.end { - if verbose { - eprintln!( - "[ParquetSource]: Slice pushdown: \ - Stopped reading at file at index {} \ - (remaining {} files will not be read)", - current_path_index, - path_count - current_path_index - 1, - ); - } - break; - } - } - - Ok(()) - }) - }; - - let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); - - ( - normalized_slice_oneshot_rx, - metadata_rx, - metadata_task_handle, - ) - } - /// Creates a `RowGroupDecoder` that turns `RowGroupData` into DataFrames. 
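     /// (The returned decoder is subsequently wrapped in an `Arc` and shared
     /// by all row-group decode tasks.)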
     /// This must be called AFTER the following have been initialized:
     /// * `self.projected_arrow_fields`
diff --git a/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs
new file mode 100644
index 000000000000..5f3281145083
--- /dev/null
+++ b/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs
@@ -0,0 +1,430 @@
+use std::sync::Arc;
+
+use futures::StreamExt;
+use polars_error::PolarsResult;
+use polars_io::prelude::FileMetadata;
+use polars_io::utils::byte_source::{DynByteSource, MemSliceByteSource};
+use polars_io::utils::slice::SplitSlicePosition;
+use polars_utils::mmap::MemSlice;
+use polars_utils::pl_str::PlSmallStr;
+
+use super::metadata_utils::{ensure_metadata_has_projected_fields, read_parquet_metadata_bytes};
+use super::ParquetSourceNode;
+use crate::async_executor;
+use crate::async_primitives::connector::connector;
+use crate::nodes::TaskPriority;
+use crate::utils::task_handles_ext;
+
+impl ParquetSourceNode {
+    /// Constructs the task that fetches file metadata.
+    /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized.
+    #[allow(clippy::type_complexity)]
+    pub(super) fn init_metadata_fetcher(
+        &mut self,
+    ) -> (
+        tokio::sync::oneshot::Receiver<Option<(usize, usize)>>,
+        crate::async_primitives::connector::Receiver<(
+            usize,
+            usize,
+            Arc<DynByteSource>,
+            FileMetadata,
+            usize,
+        )>,
+        task_handles_ext::AbortOnDropHandle<PolarsResult<()>>,
+    ) {
+        let verbose = self.verbose;
+        let io_runtime = polars_io::pl_async::get_runtime();
+
+        assert!(
+            !self.projected_arrow_fields.is_empty()
+                || self.file_options.with_columns.as_deref() == Some(&[])
+        );
+        let projected_arrow_fields = self.projected_arrow_fields.clone();
+        let needs_max_row_group_height_calc =
+            self.file_options.include_file_paths.is_some() || self.hive_parts.is_some();
+
+        let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) =
+            tokio::sync::oneshot::channel();
+        let (mut metadata_tx, metadata_rx) = connector();
+
+        let byte_source_builder = self.byte_source_builder.clone();
+
+        if self.verbose {
+            eprintln!(
+                "[ParquetSource]: Byte source builder: {:?}",
+                &byte_source_builder
+            );
+        }
+
+        let fetch_metadata_bytes_for_path_index = {
+            let scan_sources = &self.scan_sources;
+            let cloud_options = Arc::new(self.cloud_options.clone());
+
+            let scan_sources = scan_sources.clone();
+            let cloud_options = cloud_options.clone();
+            let byte_source_builder = byte_source_builder.clone();
+
+            move |path_idx: usize| {
+                let scan_sources = scan_sources.clone();
+                let cloud_options = cloud_options.clone();
+                let byte_source_builder = byte_source_builder.clone();
+
+                let handle = io_runtime.spawn(async move {
+                    let mut byte_source = Arc::new(
+                        scan_sources
+                            .get(path_idx)
+                            .unwrap()
+                            .to_dyn_byte_source(
+                                &byte_source_builder,
+                                cloud_options.as_ref().as_ref(),
+                            )
+                            .await?,
+                    );
+
+                    if path_idx == 0 {
+                        let metadata_bytes = MemSlice::EMPTY;
+                        return Ok((0, byte_source, metadata_bytes));
+                    }
+
+                    let (metadata_bytes, maybe_full_bytes) =
+                        read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?;
+
+                    if let Some(v) = maybe_full_bytes {
+                        if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) {
+                            if verbose {
+                                eprintln!(
+                                    "[ParquetSource]: Parquet file was fully fetched during \
+                                    metadata read ({} bytes).",
+                                    v.len(),
+                                );
+                            }
+
+                            byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v)))
+                        }
+                    }
+
+                    PolarsResult::Ok((path_idx, byte_source, metadata_bytes))
+                });
+
+                let handle = task_handles_ext::AbortOnDropHandle(handle);
+
+                std::future::ready(handle)
+            }
+        };
+
+        let first_metadata = self.first_metadata.clone();
+
+        let process_metadata_bytes = {
+            move |handle: task_handles_ext::AbortOnDropHandle<
+                PolarsResult<(usize, Arc<DynByteSource>, MemSlice)>,
+            >| {
+                let projected_arrow_fields = projected_arrow_fields.clone();
+                let first_metadata = first_metadata.clone();
+                // Run on CPU runtime - metadata deserialization is expensive, especially
+                // for very wide tables.
+                let handle = async_executor::spawn(TaskPriority::Low, async move {
+                    let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?;
+
+                    let metadata = if path_index == 0 {
+                        Arc::unwrap_or_clone(first_metadata)
+                    } else {
+                        polars_parquet::parquet::read::deserialize_metadata(
+                            metadata_bytes.as_ref(),
+                            metadata_bytes.len() * 2 + 1024,
+                        )?
+                    };
+
+                    ensure_metadata_has_projected_fields(
+                        projected_arrow_fields.as_ref(),
+                        &metadata,
+                    )?;
+
+                    let file_max_row_group_height = if needs_max_row_group_height_calc {
+                        metadata
+                            .row_groups
+                            .iter()
+                            .map(|x| x.num_rows())
+                            .max()
+                            .unwrap_or(0)
+                    } else {
+                        0
+                    };
+
+                    PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height))
+                });
+
+                async_executor::AbortOnDropHandle::new(handle)
+            }
+        };
+
+        let metadata_prefetch_size = self.config.metadata_prefetch_size;
+        let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size;
+
+        let (start_tx, start_rx) = tokio::sync::oneshot::channel();
+        self.morsel_stream_starter = Some(start_tx);
+
+        let metadata_task_handle = if self
+            .file_options
+            .slice
+            .map(|(offset, _)| offset >= 0)
+            .unwrap_or(true)
+        {
+            normalized_slice_oneshot_tx
+                .send(
+                    self.file_options
+                        .slice
+                        .map(|(offset, len)| (offset as usize, len)),
+                )
+                .unwrap();
+
+            // Safety: `offset + len` does not overflow.
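+            // (This branch only runs when `offset >= 0`; that `offset + len`
+            // fits in `usize` is assumed to have been validated when the
+            // slice was constructed.)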
+ let slice_range = self + .file_options + .slice + .map(|(offset, len)| offset as usize..offset as usize + len); + + let mut metadata_stream = futures::stream::iter(0..self.scan_sources.len()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + let scan_sources = self.scan_sources.clone(); + + // We need to be able to both stop early as well as skip values, which is easier to do + // using a custom task instead of futures::stream + io_runtime.spawn(async move { + let current_row_offset_ref = &mut 0usize; + let current_path_index_ref = &mut 0usize; + + if start_rx.await.is_err() { + return Ok(()); + } + + if verbose { + eprintln!("[ParquetSource]: Starting data fetch") + } + + loop { + let current_path_index = *current_path_index_ref; + *current_path_index_ref += 1; + + let Some(v) = metadata_stream.next().await else { + break; + }; + + let (path_index, byte_source, metadata, file_max_row_group_height) = v + .map_err(|err| { + err.wrap_msg(|msg| { + format!( + "error at path (index: {}, path: {:?}): {}", + current_path_index, + scan_sources + .get(current_path_index) + .map(|x| PlSmallStr::from_str(x.to_include_path_name())), + msg + ) + }) + })?; + + assert_eq!(path_index, current_path_index); + + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + if let Some(slice_range) = slice_range.clone() { + match SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range, + ) { + SplitSlicePosition::Before => { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Skipped file at index {} ({} rows)", + current_path_index, metadata.num_rows + ); + } + continue; + }, + SplitSlicePosition::After => unreachable!(), + SplitSlicePosition::Overlapping(..) => {}, + }; + }; + + if metadata_tx + .send(( + path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) + .await + .is_err() + { + break; + } + + if let Some(slice_range) = slice_range.as_ref() { + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + scan_sources.len() - current_path_index - 1, + ); + } + break; + } + }; + } + + Ok(()) + }) + } else { + // Walk the files in reverse to translate the slice into a positive offset. + let slice = self.file_options.slice.unwrap(); + let slice_start_as_n_from_end = -slice.0 as usize; + + let mut metadata_stream = futures::stream::iter((0..self.scan_sources.len()).rev()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + // Note: + // * We want to wait until the first morsel is requested before starting this + let init_negative_slice_and_metadata = async move { + let mut processed_metadata_rev = vec![]; + let mut cum_rows = 0; + + while let Some(v) = metadata_stream.next().await { + let v = v?; + let (_, _, metadata, _) = &v; + cum_rows += metadata.num_rows; + processed_metadata_rev.push(v); + + if cum_rows >= slice_start_as_n_from_end { + break; + } + } + + let (start, len) = if slice_start_as_n_from_end > cum_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. 
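+                    // Worked through: slice_start_as_n_from_end = 100 and
+                    // cum_rows = 50, so first_file_position = 100 - 50 = 50
+                    // and the slice becomes (start: 0, len: 75 - 50 = 25),
+                    // i.e. exactly those first 25 rows.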
+ let first_file_position = slice_start_as_n_from_end - cum_rows; + (0, slice.1.saturating_sub(first_file_position)) + } else { + (cum_rows - slice_start_as_n_from_end, slice.1) + }; + + if len == 0 { + processed_metadata_rev.clear(); + } + + normalized_slice_oneshot_tx + .send(Some((start, len))) + .unwrap(); + + let slice_range = start..(start + len); + + PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) + }; + + let path_count = self.scan_sources.len(); + + io_runtime.spawn(async move { + if start_rx.await.is_err() { + return Ok(()); + } + + if verbose { + eprintln!("[ParquetSource]: Starting data fetch (negative slice)") + } + + let (slice_range, processed_metadata_rev, cum_rows) = + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + init_negative_slice_and_metadata, + )) + .await?; + + if verbose { + if let Some((path_index, ..)) = processed_metadata_rev.last() { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + begins at file index {}, translated to {:?}", + slice, path_index, slice_range + ); + } else { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + skipped all files ({} files containing {} rows)", + slice, path_count, cum_rows + ) + } + } + + let metadata_iter = processed_metadata_rev.into_iter().rev(); + let current_row_offset_ref = &mut 0usize; + + for (current_path_index, byte_source, metadata, file_max_row_group_height) in + metadata_iter + { + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + assert!(matches!( + SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range.clone(), + ), + SplitSlicePosition::Overlapping(..) 
+ )); + + if metadata_tx + .send(( + current_path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) + .await + .is_err() + { + break; + } + + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + path_count - current_path_index - 1, + ); + } + break; + } + } + + Ok(()) + }) + }; + + let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); + + ( + normalized_slice_oneshot_rx, + metadata_rx, + metadata_task_handle, + ) + } +} diff --git a/crates/polars-stream/src/nodes/parquet_source/mod.rs b/crates/polars-stream/src/nodes/parquet_source/mod.rs index a9344aa35c21..10df7ef0e3bf 100644 --- a/crates/polars-stream/src/nodes/parquet_source/mod.rs +++ b/crates/polars-stream/src/nodes/parquet_source/mod.rs @@ -8,7 +8,7 @@ use polars_error::PolarsResult; use polars_expr::prelude::{phys_expr_to_io_expr, PhysicalExpr}; use polars_io::cloud::CloudOptions; use polars_io::predicates::PhysicalIoExpr; -use polars_io::prelude::ParquetOptions; +use polars_io::prelude::{FileMetadata, ParquetOptions}; use polars_io::utils::byte_source::DynByteSourceBuilder; use polars_plan::plans::hive::HivePartitions; use polars_plan::plans::{FileInfo, ScanSources}; @@ -23,6 +23,7 @@ use crate::morsel::SourceToken; mod init; mod mem_prefetch_funcs; +mod metadata_fetch; mod metadata_utils; mod row_group_data_fetch; mod row_group_decode; @@ -41,6 +42,7 @@ pub struct ParquetSourceNode { options: ParquetOptions, cloud_options: Option, file_options: FileScanOptions, + first_metadata: Arc, // Run-time vars config: Config, verbose: bool, @@ -77,6 +79,7 @@ impl ParquetSourceNode { options: ParquetOptions, cloud_options: Option, file_options: FileScanOptions, + first_metadata: Arc, ) -> Self { let verbose = config::verbose(); @@ -95,6 +98,7 @@ impl ParquetSourceNode { options, cloud_options, file_options, + first_metadata, config: Config { // Initialized later diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index 9166acefa2e3..e5cbf86b0351 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -293,7 +293,7 @@ fn to_graph_rec<'a>( FileScan::Parquet { options, cloud_options, - metadata: _, + metadata: first_metadata, } => { if std::env::var("POLARS_DISABLE_PARQUET_SOURCE").as_deref() != Ok("1") { ctx.graph.add_node( @@ -305,6 +305,7 @@ fn to_graph_rec<'a>( options, cloud_options, file_options, + first_metadata.unwrap(), ), [], ) diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index 1a9e20191b1e..cd33ab85438a 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -62,9 +62,7 @@ mod private { } impl MemSlice { - pub const fn empty() -> Self { - Self::from_static(&[]) - } + pub const EMPTY: Self = Self::from_static(&[]); /// Copy the contents into a new owned `Vec` #[inline(always)] From 9d01442d3efa2c3bd6ddec4a55612d07e76e1600 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 11 Sep 2024 18:12:22 +1200 Subject: [PATCH 17/28] docs(python): Update join_where docs to clarify behaviour (#18670) --- py-polars/polars/dataframe/frame.py | 14 +++++++++----- py-polars/polars/lazyframe/frame.py | 10 +++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py 
b/py-polars/polars/dataframe/frame.py index 2d8710b9f431..83e55cfe64b3 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -7103,21 +7103,25 @@ def join_where( suffix: str = "_right", ) -> DataFrame: """ - Perform a join based on one or multiple equality predicates. + Perform a join based on one or multiple (in)equality predicates. + + This performs an inner join, so only rows where all predicates are true + are included in the result, and a row from either DataFrame may be included + multiple times in the result. + + .. note:: + The row order of the input DataFrames is not preserved. .. warning:: This functionality is experimental. It may be changed at any point without it being considered a breaking change. - A row from this table may be included in zero or multiple rows in the result, - and the relative order of rows may differ between the input and output tables. - Parameters ---------- other DataFrame to join with. *predicates - (In)Equality condition to join the two table on. + (In)Equality condition to join the two tables on. When a column name occurs in both tables, the proper suffix must be applied in the predicate. suffix diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index ec329898441a..d50c2867dd24 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4571,8 +4571,12 @@ def join_where( """ Perform a join based on one or multiple (in)equality predicates. - A row from this table may be included in zero or multiple rows in the result, - and the relative order of rows may differ between the input and output tables. + This performs an inner join, so only rows where all predicates are true + are included in the result, and a row from either DataFrame may be included + multiple times in the result. + + .. note:: + The row order of the input DataFrames is not preserved. .. warning:: This functionality is experimental. It may be @@ -4583,7 +4587,7 @@ def join_where( other DataFrame to join with. *predicates - (In)Equality condition to join the two table on. + (In)Equality condition to join the two tables on. When a column name occurs in both tables, the proper suffix must be applied in the predicate. 
suffix From 43cc962da2c5bc46802bdb634fe5ec01d95f48b7 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Wed, 11 Sep 2024 08:44:16 +0200 Subject: [PATCH 18/28] perf: Back arrow arrays with SharedStorage which can have non-refcounted static slices (#18666) --- crates/polars-arrow/src/array/binview/mod.rs | 6 +- .../src/array/static_array_collect.rs | 10 +- crates/polars-arrow/src/bitmap/immutable.rs | 150 +++++------- crates/polars-arrow/src/bitmap/mutable.rs | 4 +- crates/polars-arrow/src/buffer/immutable.rs | 66 +++--- crates/polars-arrow/src/buffer/mod.rs | 96 -------- crates/polars-arrow/src/ffi/array.rs | 18 +- crates/polars-arrow/src/lib.rs | 1 + crates/polars-arrow/src/storage.rs | 215 ++++++++++++++++++ .../chunked_array/object/extension/drop.rs | 2 +- crates/polars-utils/src/foreign_vec.rs | 100 -------- crates/polars-utils/src/lib.rs | 1 - .../polars/tests/it/arrow/buffer/immutable.rs | 2 +- 13 files changed, 322 insertions(+), 349 deletions(-) create mode 100644 crates/polars-arrow/src/storage.rs delete mode 100644 crates/polars-utils/src/foreign_vec.rs diff --git a/crates/polars-arrow/src/array/binview/mod.rs b/crates/polars-arrow/src/array/binview/mod.rs index d3fcc3c263d3..2e0eee85d2ef 100644 --- a/crates/polars-arrow/src/array/binview/mod.rs +++ b/crates/polars-arrow/src/array/binview/mod.rs @@ -403,10 +403,10 @@ impl BinaryViewArrayGeneric { self.buffers .iter() .map(|buf| { - if buf.shared_count_strong() == 1 { - buf.len() - } else { + if buf.storage_refcount() > 1 { 0 + } else { + buf.len() } }) .sum() diff --git a/crates/polars-arrow/src/array/static_array_collect.rs b/crates/polars-arrow/src/array/static_array_collect.rs index 0b30ee25b365..296d93502abe 100644 --- a/crates/polars-arrow/src/array/static_array_collect.rs +++ b/crates/polars-arrow/src/array/static_array_collect.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::sync::Arc; use polars_utils::no_call_const; @@ -15,6 +14,7 @@ use crate::datatypes::ArrowDataType; use crate::legacy::prelude::fixed_size_list::AnonymousBuilder as AnonymousFixedSizeListArrayBuilder; use crate::legacy::prelude::list::AnonymousBuilder as AnonymousListArrayBuilder; use crate::legacy::trusted_len::TrustedLenPush; +use crate::storage::SharedStorage; use crate::trusted_len::TrustedLen; use crate::types::NativeType; @@ -256,7 +256,7 @@ macro_rules! impl_collect_vec_validity { unsafe { // SAFETY: we made sure the null_count is correct. Some(Bitmap::from_inner_unchecked( - Arc::new(bitmap.into()), + SharedStorage::from_vec(bitmap), 0, buf.len(), Some(null_count), @@ -317,7 +317,7 @@ macro_rules! impl_trusted_collect_vec_validity { unsafe { // SAFETY: we made sure the null_count is correct. Some(Bitmap::from_inner_unchecked( - Arc::new(bitmap.into()), + SharedStorage::from_vec(bitmap), 0, buf.len(), Some(null_count), @@ -766,7 +766,7 @@ macro_rules! impl_collect_bool_validity { let false_count = len - true_count; let values = unsafe { - Bitmap::from_inner_unchecked(Arc::new(buf.into()), 0, len, Some(false_count)) + Bitmap::from_inner_unchecked(SharedStorage::from_vec(buf), 0, len, Some(false_count)) }; let null_count = len - nonnull_count; @@ -774,7 +774,7 @@ macro_rules! impl_collect_bool_validity { unsafe { // SAFETY: we made sure the null_count is correct. 
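            // Passing the exact count here seeds the bitmap's unset-bit cache,
            // so a later `unset_bits()` call can skip the popcount scan.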
Some(Bitmap::from_inner_unchecked( - Arc::new(validity.into()), + SharedStorage::from_vec(validity), 0, len, Some(null_count), diff --git a/crates/polars-arrow/src/bitmap/immutable.rs b/crates/polars-arrow/src/bitmap/immutable.rs index 6ad76a07b639..c9aa0b681b4a 100644 --- a/crates/polars-arrow/src/bitmap/immutable.rs +++ b/crates/polars-arrow/src/bitmap/immutable.rs @@ -1,9 +1,8 @@ use std::ops::Deref; use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::{Arc, OnceLock}; +use std::sync::LazyLock; use either::Either; -use parking_lot::RwLockUpgradableReadGuard; use polars_error::{polars_bail, PolarsResult}; use super::utils::{count_zeros, fmt, get_bit, get_bit_unchecked, BitChunk, BitChunks, BitmapIter}; @@ -13,8 +12,8 @@ use crate::bitmap::aligned::AlignedBitmapSlice; use crate::bitmap::iterator::{ FastU32BitmapIter, FastU56BitmapIter, FastU64BitmapIter, TrueIdxIter, }; -use crate::buffer::Bytes; use crate::legacy::utils::FromTrustedLenIterator; +use crate::storage::SharedStorage; use crate::trusted_len::TrustedLen; const UNKNOWN_BIT_COUNT: u64 = u64::MAX; @@ -52,7 +51,7 @@ const UNKNOWN_BIT_COUNT: u64 = u64::MAX; /// let same: Bitmap = sliced.into_mut().left().unwrap(); /// ``` pub struct Bitmap { - bytes: Arc>, + storage: SharedStorage, // Both offset and length are measured in bits. They are used to bound the // bitmap to a region of Bytes. offset: usize, @@ -73,7 +72,7 @@ fn has_cached_unset_bit_count(ubcc: u64) -> bool { impl Clone for Bitmap { fn clone(&self) -> Self { Self { - bytes: Arc::clone(&self.bytes), + storage: self.storage.clone(), offset: self.offset, length: self.length, unset_bit_count_cache: AtomicU64::new( @@ -121,9 +120,9 @@ impl Bitmap { pub fn try_new(bytes: Vec, length: usize) -> PolarsResult { check(&bytes, 0, length)?; Ok(Self { + storage: SharedStorage::from_vec(bytes), length, offset: 0, - bytes: Arc::new(bytes.into()), unset_bit_count_cache: AtomicU64::new(if length == 0 { 0 } else { UNKNOWN_BIT_COUNT }), }) } @@ -142,32 +141,32 @@ impl Bitmap { /// Returns a new iterator of `bool` over this bitmap pub fn iter(&self) -> BitmapIter { - BitmapIter::new(&self.bytes, self.offset, self.length) + BitmapIter::new(&self.storage, self.offset, self.length) } /// Returns an iterator over bits in bit chunks [`BitChunk`]. /// /// This iterator is useful to operate over multiple bits via e.g. bitwise. pub fn chunks(&self) -> BitChunks { - BitChunks::new(&self.bytes, self.offset, self.length) + BitChunks::new(&self.storage, self.offset, self.length) } /// Returns a fast iterator that gives 32 bits at a time. /// Has a remainder that must be handled separately. pub fn fast_iter_u32(&self) -> FastU32BitmapIter<'_> { - FastU32BitmapIter::new(&self.bytes, self.offset, self.length) + FastU32BitmapIter::new(&self.storage, self.offset, self.length) } /// Returns a fast iterator that gives 56 bits at a time. /// Has a remainder that must be handled separately. pub fn fast_iter_u56(&self) -> FastU56BitmapIter<'_> { - FastU56BitmapIter::new(&self.bytes, self.offset, self.length) + FastU56BitmapIter::new(&self.storage, self.offset, self.length) } /// Returns a fast iterator that gives 64 bits at a time. /// Has a remainder that must be handled separately. pub fn fast_iter_u64(&self) -> FastU64BitmapIter<'_> { - FastU64BitmapIter::new(&self.bytes, self.offset, self.length) + FastU64BitmapIter::new(&self.storage, self.offset, self.length) } /// Returns an iterator that only iterates over the set bits. 
@@ -177,7 +176,7 @@ impl Bitmap { /// Returns the bits of this [`Bitmap`] as a [`AlignedBitmapSlice`]. pub fn aligned(&self) -> AlignedBitmapSlice<'_, T> { - AlignedBitmapSlice::new(&self.bytes, self.offset, self.length) + AlignedBitmapSlice::new(&self.storage, self.offset, self.length) } /// Returns the byte slice of this [`Bitmap`]. @@ -192,7 +191,7 @@ impl Bitmap { let start = self.offset / 8; let len = (self.offset % 8 + self.length).saturating_add(7) / 8; ( - &self.bytes[start..start + len], + &self.storage[start..start + len], self.offset % 8, self.length, ) @@ -224,7 +223,7 @@ impl Bitmap { /// computed. Repeated calls use the cached bitcount. pub fn unset_bits(&self) -> usize { self.lazy_unset_bits().unwrap_or_else(|| { - let zeros = count_zeros(&self.bytes, self.offset, self.length); + let zeros = count_zeros(&self.storage, self.offset, self.length); self.unset_bit_count_cache .store(zeros as u64, Ordering::Relaxed); zeros @@ -294,8 +293,9 @@ impl Bitmap { if length + small_portion >= self.length { // Subtract the null count of the chunks we slice off. let slice_end = self.offset + offset + length; - let head_count = count_zeros(&self.bytes, self.offset, offset); - let tail_count = count_zeros(&self.bytes, slice_end, self.length - length - offset); + let head_count = count_zeros(&self.storage, self.offset, offset); + let tail_count = + count_zeros(&self.storage, slice_end, self.length - length - offset); let new_count = *unset_bit_count_cache - head_count as u64 - tail_count as u64; *unset_bit_count_cache = new_count; } else { @@ -334,7 +334,7 @@ impl Bitmap { /// Panics iff `i >= self.len()`. #[inline] pub fn get_bit(&self, i: usize) -> bool { - get_bit(&self.bytes, self.offset + i) + get_bit(&self.storage, self.offset + i) } /// Unsafely returns whether the bit at position `i` is set. @@ -343,13 +343,13 @@ impl Bitmap { /// Unsound iff `i >= self.len()`. #[inline] pub unsafe fn get_bit_unchecked(&self, i: usize) -> bool { - get_bit_unchecked(&self.bytes, self.offset + i) + get_bit_unchecked(&self.storage, self.offset + i) } /// Returns a pointer to the start of this [`Bitmap`] (ignores `offsets`) /// This pointer is allocated iff `self.len() > 0`. pub(crate) fn as_ptr(&self) -> *const u8 { - self.bytes.deref().as_ptr() + self.storage.deref().as_ptr() } /// Returns a pointer to the start of this [`Bitmap`] (ignores `offsets`) @@ -366,15 +366,12 @@ impl Bitmap { /// * this [`Bitmap`] has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) /// * this [`Bitmap`] was not imported from the c data interface (FFI) pub fn into_mut(mut self) -> Either { - match ( - self.offset, - Arc::get_mut(&mut self.bytes).and_then(|b| b.get_vec()), - ) { - (0, Some(v)) => { - let data = std::mem::take(v); - Either::Right(MutableBitmap::from_vec(data, self.length)) + match self.storage.try_into_vec() { + Ok(v) => Either::Right(MutableBitmap::from_vec(v, self.length)), + Err(storage) => { + self.storage = storage; + Either::Left(self) }, - _ => Either::Left(self), } } @@ -390,7 +387,7 @@ impl Bitmap { let vec = chunk_iter_to_vec(chunks.chain(std::iter::once(remainder))); MutableBitmap::from_vec(vec, data.length) } else { - MutableBitmap::from_vec(data.bytes.as_ref().to_vec(), data.length) + MutableBitmap::from_vec(data.storage.as_ref().to_vec(), data.length) } }, Either::Right(data) => data, @@ -400,57 +397,23 @@ impl Bitmap { /// Initializes an new [`Bitmap`] filled with unset values. 
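    // A single leaked 1 MiB zero block covers bitmaps of up to 8_388_608 bits
    // without allocating or refcounting; longer bitmaps fall back to a freshly
    // zeroed Vec.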
#[inline] pub fn new_zeroed(length: usize) -> Self { - // There are quite some situations where we just want a zeroed out Bitmap, since that would - // constantly need to reallocate we make a static that contains the largest allocation. - // Then, we can just take an Arc::clone of that slice everytime or grow it if needed. - static GLOBAL_ZERO_BYTES: OnceLock>>> = OnceLock::new(); - - let rwlock_zero_bytes = GLOBAL_ZERO_BYTES.get_or_init(|| { - let byte_length = length.div_ceil(8).next_power_of_two(); - parking_lot::RwLock::new(Arc::new(Bytes::from(vec![0; byte_length]))) - }); - - let unset_bit_count_cache = AtomicU64::new(length as u64); - - let zero_bytes = rwlock_zero_bytes.upgradable_read(); - if zero_bytes.len() * 8 >= length { - let bytes = zero_bytes.clone(); - return Bitmap { - bytes, - offset: 0, - length, - unset_bit_count_cache, - }; - } - - let mut zero_bytes = RwLockUpgradableReadGuard::upgrade(zero_bytes); - - // Race Condition: - // By the time we got here, another Guard could have been upgraded, and the buffer - // could have been expanded already. So we want to check again whether we cannot just take - // that buffer. - if zero_bytes.len() * 8 >= length { - let bytes = zero_bytes.clone(); - return Bitmap { - bytes, - offset: 0, - length, - unset_bit_count_cache, - }; - } - - // Let do exponential increases so that we are not constantly allocating new - // buffers. - let byte_length = length.div_ceil(8).next_power_of_two(); - - let bytes = Arc::new(Bytes::from(vec![0; byte_length])); - *zero_bytes = bytes.clone(); - - Bitmap { - bytes, + // We intentionally leak 1MiB of zeroed memory once so we don't have to + // refcount it. + const GLOBAL_ZERO_SIZE: usize = 1024 * 1024; + static GLOBAL_ZEROES: LazyLock> = + LazyLock::new(|| SharedStorage::from_static(vec![0; GLOBAL_ZERO_SIZE].leak())); + + let bytes_needed = length.div_ceil(8); + let storage = if bytes_needed <= GLOBAL_ZERO_SIZE { + GLOBAL_ZEROES.clone() + } else { + SharedStorage::from_vec(vec![0; bytes_needed]) + }; + Self { + storage, offset: 0, length, - unset_bit_count_cache, + unset_bit_count_cache: AtomicU64::new(length as u64), } } @@ -464,13 +427,20 @@ impl Bitmap { vec![0; length.saturating_add(7) / 8] }; let unset_bits = if value { 0 } else { length }; - unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, Some(unset_bits)) } + unsafe { + Bitmap::from_inner_unchecked( + SharedStorage::from_vec(bytes), + 0, + length, + Some(unset_bits), + ) + } } /// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits. #[inline] pub fn null_count_range(&self, offset: usize, length: usize) -> usize { - count_zeros(&self.bytes, self.offset + offset, length) + count_zeros(&self.storage, self.offset + offset, length) } /// Creates a new [`Bitmap`] from a slice and length. @@ -506,12 +476,12 @@ impl Bitmap { /// # Safety /// Callers must ensure all invariants of this struct are upheld. 
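    // In particular: `offset + length` must lie within `storage` (only
    // debug-asserted below), and `unset_bits`, when given, must be the exact
    // count, as it seeds the cache consulted by `unset_bits()`.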
pub unsafe fn from_inner_unchecked( - bytes: Arc>, + storage: SharedStorage, offset: usize, length: usize, unset_bits: Option, ) -> Self { - debug_assert!(check(&bytes[..], offset, length).is_ok()); + debug_assert!(check(&storage[..], offset, length).is_ok()); let unset_bit_count_cache = if let Some(n) = unset_bits { AtomicU64::new(n as u64) @@ -519,7 +489,7 @@ impl Bitmap { AtomicU64::new(UNKNOWN_BIT_COUNT) }; Self { - bytes, + storage, offset, length, unset_bit_count_cache, @@ -633,10 +603,10 @@ impl Bitmap { let length = value.len(); let unset_bits = value.null_count(); Self { + storage: SharedStorage::from_arrow_buffer(value.buffer().clone()), offset, length, unset_bit_count_cache: AtomicU64::new(unset_bits as u64), - bytes: Arc::new(crate::buffer::to_bytes(value.buffer().clone())), } } } @@ -646,7 +616,7 @@ impl<'a> IntoIterator for &'a Bitmap { type IntoIter = BitmapIter<'a>; fn into_iter(self) -> Self::IntoIter { - BitmapIter::<'a>::new(&self.bytes, self.offset, self.length) + BitmapIter::<'a>::new(&self.storage, self.offset, self.length) } } @@ -663,7 +633,7 @@ impl IntoIterator for Bitmap { impl From for arrow_buffer::buffer::NullBuffer { fn from(value: Bitmap) -> Self { let null_count = value.unset_bits(); - let buffer = crate::buffer::to_buffer(value.bytes); + let buffer = value.storage.into_arrow_buffer(); let buffer = arrow_buffer::buffer::BooleanBuffer::new(buffer, value.offset, value.length); // SAFETY: null count is accurate unsafe { arrow_buffer::buffer::NullBuffer::new_unchecked(buffer, null_count) } @@ -677,8 +647,6 @@ impl Splitable for Bitmap { } unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) { - let bytes = &self.bytes; - if offset == 0 { return (Self::new(), self.clone()); } @@ -709,12 +677,12 @@ impl Splitable for Bitmap { if lhs_length <= rhs_length { if rhs_length + small_portion >= self.length { - let count = count_zeros(&self.bytes, self.offset, lhs_length) as u64; + let count = count_zeros(&self.storage, self.offset, lhs_length) as u64; lhs_ubcc = count; rhs_ubcc = ubcc - count; } } else if lhs_length + small_portion >= self.length { - let count = count_zeros(&self.bytes, self.offset + offset, rhs_length) as u64; + let count = count_zeros(&self.storage, self.offset + offset, rhs_length) as u64; lhs_ubcc = ubcc - count; rhs_ubcc = count; } @@ -726,13 +694,13 @@ impl Splitable for Bitmap { ( Self { - bytes: bytes.clone(), + storage: self.storage.clone(), offset: self.offset, length: lhs_length, unset_bit_count_cache: AtomicU64::new(lhs_ubcc), }, Self { - bytes: bytes.clone(), + storage: self.storage.clone(), offset: self.offset + offset, length: rhs_length, unset_bit_count_cache: AtomicU64::new(rhs_ubcc), diff --git a/crates/polars-arrow/src/bitmap/mutable.rs b/crates/polars-arrow/src/bitmap/mutable.rs index c81df93f1cdc..d030682a63a7 100644 --- a/crates/polars-arrow/src/bitmap/mutable.rs +++ b/crates/polars-arrow/src/bitmap/mutable.rs @@ -1,5 +1,4 @@ use std::hint::unreachable_unchecked; -use std::sync::Arc; use polars_error::{polars_bail, PolarsResult}; @@ -8,6 +7,7 @@ use super::utils::{ }; use super::{intersects_with_mut, Bitmap}; use crate::bitmap::utils::{get_bit_unchecked, merge_reversed, set_bit_unchecked}; +use crate::storage::SharedStorage; use crate::trusted_len::TrustedLen; /// A container of booleans. [`MutableBitmap`] is semantically equivalent @@ -374,7 +374,7 @@ impl From for Option { // SAFETY: invariants of the `MutableBitmap` equal that of `Bitmap`. 
let bitmap = unsafe { Bitmap::from_inner_unchecked( - Arc::new(buffer.buffer.into()), + SharedStorage::from_vec(buffer.buffer), 0, buffer.length, Some(unset_bits), diff --git a/crates/polars-arrow/src/buffer/immutable.rs b/crates/polars-arrow/src/buffer/immutable.rs index 21f765d46408..1dfe805ffc57 100644 --- a/crates/polars-arrow/src/buffer/immutable.rs +++ b/crates/polars-arrow/src/buffer/immutable.rs @@ -1,11 +1,11 @@ use std::ops::Deref; -use std::sync::Arc; use either::Either; use num_traits::Zero; -use super::{Bytes, IntoIter}; +use super::IntoIter; use crate::array::{ArrayAccessor, Splitable}; +use crate::storage::SharedStorage; /// [`Buffer`] is a contiguous memory region that can be shared across /// thread boundaries. @@ -39,7 +39,7 @@ use crate::array::{ArrayAccessor, Splitable}; #[derive(Clone)] pub struct Buffer { /// The internal byte buffer. - storage: Arc>, + storage: SharedStorage, /// A pointer into the buffer where our data starts. ptr: *const T, @@ -48,8 +48,8 @@ pub struct Buffer { length: usize, } -unsafe impl Sync for Buffer {} -unsafe impl Send for Buffer {} +unsafe impl Sync for Buffer {} +unsafe impl Send for Buffer {} impl PartialEq for Buffer { #[inline] @@ -79,11 +79,11 @@ impl Buffer { } /// Auxiliary method to create a new Buffer - pub(crate) fn from_bytes(bytes: Bytes) -> Self { - let ptr = bytes.as_ptr(); - let length = bytes.len(); + pub(crate) fn from_storage(storage: SharedStorage) -> Self { + let ptr = storage.as_ptr(); + let length = storage.len(); Buffer { - storage: Arc::new(bytes), + storage, ptr, length, } @@ -204,7 +204,7 @@ impl Buffer { /// Returns a mutable reference to its underlying [`Vec`], if possible. /// /// This operation returns [`Either::Right`] iff this [`Buffer`]: - /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) + /// * has no alive clones /// * has not been imported from the C data interface (FFI) #[inline] pub fn into_mut(mut self) -> Either> { @@ -212,36 +212,31 @@ impl Buffer { if self.is_sliced() { return Either::Left(self); } - match Arc::get_mut(&mut self.storage) - .and_then(|b| b.get_vec()) - .map(std::mem::take) - { - Some(inner) => Either::Right(inner), - None => Either::Left(self), + match self.storage.try_into_vec() { + Ok(v) => Either::Right(v), + Err(slf) => { + self.storage = slf; + Either::Left(self) + }, } } /// Returns a mutable reference to its slice, if possible. /// /// This operation returns [`Some`] iff this [`Buffer`]: - /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) + /// * has no alive clones /// * has not been imported from the C data interface (FFI) #[inline] pub fn get_mut_slice(&mut self) -> Option<&mut [T]> { let offset = self.offset(); - let unique = Arc::get_mut(&mut self.storage)?; - let vec = unique.get_vec()?; - Some(unsafe { vec.get_unchecked_mut(offset..offset + self.length) }) - } - - /// Get the strong count of underlying `Arc` data buffer. - pub fn shared_count_strong(&self) -> usize { - Arc::strong_count(&self.storage) + let slice = self.storage.try_as_mut_slice()?; + Some(unsafe { slice.get_unchecked_mut(offset..offset + self.length) }) } - /// Get the weak count of underlying `Arc` data buffer. - pub fn shared_count_weak(&self) -> usize { - Arc::weak_count(&self.storage) + /// Since this takes a shared reference to self, beware that others might + /// increment this after you've checked it's equal to 1. 
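+    // A count of 1 observed through `&self` is therefore only a hint; ruling
+    // out concurrent clones requires exclusive access, as in
+    // `SharedStorage::is_exclusive`.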
+ pub fn storage_refcount(&self) -> u64 { + self.storage.refcount() } } @@ -262,15 +257,8 @@ impl Buffer { impl From> for Buffer { #[inline] - fn from(p: Vec) -> Self { - let bytes: Bytes = p.into(); - let ptr = bytes.as_ptr(); - let length = bytes.len(); - Self { - storage: Arc::new(bytes), - ptr, - length, - } + fn from(v: Vec) -> Self { + Self::from_storage(SharedStorage::from_vec(v)) } } @@ -303,7 +291,7 @@ impl IntoIterator for Buffer { #[cfg(feature = "arrow_rs")] impl From for Buffer { fn from(value: arrow_buffer::Buffer) -> Self { - Self::from_bytes(crate::buffer::to_bytes(value)) + Self::from_storage(SharedStorage::from_arrow_buffer(value)) } } @@ -311,7 +299,7 @@ impl From for Buffer { impl From> for arrow_buffer::Buffer { fn from(value: Buffer) -> Self { let offset = value.offset(); - crate::buffer::to_buffer(value.storage).slice_with_length( + value.storage.into_arrow_buffer().slice_with_length( offset * std::mem::size_of::(), value.length * std::mem::size_of::(), ) diff --git a/crates/polars-arrow/src/buffer/mod.rs b/crates/polars-arrow/src/buffer/mod.rs index a5c3aaf90763..386545482d09 100644 --- a/crates/polars-arrow/src/buffer/mod.rs +++ b/crates/polars-arrow/src/buffer/mod.rs @@ -3,101 +3,5 @@ mod immutable; mod iterator; -use std::ops::Deref; - -use crate::ffi::InternalArrowArray; - -pub(crate) enum BytesAllocator { - // Dead code lint is a false positive. - // remove once fixed in rustc - #[allow(dead_code)] - InternalArrowArray(InternalArrowArray), - - #[cfg(feature = "arrow_rs")] - // Dead code lint is a false positive. - // remove once fixed in rustc - #[allow(dead_code)] - Arrow(arrow_buffer::Buffer), -} -pub(crate) type BytesInner = polars_utils::foreign_vec::ForeignVec; - -/// Bytes representation. -#[repr(transparent)] -pub struct Bytes(BytesInner); - -impl Bytes { - /// Takes ownership of an allocated memory region. - /// # Panics - /// This function panics if and only if pointer is not null - /// - /// # Safety - /// This function is safe if and only if `ptr` is valid for `length` - /// # Implementation - /// This function leaks if and only if `owner` does not deallocate - /// the region `[ptr, ptr+length[` when dropped. - #[inline] - pub(crate) unsafe fn from_foreign(ptr: *const T, length: usize, owner: BytesAllocator) -> Self { - Self(BytesInner::from_foreign(ptr, length, owner)) - } - - /// Returns a `Some` mutable reference of [`Vec`] iff this was initialized - /// from a [`Vec`] and `None` otherwise. 
- #[inline] - pub(crate) fn get_vec(&mut self) -> Option<&mut Vec> { - self.0.get_vec() - } -} - -impl Deref for Bytes { - type Target = [T]; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl From> for Bytes { - #[inline] - fn from(data: Vec) -> Self { - let inner: BytesInner = data.into(); - Bytes(inner) - } -} - -impl From> for Bytes { - #[inline] - fn from(value: BytesInner) -> Self { - Self(value) - } -} - -#[cfg(feature = "arrow_rs")] -pub(crate) fn to_buffer( - value: std::sync::Arc>, -) -> arrow_buffer::Buffer { - // This should never panic as ForeignVec pointer must be non-null - let ptr = std::ptr::NonNull::new(value.as_ptr() as _).unwrap(); - let len = value.len() * std::mem::size_of::(); - // SAFETY: allocation is guaranteed to be valid for `len` bytes - unsafe { arrow_buffer::Buffer::from_custom_allocation(ptr, len, value) } -} - -#[cfg(feature = "arrow_rs")] -pub(crate) fn to_bytes(value: arrow_buffer::Buffer) -> Bytes { - let ptr = value.as_ptr(); - let align = ptr.align_offset(std::mem::align_of::()); - assert_eq!(align, 0, "not aligned"); - let len = value.len() / std::mem::size_of::(); - - // Valid as `NativeType: Pod` and checked alignment above - let ptr = value.as_ptr() as *const T; - - let owner = crate::buffer::BytesAllocator::Arrow(value); - - // SAFETY: slice is valid for len elements of T - unsafe { Bytes::from_foreign(ptr, len, owner) } -} - pub use immutable::Buffer; pub(super) use iterator::IntoIter; diff --git a/crates/polars-arrow/src/ffi/array.rs b/crates/polars-arrow/src/ffi/array.rs index 55090f1c760a..60a102f56e94 100644 --- a/crates/polars-arrow/src/ffi/array.rs +++ b/crates/polars-arrow/src/ffi/array.rs @@ -7,9 +7,10 @@ use super::ArrowArray; use crate::array::*; use crate::bitmap::utils::bytes_for; use crate::bitmap::Bitmap; -use crate::buffer::{Buffer, Bytes, BytesAllocator}; +use crate::buffer::Buffer; use crate::datatypes::{ArrowDataType, PhysicalType}; use crate::ffi::schema::get_child; +use crate::storage::SharedStorage; use crate::types::NativeType; use crate::{match_integer_type, with_match_primitive_type_full}; @@ -259,8 +260,8 @@ unsafe fn create_buffer_known_len( return Ok(Buffer::new()); } let ptr: *mut T = get_buffer_ptr(array, dtype, index)?; - let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); - Ok(Buffer::from_bytes(bytes)) + let storage = SharedStorage::from_internal_arrow_array(ptr, len, owner); + Ok(Buffer::from_storage(storage)) } /// returns the buffer `i` of `array` interpreted as a [`Buffer`]. @@ -286,8 +287,8 @@ unsafe fn create_buffer( // We have to check alignment. // This is the zero-copy path. if ptr.align_offset(std::mem::align_of::()) == 0 { - let bytes = Bytes::from_foreign(ptr, len, BytesAllocator::InternalArrowArray(owner)); - Ok(Buffer::from_bytes(bytes).sliced(offset, len - offset)) + let storage = SharedStorage::from_internal_arrow_array(ptr, len, owner); + Ok(Buffer::from_storage(storage).sliced(offset, len - offset)) } // This is the path where alignment isn't correct. 
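        // (Hypothetical example: an imported byte buffer reinterpreted as u64
        // whose pointer is odd, so `align_offset(8)` is non-zero.)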
// We copy the data to a new vec @@ -321,7 +322,7 @@ unsafe fn create_bitmap( let offset: usize = array.offset.try_into().expect("offset to fit in `usize`"); let bytes_len = bytes_for(offset + len); - let bytes = Bytes::from_foreign(ptr, bytes_len, BytesAllocator::InternalArrowArray(owner)); + let storage = SharedStorage::from_internal_arrow_array(ptr, bytes_len, owner); let null_count = if is_validity { Some(array.null_count()) @@ -329,10 +330,7 @@ unsafe fn create_bitmap( None }; Ok(Bitmap::from_inner_unchecked( - Arc::new(bytes), - offset, - len, - null_count, + storage, offset, len, null_count, )) } diff --git a/crates/polars-arrow/src/lib.rs b/crates/polars-arrow/src/lib.rs index 15af97483a41..8c9b5c0d1af5 100644 --- a/crates/polars-arrow/src/lib.rs +++ b/crates/polars-arrow/src/lib.rs @@ -26,6 +26,7 @@ pub mod record_batch; pub mod offset; pub mod scalar; +pub mod storage; pub mod trusted_len; pub mod types; diff --git a/crates/polars-arrow/src/storage.rs b/crates/polars-arrow/src/storage.rs new file mode 100644 index 000000000000..864e4dc29d38 --- /dev/null +++ b/crates/polars-arrow/src/storage.rs @@ -0,0 +1,215 @@ +use std::mem::ManuallyDrop; +use std::ops::Deref; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicU64, Ordering}; + +use crate::ffi::InternalArrowArray; + +enum BackingStorage { + Vec { + capacity: usize, + }, + InternalArrowArray(InternalArrowArray), + #[cfg(feature = "arrow_rs")] + ArrowBuffer(arrow_buffer::Buffer), +} + +struct SharedStorageInner { + ref_count: AtomicU64, + ptr: *mut T, + length: usize, + backing: Option, +} + +impl Drop for SharedStorageInner { + fn drop(&mut self) { + match self.backing.take() { + Some(BackingStorage::InternalArrowArray(a)) => drop(a), + #[cfg(feature = "arrow_rs")] + Some(BackingStorage::ArrowBuffer(b)) => drop(b), + Some(BackingStorage::Vec { capacity }) => unsafe { + drop(Vec::from_raw_parts(self.ptr, self.length, capacity)) + }, + None => {}, + } + } +} + +pub struct SharedStorage { + inner: NonNull>, +} + +unsafe impl Send for SharedStorage {} +unsafe impl Sync for SharedStorage {} + +impl SharedStorage { + pub fn from_static(slice: &'static [T]) -> Self { + let length = slice.len(); + let ptr = slice.as_ptr().cast_mut(); + let inner = SharedStorageInner { + ref_count: AtomicU64::new(2), // Never used, but 2 so it won't pass exclusivity tests. 
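+            // With `backing: None`, Clone and Drop never touch this counter;
+            // the 2 only keeps `is_exclusive()` false so a static slice is
+            // never handed out as mutable.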
+ ptr, + length, + backing: None, + }; + Self { + inner: NonNull::new(Box::into_raw(Box::new(inner))).unwrap(), + } + } + + pub fn from_vec(mut v: Vec) -> Self { + let length = v.len(); + let capacity = v.capacity(); + let ptr = v.as_mut_ptr(); + core::mem::forget(v); + let inner = SharedStorageInner { + ref_count: AtomicU64::new(1), + ptr, + length, + backing: Some(BackingStorage::Vec { capacity }), + }; + Self { + inner: NonNull::new(Box::into_raw(Box::new(inner))).unwrap(), + } + } + + pub fn from_internal_arrow_array(ptr: *const T, len: usize, arr: InternalArrowArray) -> Self { + let inner = SharedStorageInner { + ref_count: AtomicU64::new(1), + ptr: ptr.cast_mut(), + length: len, + backing: Some(BackingStorage::InternalArrowArray(arr)), + }; + Self { + inner: NonNull::new(Box::into_raw(Box::new(inner))).unwrap(), + } + } +} + +#[cfg(feature = "arrow_rs")] +impl SharedStorage { + pub fn from_arrow_buffer(buffer: arrow_buffer::Buffer) -> Self { + let ptr = buffer.as_ptr(); + let align_offset = ptr.align_offset(std::mem::align_of::()); + assert_eq!(align_offset, 0, "arrow_buffer::Buffer misaligned"); + let length = buffer.len() / std::mem::size_of::(); + + let inner = SharedStorageInner { + ref_count: AtomicU64::new(1), + ptr: ptr as *mut T, + length, + backing: Some(BackingStorage::ArrowBuffer(buffer)), + }; + Self { + inner: NonNull::new(Box::into_raw(Box::new(inner))).unwrap(), + } + } + + pub fn into_arrow_buffer(self) -> arrow_buffer::Buffer { + let ptr = NonNull::new(self.as_ptr() as *mut u8).unwrap(); + let len = self.len() * std::mem::size_of::(); + let arc = std::sync::Arc::new(self); + unsafe { arrow_buffer::Buffer::from_custom_allocation(ptr, len, arc) } + } +} + +impl SharedStorage { + #[inline(always)] + pub fn len(&self) -> usize { + self.inner().length + } + + #[inline(always)] + pub fn as_ptr(&self) -> *const T { + self.inner().ptr + } + + #[inline(always)] + pub fn is_exclusive(&mut self) -> bool { + // Ordering semantics copied from Arc. + self.inner().ref_count.load(Ordering::Acquire) == 1 + } + + /// Gets the reference count of this storage. + /// + /// Because this function takes a shared reference this should not be used + /// in cases where we are checking if the refcount is one for safety, + /// someone else could increment it in the meantime. + #[inline(always)] + pub fn refcount(&self) -> u64 { + // Ordering semantics copied from Arc. + self.inner().ref_count.load(Ordering::Acquire) + } + + pub fn try_as_mut_slice(&mut self) -> Option<&mut [T]> { + self.is_exclusive().then(|| { + let inner = self.inner(); + unsafe { core::slice::from_raw_parts_mut(inner.ptr, inner.length) } + }) + } + + pub fn try_into_vec(mut self) -> Result, Self> { + let Some(BackingStorage::Vec { capacity }) = self.inner().backing else { + return Err(self); + }; + if self.is_exclusive() { + let slf = ManuallyDrop::new(self); + let inner = slf.inner(); + Ok(unsafe { Vec::from_raw_parts(inner.ptr, inner.length, capacity) }) + } else { + Err(self) + } + } + + #[inline(always)] + fn inner(&self) -> &SharedStorageInner { + unsafe { &*self.inner.as_ptr() } + } + + /// # Safety + /// May only be called once. 
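+    // Kept out of line and #[cold] so the common, non-final drop path stays
+    // small and inlinable, mirroring std::sync::Arc's drop protocol.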
+ #[cold] + unsafe fn drop_slow(&mut self) { + unsafe { drop(Box::from_raw(self.inner.as_ptr())) } + } +} + +impl Deref for SharedStorage { + type Target = [T]; + + #[inline] + fn deref(&self) -> &Self::Target { + unsafe { + let inner = self.inner(); + core::slice::from_raw_parts(inner.ptr, inner.length) + } + } +} + +impl Clone for SharedStorage { + fn clone(&self) -> Self { + let inner = self.inner(); + if inner.backing.is_some() { + // Ordering semantics copied from Arc. + inner.ref_count.fetch_add(1, Ordering::Relaxed); + } + Self { inner: self.inner } + } +} + +impl Drop for SharedStorage { + fn drop(&mut self) { + let inner = self.inner(); + if inner.backing.is_none() { + return; + } + + // Ordering semantics copied from Arc. + if inner.ref_count.fetch_sub(1, Ordering::Release) == 1 { + std::sync::atomic::fence(Ordering::Acquire); + unsafe { + self.drop_slow(); + } + } + } +} diff --git a/crates/polars-core/src/chunked_array/object/extension/drop.rs b/crates/polars-core/src/chunked_array/object/extension/drop.rs index 3b3e16deff2e..075e9e99dc61 100644 --- a/crates/polars-core/src/chunked_array/object/extension/drop.rs +++ b/crates/polars-core/src/chunked_array/object/extension/drop.rs @@ -41,7 +41,7 @@ pub(crate) unsafe fn drop_object_array(values: &dyn Array) { // If the buf is not shared with anyone but us we can deallocate. let buf = arr.values(); - if buf.shared_count_strong() == 1 && !buf.is_empty() { + if buf.storage_refcount() == 1 && !buf.is_empty() { PolarsExtension::new(arr.clone()); }; } diff --git a/crates/polars-utils/src/foreign_vec.rs b/crates/polars-utils/src/foreign_vec.rs deleted file mode 100644 index f763246ec029..000000000000 --- a/crates/polars-utils/src/foreign_vec.rs +++ /dev/null @@ -1,100 +0,0 @@ -/// This is pulled out of https://github.com/DataEngineeringLabs/foreign_vec -use std::mem::ManuallyDrop; -use std::ops::DerefMut; -use std::vec::Vec; - -/// Mode of deallocating memory regions -enum Allocation { - /// Native allocation - Native, - // A foreign allocator and its ref count - Foreign(D), -} - -/// A continuous memory region that may be allocated externally. -/// -/// In the most common case, this is created from [`Vec`]. -/// However, this region may also be allocated by a foreign allocator `D` -/// and behave as `&[T]`. -pub struct ForeignVec { - /// An implementation using an `enum` of a `Vec` or a foreign pointer is not used - /// because `deref` is at least 50% more expensive than the deref of a `Vec`. - data: ManuallyDrop>, - /// the region was allocated - allocation: Allocation, -} - -impl ForeignVec { - /// Takes ownership of an allocated memory region. - /// # Panics - /// This function panics if and only if pointer is not null - /// # Safety - /// This function is safe if and only if `ptr` is valid for `length` - /// # Implementation - /// This function leaks if and only if `owner` does not deallocate - /// the region `[ptr, ptr+length[` when dropped. - #[inline] - pub unsafe fn from_foreign(ptr: *const T, length: usize, owner: D) -> Self { - assert!(!ptr.is_null()); - // This line is technically outside the assumptions of `Vec::from_raw_parts`, since - // `ptr` was not allocated by `Vec`. However, one of the invariants of this struct - // is that we do never expose this region as a `Vec`; we only use `Vec` on it to provide - // immutable access to the region (via `Vec::deref` to `&[T]`). 
- let data = Vec::from_raw_parts(ptr as *mut T, length, length); - let data = ManuallyDrop::new(data); - - Self { - data, - allocation: Allocation::Foreign(owner), - } - } - - /// Returns a `Some` mutable reference of [`Vec`] iff this was initialized - /// from a [`Vec`] and `None` otherwise. - pub fn get_vec(&mut self) -> Option<&mut Vec> { - match &self.allocation { - Allocation::Foreign(_) => None, - Allocation::Native => Some(self.data.deref_mut()), - } - } -} - -impl Drop for ForeignVec { - #[inline] - fn drop(&mut self) { - match self.allocation { - Allocation::Foreign(_) => { - // the foreign is dropped via its `Drop` - }, - Allocation::Native => { - let data = core::mem::take(&mut self.data); - let _ = ManuallyDrop::into_inner(data); - }, - } - } -} - -impl core::ops::Deref for ForeignVec { - type Target = [T]; - - #[inline] - fn deref(&self) -> &[T] { - &self.data - } -} - -impl core::fmt::Debug for ForeignVec { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - core::fmt::Debug::fmt(&**self, f) - } -} - -impl From> for ForeignVec { - #[inline] - fn from(data: Vec) -> Self { - Self { - data: ManuallyDrop::new(data), - allocation: Allocation::Native, - } - } -} diff --git a/crates/polars-utils/src/lib.rs b/crates/polars-utils/src/lib.rs index 68e331973800..eacd517d1254 100644 --- a/crates/polars-utils/src/lib.rs +++ b/crates/polars-utils/src/lib.rs @@ -14,7 +14,6 @@ pub mod contention_pool; pub mod cpuid; mod error; pub mod floor_divmod; -pub mod foreign_vec; pub mod functions; pub mod hashing; pub mod idx_vec; diff --git a/crates/polars/tests/it/arrow/buffer/immutable.rs b/crates/polars/tests/it/arrow/buffer/immutable.rs index a4835422c56e..9065b52fba35 100644 --- a/crates/polars/tests/it/arrow/buffer/immutable.rs +++ b/crates/polars/tests/it/arrow/buffer/immutable.rs @@ -99,7 +99,7 @@ fn from_arrow_vec() { } #[test] -#[should_panic(expected = "not aligned")] +#[should_panic(expected = "arrow_buffer::Buffer misaligned")] fn from_arrow_misaligned() { let buffer = arrow_buffer::Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]).slice(1); let _ = Buffer::::from(buffer); From 1ee85a38d7c9f0d854224e388783c0047212a06a Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Wed, 11 Sep 2024 16:45:53 +1000 Subject: [PATCH 19/28] refactor(rust): Push down max row group height calc to file metadata (#18674) --- .../src/parquet/metadata/file_metadata.rs | 11 +++- .../nodes/parquet_source/metadata_fetch.rs | 56 ++++++------------- .../parquet_source/row_group_data_fetch.rs | 6 +- 3 files changed, 28 insertions(+), 45 deletions(-) diff --git a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs index 47c9f160781d..2705c2a7b70d 100644 --- a/crates/polars-parquet/src/parquet/metadata/file_metadata.rs +++ b/crates/polars-parquet/src/parquet/metadata/file_metadata.rs @@ -16,6 +16,8 @@ pub struct FileMetadata { pub version: i32, /// number of rows in the file. pub num_rows: usize, + /// Max row group height, useful for sharing column materializations. + pub max_row_group_height: usize, /// String message for application that wrote this file. 
/// /// This should have the following format: @@ -67,10 +69,16 @@ impl FileMetadata { ) -> Result { let schema_descr = SchemaDescriptor::try_from_thrift(&metadata.schema)?; + let mut max_row_group_height = 0; + let row_groups = metadata .row_groups .into_iter() - .map(|rg| RowGroupMetadata::try_from_thrift(&schema_descr, rg)) + .map(|rg| { + let md = RowGroupMetadata::try_from_thrift(&schema_descr, rg)?; + max_row_group_height = max_row_group_height.max(md.num_rows()); + Ok(md) + }) .collect::>()?; let column_orders = metadata @@ -80,6 +88,7 @@ impl FileMetadata { Ok(FileMetadata { version: metadata.version, num_rows: metadata.num_rows.try_into()?, + max_row_group_height, created_by: metadata.created_by, row_groups, key_value_metadata: metadata.key_value_metadata, diff --git a/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs index 5f3281145083..b5a2453cfe1c 100644 --- a/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs +++ b/crates/polars-stream/src/nodes/parquet_source/metadata_fetch.rs @@ -28,7 +28,6 @@ impl ParquetSourceNode { usize, Arc, FileMetadata, - usize, )>, task_handles_ext::AbortOnDropHandle>, ) { @@ -40,8 +39,6 @@ impl ParquetSourceNode { || self.file_options.with_columns.as_deref() == Some(&[]) ); let projected_arrow_fields = self.projected_arrow_fields.clone(); - let needs_max_row_group_height_calc = - self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = tokio::sync::oneshot::channel(); @@ -139,18 +136,7 @@ impl ParquetSourceNode { &metadata, )?; - let file_max_row_group_height = if needs_max_row_group_height_calc { - metadata - .row_groups - .iter() - .map(|x| x.num_rows()) - .max() - .unwrap_or(0) - } else { - 0 - }; - - PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) + PolarsResult::Ok((path_index, byte_source, metadata)) }); async_executor::AbortOnDropHandle::new(handle) @@ -213,19 +199,18 @@ impl ParquetSourceNode { break; }; - let (path_index, byte_source, metadata, file_max_row_group_height) = v - .map_err(|err| { - err.wrap_msg(|msg| { - format!( - "error at path (index: {}, path: {:?}): {}", - current_path_index, - scan_sources - .get(current_path_index) - .map(|x| PlSmallStr::from_str(x.to_include_path_name())), - msg - ) - }) - })?; + let (path_index, byte_source, metadata) = v.map_err(|err| { + err.wrap_msg(|msg| { + format!( + "error at path (index: {}, path: {:?}): {}", + current_path_index, + scan_sources + .get(current_path_index) + .map(|x| PlSmallStr::from_str(x.to_include_path_name())), + msg + ) + }) + })?; assert_eq!(path_index, current_path_index); @@ -254,13 +239,7 @@ impl ParquetSourceNode { }; if metadata_tx - .send(( - path_index, - current_row_offset, - byte_source, - metadata, - file_max_row_group_height, - )) + .send((path_index, current_row_offset, byte_source, metadata)) .await .is_err() { @@ -304,7 +283,7 @@ impl ParquetSourceNode { while let Some(v) = metadata_stream.next().await { let v = v?; - let (_, _, metadata, _) = &v; + let (_, _, metadata) = &v; cum_rows += metadata.num_rows; processed_metadata_rev.push(v); @@ -372,9 +351,7 @@ impl ParquetSourceNode { let metadata_iter = processed_metadata_rev.into_iter().rev(); let current_row_offset_ref = &mut 0usize; - for (current_path_index, byte_source, metadata, file_max_row_group_height) in - metadata_iter - { + for (current_path_index, byte_source, metadata) in metadata_iter { 
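+                // Replay the reverse-collected per-file metadata in forward
+                // order, rebuilding cumulative row offsets as we go.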
let current_row_offset = *current_row_offset_ref; *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); @@ -393,7 +370,6 @@ impl ParquetSourceNode { current_row_offset, byte_source, metadata, - file_max_row_group_height, )) .await .is_err() diff --git a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs index 773a5a9e3625..376562c92fb2 100644 --- a/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs +++ b/crates/polars-stream/src/nodes/parquet_source/row_group_data_fetch.rs @@ -38,7 +38,6 @@ pub(super) struct RowGroupDataFetcher { usize, Arc, FileMetadata, - usize, )>, pub(super) use_statistics: bool, pub(super) verbose: bool, @@ -62,15 +61,14 @@ impl RowGroupDataFetcher { } pub(super) async fn init_next_file_state(&mut self) -> bool { - let Ok((path_index, row_offset, byte_source, metadata, file_max_row_group_height)) = - self.metadata_rx.recv().await + let Ok((path_index, row_offset, byte_source, metadata)) = self.metadata_rx.recv().await else { return false; }; self.current_path_index = path_index; self.current_byte_source = byte_source; - self.current_max_row_group_height = file_max_row_group_height; + self.current_max_row_group_height = metadata.max_row_group_height; // The metadata task also sends a row offset to start counting from as it may skip files // during slice pushdown. self.current_row_offset = row_offset; From 12cce97026fde047df3b9a3f92f229f7dacdb42c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 11 Sep 2024 08:47:50 +0200 Subject: [PATCH 20/28] fix: Scalar checks (#18627) --- .../src/expressions/aggregation.rs | 8 + crates/polars-expr/src/expressions/alias.rs | 4 + crates/polars-expr/src/expressions/apply.rs | 18 +- crates/polars-expr/src/expressions/binary.rs | 7 + crates/polars-expr/src/expressions/cast.rs | 4 + crates/polars-expr/src/expressions/column.rs | 3 + crates/polars-expr/src/expressions/count.rs | 4 + crates/polars-expr/src/expressions/filter.rs | 4 + crates/polars-expr/src/expressions/gather.rs | 4 + crates/polars-expr/src/expressions/literal.rs | 4 + crates/polars-expr/src/expressions/mod.rs | 1 + crates/polars-expr/src/expressions/rolling.rs | 4 + crates/polars-expr/src/expressions/slice.rs | 4 + crates/polars-expr/src/expressions/sort.rs | 4 + crates/polars-expr/src/expressions/sortby.rs | 4 + crates/polars-expr/src/expressions/ternary.rs | 7 + crates/polars-expr/src/expressions/window.rs | 8 + crates/polars-expr/src/planner.rs | 8 + .../src/executors/projection.rs | 4 +- .../src/executors/projection_utils.rs | 34 ++- .../polars-mem-engine/src/executors/stack.rs | 97 ++++--- crates/polars-plan/src/plans/aexpr/mod.rs | 239 +----------------- crates/polars-plan/src/plans/aexpr/scalar.rs | 27 ++ .../polars-plan/src/plans/aexpr/traverse.rs | 238 +++++++++++++++++ crates/polars-plan/src/plans/expr_ir.rs | 4 + crates/polars-plan/src/plans/lit.rs | 4 + .../polars/_utils/construction/dataframe.py | 13 +- py-polars/polars/expr/expr.py | 3 +- py-polars/tests/unit/dataframe/test_df.py | 2 +- py-polars/tests/unit/functions/test_lit.py | 4 +- .../tests/unit/operations/test_window.py | 7 + py-polars/tests/unit/test_scalar.py | 14 + 32 files changed, 496 insertions(+), 294 deletions(-) create mode 100644 crates/polars-plan/src/plans/aexpr/scalar.rs create mode 100644 crates/polars-plan/src/plans/aexpr/traverse.rs create mode 100644 py-polars/tests/unit/test_scalar.py diff --git a/crates/polars-expr/src/expressions/aggregation.rs 
b/crates/polars-expr/src/expressions/aggregation.rs index 5c64ef144cc4..297c77b19e00 100644 --- a/crates/polars-expr/src/expressions/aggregation.rs +++ b/crates/polars-expr/src/expressions/aggregation.rs @@ -451,6 +451,10 @@ impl PhysicalExpr for AggregationExpr { } } + fn is_scalar(&self) -> bool { + true + } + fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { Some(self) } @@ -742,6 +746,10 @@ impl PhysicalExpr for AggQuantileExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { self.input.to_field(input_schema) } + + fn is_scalar(&self) -> bool { + true + } } /// Simple wrapper to parallelize functions that can be divided over threads aggregated and diff --git a/crates/polars-expr/src/expressions/alias.rs b/crates/polars-expr/src/expressions/alias.rs index a6ea8953288c..6b38d8dc8270 100644 --- a/crates/polars-expr/src/expressions/alias.rs +++ b/crates/polars-expr/src/expressions/alias.rs @@ -59,6 +59,10 @@ impl PhysicalExpr for AliasExpr { )) } + fn is_scalar(&self) -> bool { + self.physical_expr.is_scalar() + } + fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { Some(self) } diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index 2b2bbc1e57d2..a5ea16d0f22f 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -18,7 +18,8 @@ pub struct ApplyExpr { function: SpecialEq>, expr: Expr, collect_groups: ApplyOptions, - returns_scalar: bool, + function_returns_scalar: bool, + function_operates_on_scalar: bool, allow_rename: bool, pass_name_to_apply: bool, input_schema: Option, @@ -29,6 +30,7 @@ pub struct ApplyExpr { } impl ApplyExpr { + #[allow(clippy::too_many_arguments)] pub(crate) fn new( inputs: Vec>, function: SpecialEq>, @@ -37,6 +39,7 @@ impl ApplyExpr { allow_threading: bool, input_schema: Option, output_dtype: Option, + returns_scalar: bool, ) -> Self { #[cfg(debug_assertions)] if matches!(options.collect_groups, ApplyOptions::ElementWise) @@ -50,7 +53,8 @@ impl ApplyExpr { function, expr, collect_groups: options.collect_groups, - returns_scalar: options.flags.contains(FunctionFlags::RETURNS_SCALAR), + function_returns_scalar: options.flags.contains(FunctionFlags::RETURNS_SCALAR), + function_operates_on_scalar: returns_scalar, allow_rename: options.flags.contains(FunctionFlags::ALLOW_RENAME), pass_name_to_apply: options.flags.contains(FunctionFlags::PASS_NAME_TO_APPLY), input_schema, @@ -72,7 +76,8 @@ impl ApplyExpr { function, expr, collect_groups, - returns_scalar: false, + function_returns_scalar: false, + function_operates_on_scalar: false, allow_rename: false, pass_name_to_apply: false, input_schema: None, @@ -104,7 +109,7 @@ impl ApplyExpr { ca: ListChunked, ) -> PolarsResult> { let all_unit_len = all_unit_length(&ca); - if all_unit_len && self.returns_scalar { + if all_unit_len && self.function_returns_scalar { ac.with_agg_state(AggState::AggregatedScalar( ca.explode().unwrap().into_series(), )); @@ -253,7 +258,7 @@ impl ApplyExpr { let mut ac = acs.swap_remove(0); ac.with_update_groups(UpdateGroups::No); - let agg_state = if self.returns_scalar { + let agg_state = if self.function_returns_scalar { AggState::AggregatedScalar(Series::new_empty(field.name().clone(), &field.dtype)) } else { match self.collect_groups { @@ -426,6 +431,9 @@ impl PhysicalExpr for ApplyExpr { None } } + fn is_scalar(&self) -> bool { + self.function_returns_scalar || self.function_operates_on_scalar + } } fn 
apply_multiple_elementwise<'a>( diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index dd7697e2f054..c7e89132bc2e 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -15,6 +15,7 @@ pub struct BinaryExpr { expr: Expr, has_literal: bool, allow_threading: bool, + is_scalar: bool, } impl BinaryExpr { @@ -25,6 +26,7 @@ impl BinaryExpr { expr: Expr, has_literal: bool, allow_threading: bool, + is_scalar: bool, ) -> Self { Self { left, @@ -33,6 +35,7 @@ impl BinaryExpr { expr, has_literal, allow_threading, + is_scalar, } } } @@ -254,6 +257,10 @@ impl PhysicalExpr for BinaryExpr { self.expr.to_field(input_schema, Context::Default) } + fn is_scalar(&self) -> bool { + self.is_scalar + } + fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { Some(self) } diff --git a/crates/polars-expr/src/expressions/cast.rs b/crates/polars-expr/src/expressions/cast.rs index dcc3d4dddbca..ebfd50311918 100644 --- a/crates/polars-expr/src/expressions/cast.rs +++ b/crates/polars-expr/src/expressions/cast.rs @@ -76,6 +76,10 @@ impl PhysicalExpr for CastExpr { }) } + fn is_scalar(&self) -> bool { + self.input.is_scalar() + } + fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { Some(self) } diff --git a/crates/polars-expr/src/expressions/column.rs b/crates/polars-expr/src/expressions/column.rs index 603353e4815b..74a20dcdb0ba 100644 --- a/crates/polars-expr/src/expressions/column.rs +++ b/crates/polars-expr/src/expressions/column.rs @@ -190,6 +190,9 @@ impl PhysicalExpr for ColumnExpr { ) }) } + fn is_scalar(&self) -> bool { + false + } } impl PartitionedAggregation for ColumnExpr { diff --git a/crates/polars-expr/src/expressions/count.rs b/crates/polars-expr/src/expressions/count.rs index 2d8fbeb6a2d2..5e8b4c75e376 100644 --- a/crates/polars-expr/src/expressions/count.rs +++ b/crates/polars-expr/src/expressions/count.rs @@ -46,6 +46,10 @@ impl PhysicalExpr for CountExpr { fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { Some(self) } + + fn is_scalar(&self) -> bool { + true + } } impl PartitionedAggregation for CountExpr { diff --git a/crates/polars-expr/src/expressions/filter.rs b/crates/polars-expr/src/expressions/filter.rs index 4e02b38ae4b7..b11d0dda6129 100644 --- a/crates/polars-expr/src/expressions/filter.rs +++ b/crates/polars-expr/src/expressions/filter.rs @@ -148,4 +148,8 @@ impl PhysicalExpr for FilterExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { self.input.to_field(input_schema) } + + fn is_scalar(&self) -> bool { + false + } } diff --git a/crates/polars-expr/src/expressions/gather.rs b/crates/polars-expr/src/expressions/gather.rs index c82bedee986b..5c9fc86a9c27 100644 --- a/crates/polars-expr/src/expressions/gather.rs +++ b/crates/polars-expr/src/expressions/gather.rs @@ -92,6 +92,10 @@ impl PhysicalExpr for GatherExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { self.phys_expr.to_field(input_schema) } + + fn is_scalar(&self) -> bool { + self.returns_scalar + } } impl GatherExpr { diff --git a/crates/polars-expr/src/expressions/literal.rs b/crates/polars-expr/src/expressions/literal.rs index 6d406c5b297a..ad2e73cd8f70 100644 --- a/crates/polars-expr/src/expressions/literal.rs +++ b/crates/polars-expr/src/expressions/literal.rs @@ -127,6 +127,10 @@ impl PhysicalExpr for LiteralExpr { fn is_literal(&self) -> bool { true } + + fn is_scalar(&self) -> bool { + self.0.is_scalar() + } } 
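A note on what counts as scalar for a literal here: unit literals are scalar, while series and range literals carry their own length (see `LiteralValue::is_scalar` later in this patch). A minimal Python-side sketch of the distinction, not taken from this diff:

```python
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})

# A unit literal is scalar, so it broadcasts to the frame height.
df.with_columns(zero=pl.lit(0))

# A series literal is not scalar: its length must match the height,
# or it must be reduced explicitly, e.g. with `.first()`.
df.with_columns(pl.lit(pl.Series("s", [10, 20, 30])))
```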
impl PartitionedAggregation for LiteralExpr { diff --git a/crates/polars-expr/src/expressions/mod.rs b/crates/polars-expr/src/expressions/mod.rs index b66920de7ab9..ec17842d719c 100644 --- a/crates/polars-expr/src/expressions/mod.rs +++ b/crates/polars-expr/src/expressions/mod.rs @@ -587,6 +587,7 @@ pub trait PhysicalExpr: Send + Sync { fn is_literal(&self) -> bool { false } + fn is_scalar(&self) -> bool; } impl Display for &dyn PhysicalExpr { diff --git a/crates/polars-expr/src/expressions/rolling.rs b/crates/polars-expr/src/expressions/rolling.rs index 601901460c3f..806e3d5b0398 100644 --- a/crates/polars-expr/src/expressions/rolling.rs +++ b/crates/polars-expr/src/expressions/rolling.rs @@ -66,4 +66,8 @@ impl PhysicalExpr for RollingExpr { fn as_expression(&self) -> Option<&Expr> { Some(&self.expr) } + + fn is_scalar(&self) -> bool { + false + } } diff --git a/crates/polars-expr/src/expressions/slice.rs b/crates/polars-expr/src/expressions/slice.rs index 579c8d66635e..d0e187120939 100644 --- a/crates/polars-expr/src/expressions/slice.rs +++ b/crates/polars-expr/src/expressions/slice.rs @@ -266,4 +266,8 @@ impl PhysicalExpr for SliceExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { self.input.to_field(input_schema) } + + fn is_scalar(&self) -> bool { + false + } } diff --git a/crates/polars-expr/src/expressions/sort.rs b/crates/polars-expr/src/expressions/sort.rs index a7f2a94ae344..64a6d458125e 100644 --- a/crates/polars-expr/src/expressions/sort.rs +++ b/crates/polars-expr/src/expressions/sort.rs @@ -108,4 +108,8 @@ impl PhysicalExpr for SortExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { self.physical_expr.to_field(input_schema) } + + fn is_scalar(&self) -> bool { + false + } } diff --git a/crates/polars-expr/src/expressions/sortby.rs b/crates/polars-expr/src/expressions/sortby.rs index 0c2a775657d4..71825c971329 100644 --- a/crates/polars-expr/src/expressions/sortby.rs +++ b/crates/polars-expr/src/expressions/sortby.rs @@ -357,4 +357,8 @@ impl PhysicalExpr for SortByExpr { fn to_field(&self, input_schema: &Schema) -> PolarsResult { self.input.to_field(input_schema) } + + fn is_scalar(&self) -> bool { + false + } } diff --git a/crates/polars-expr/src/expressions/ternary.rs b/crates/polars-expr/src/expressions/ternary.rs index ef12dcea0204..c776e4b951dd 100644 --- a/crates/polars-expr/src/expressions/ternary.rs +++ b/crates/polars-expr/src/expressions/ternary.rs @@ -12,6 +12,7 @@ pub struct TernaryExpr { expr: Expr, // Can be expensive on small data to run literals in parallel. run_par: bool, + returns_scalar: bool, } impl TernaryExpr { @@ -21,6 +22,7 @@ impl TernaryExpr { falsy: Arc, expr: Expr, run_par: bool, + returns_scalar: bool, ) -> Self { Self { predicate, @@ -28,6 +30,7 @@ impl TernaryExpr { falsy, expr, run_par, + returns_scalar, } } } @@ -322,6 +325,10 @@ impl PhysicalExpr for TernaryExpr { fn as_partitioned_aggregator(&self) -> Option<&dyn PartitionedAggregation> { Some(self) } + + fn is_scalar(&self) -> bool { + self.returns_scalar + } } impl PartitionedAggregation for TernaryExpr { diff --git a/crates/polars-expr/src/expressions/window.rs b/crates/polars-expr/src/expressions/window.rs index 2ea353cc7c52..47ea0847507c 100644 --- a/crates/polars-expr/src/expressions/window.rs +++ b/crates/polars-expr/src/expressions/window.rs @@ -519,6 +519,10 @@ impl PhysicalExpr for WindowExpr { match self.determine_map_strategy(ac.agg_state(), sorted_keys, &gb)? 
{ Nothing => { let mut out = ac.flat_naive().into_owned(); + + if ac.is_literal() { + out = out.new_from_index(0, df.height()) + } cache_gb(gb, state, &cache_key); if let Some(name) = &self.out_name { out.rename(name.clone()); @@ -630,6 +634,10 @@ impl PhysicalExpr for WindowExpr { self.function.to_field(input_schema, Context::Default) } + fn is_scalar(&self) -> bool { + false + } + #[allow(clippy::ptr_arg)] fn evaluate_on_groups<'a>( &self, diff --git a/crates/polars-expr/src/planner.rs b/crates/polars-expr/src/planner.rs index 315fc123d158..e578b8da9679 100644 --- a/crates/polars-expr/src/planner.rs +++ b/crates/polars-expr/src/planner.rs @@ -293,6 +293,7 @@ fn create_physical_expr_inner( ))) }, BinaryExpr { left, op, right } => { + let is_scalar = is_scalar_ae(expression, expr_arena); let lhs = create_physical_expr_inner(*left, ctxt, expr_arena, schema, state)?; let rhs = create_physical_expr_inner(*right, ctxt, expr_arena, schema, state)?; Ok(Arc::new(phys_expr::BinaryExpr::new( @@ -302,6 +303,7 @@ fn create_physical_expr_inner( node_to_expr(expression, expr_arena), state.local.has_lit, state.allow_threading, + is_scalar, ))) }, Column(column) => Ok(Arc::new(ColumnExpr::new( @@ -444,6 +446,7 @@ fn create_physical_expr_inner( truthy, falsy, } => { + let is_scalar = is_scalar_ae(expression, expr_arena); let mut lit_count = 0u8; state.reset(); let predicate = @@ -461,6 +464,7 @@ fn create_physical_expr_inner( falsy, node_to_expr(expression, expr_arena), lit_count < 2, + is_scalar, ))) }, AnonymousFunction { @@ -469,6 +473,7 @@ fn create_physical_expr_inner( output_type: _, options, } => { + let is_scalar = is_scalar_ae(expression, expr_arena); let output_dtype = schema.and_then(|schema| { expr_arena .get(expression) @@ -500,6 +505,7 @@ fn create_physical_expr_inner( state.allow_threading, schema.cloned(), output_dtype, + is_scalar, ))) }, Function { @@ -508,6 +514,7 @@ fn create_physical_expr_inner( options, .. } => { + let is_scalar = is_scalar_ae(expression, expr_arena); let output_dtype = schema.and_then(|schema| { expr_arena .get(expression) @@ -538,6 +545,7 @@ fn create_physical_expr_inner( state.allow_threading, schema.cloned(), output_dtype, + is_scalar, ))) }, Slice { diff --git a/crates/polars-mem-engine/src/executors/projection.rs b/crates/polars-mem-engine/src/executors/projection.rs index 43ffdd98c753..fec746e37a35 100644 --- a/crates/polars-mem-engine/src/executors/projection.rs +++ b/crates/polars-mem-engine/src/executors/projection.rs @@ -37,7 +37,7 @@ impl ProjectionExec { self.has_windows, self.options.run_parallel, )?; - check_expand_literals(selected_cols, df.is_empty(), self.options) + check_expand_literals(&df, &self.expr, selected_cols, df.is_empty(), self.options) }); let df = POOL.install(|| iter.collect::>>())?; @@ -53,7 +53,7 @@ impl ProjectionExec { self.has_windows, self.options.run_parallel, )?; - check_expand_literals(selected_cols, df.is_empty(), self.options)? + check_expand_literals(&df, &self.expr, selected_cols, df.is_empty(), self.options)? 
}; // this only runs during testing and check if the runtime type matches the predicted schema diff --git a/crates/polars-mem-engine/src/executors/projection_utils.rs b/crates/polars-mem-engine/src/executors/projection_utils.rs index 979c29321cb9..bd0e189f0b14 100644 --- a/crates/polars-mem-engine/src/executors/projection_utils.rs +++ b/crates/polars-mem-engine/src/executors/projection_utils.rs @@ -1,3 +1,4 @@ +use polars_plan::constants::CSE_REPLACED; use polars_utils::itertools::Itertools; use super::*; @@ -243,6 +244,8 @@ pub(super) fn evaluate_physical_expressions( } pub(super) fn check_expand_literals( + df: &DataFrame, + phys_expr: &[Arc], mut selected_columns: Vec, zero_length: bool, options: ProjectionOptions, @@ -252,6 +255,16 @@ pub(super) fn check_expand_literals( }; let duplicate_check = options.duplicate_check; let should_broadcast = options.should_broadcast; + + // When we have CSE we cannot verify scalars yet. + let verify_scalar = if !df.get_columns().is_empty() { + !df.get_columns()[df.width() - 1] + .name() + .starts_with(CSE_REPLACED) + } else { + true + }; + let mut df_height = 0; let mut has_empty = false; let mut all_equal_len = true; @@ -282,21 +295,28 @@ pub(super) fn check_expand_literals( if !all_equal_len && should_broadcast { selected_columns = selected_columns .into_iter() - .map(|series| { + .zip(phys_expr) + .map(|(series, phys)| { Ok(match series.len() { 0 if df_height == 1 => series, 1 => { if has_empty { - - polars_ensure!(df_height == 1, - ComputeError: "Series length {} doesn't match the DataFrame height of {}", - series.len(), df_height - ); - + polars_ensure!(df_height == 1, + ComputeError: "Series length {} doesn't match the DataFrame height of {}", + series.len(), df_height + ); series.slice(0, 0) } else if df_height == 1 { series } else { + if verify_scalar { + polars_ensure!(phys.is_scalar(), + InvalidOperation: "Series: {}, length {} doesn't match the DataFrame height of {}\n\n\ + If you want this Series to be broadcasted, ensure it is a scalar (for instance by adding '.first()').", + series.name(), series.len(), df_height + ); + + } series.new_from_index(0, df_height) } }, diff --git a/crates/polars-mem-engine/src/executors/stack.rs b/crates/polars-mem-engine/src/executors/stack.rs index 3425c129fef2..440fbdd619ca 100644 --- a/crates/polars-mem-engine/src/executors/stack.rs +++ b/crates/polars-mem-engine/src/executors/stack.rs @@ -1,4 +1,5 @@ use polars_core::utils::accumulate_dataframes_vertical_unchecked; +use polars_plan::constants::CSE_REPLACED; use super::*; @@ -21,28 +22,13 @@ impl StackExec { let schema = &*self.input_schema; // Vertical and horizontal parallelism. - let df = - if self.streamable && df.n_chunks() > 1 && df.height() > 0 && self.options.run_parallel - { - let chunks = df.split_chunks().collect::>(); - let iter = chunks.into_par_iter().map(|mut df| { - let res = evaluate_physical_expressions( - &mut df, - &self.exprs, - state, - self.has_windows, - self.options.run_parallel, - )?; - // We don't have to do a broadcast check as cse is not allowed to hit this. 
- df._add_columns(res, schema)?; - Ok(df) - }); - - let df = POOL.install(|| iter.collect::>>())?; - accumulate_dataframes_vertical_unchecked(df) - } - // Only horizontal parallelism - else { + let df = if self.streamable + && df.n_chunks() > 1 + && df.height() > 0 + && self.options.run_parallel + { + let chunks = df.split_chunks().collect::>(); + let iter = chunks.into_par_iter().map(|mut df| { let res = evaluate_physical_expressions( &mut df, &self.exprs, @@ -50,23 +36,60 @@ impl StackExec { self.has_windows, self.options.run_parallel, )?; - if !self.options.should_broadcast { - debug_assert!( - res.iter() - .all(|column| column.name().starts_with("__POLARS_CSER_0x")), - "non-broadcasting hstack should only be used for CSE columns" - ); - // Safety: this case only appears as a result of - // CSE optimization, and the usage there produces - // new, unique column names. It is immediately - // followed by a projection which pulls out the - // possibly mismatching column lengths. - unsafe { df.get_columns_mut().extend(res) }; + // We don't have to do a broadcast check as cse is not allowed to hit this. + df._add_columns(res, schema)?; + Ok(df) + }); + + let df = POOL.install(|| iter.collect::>>())?; + accumulate_dataframes_vertical_unchecked(df) + } + // Only horizontal parallelism + else { + let res = evaluate_physical_expressions( + &mut df, + &self.exprs, + state, + self.has_windows, + self.options.run_parallel, + )?; + if !self.options.should_broadcast { + debug_assert!( + res.iter() + .all(|column| column.name().starts_with("__POLARS_CSER_0x")), + "non-broadcasting hstack should only be used for CSE columns" + ); + // Safety: this case only appears as a result of + // CSE optimization, and the usage there produces + // new, unique column names. It is immediately + // followed by a projection which pulls out the + // possibly mismatching column lengths. + unsafe { df.get_columns_mut().extend(res) }; + } else { + let height = df.height(); + + // When we have CSE we cannot verify scalars yet. 
+ let verify_scalar = if !df.get_columns().is_empty() { + !df.get_columns()[df.width() - 1] + .name() + .starts_with(CSE_REPLACED) } else { - df._add_columns(res, schema)?; + true + }; + for (i, c) in res.iter().enumerate() { + let len = c.len(); + if verify_scalar && len != height && len == 1 { + polars_ensure!(self.exprs[i].is_scalar(), + InvalidOperation: "Series {}, length {} doesn't match the DataFrame height of {}\n\n\ + If you want this Series to be broadcasted, ensure it is a scalar (for instance by adding '.first()').", + c.name(), len, height + ); + } } - df - }; + df._add_columns(res, schema)?; + } + df + }; state.clear_window_expr_cache(); diff --git a/crates/polars-plan/src/plans/aexpr/mod.rs b/crates/polars-plan/src/plans/aexpr/mod.rs index 70c6335bcd1e..4be23e79df14 100644 --- a/crates/polars-plan/src/plans/aexpr/mod.rs +++ b/crates/polars-plan/src/plans/aexpr/mod.rs @@ -1,6 +1,8 @@ #[cfg(feature = "cse")] mod hash; +mod scalar; mod schema; +mod traverse; mod utils; use std::hash::{Hash, Hasher}; @@ -11,9 +13,11 @@ use polars_core::chunked_array::cast::CastOptions; use polars_core::prelude::*; use polars_core::utils::{get_time_units, try_get_supertype}; use polars_utils::arena::{Arena, Node}; +pub use scalar::is_scalar_ae; #[cfg(feature = "ir_serde")] use serde::{Deserialize, Serialize}; use strum_macros::IntoStaticStr; +pub use traverse::*; pub use utils::*; use crate::constants::LEN; @@ -247,242 +251,7 @@ impl AExpr { .map(|f| f.dtype().clone()) } - /// Push nodes at this level to a pre-allocated stack - pub(crate) fn nodes(&self, container: &mut C) { - use AExpr::*; - - match self { - Column(_) | Literal(_) | Len => {}, - Alias(e, _) => container.push_node(*e), - BinaryExpr { left, op: _, right } => { - // reverse order so that left is popped first - container.push_node(*right); - container.push_node(*left); - }, - Cast { expr, .. } => container.push_node(*expr), - Sort { expr, .. } => container.push_node(*expr), - Gather { expr, idx, .. } => { - container.push_node(*idx); - // latest, so that it is popped first - container.push_node(*expr); - }, - SortBy { expr, by, .. } => { - for node in by { - container.push_node(*node) - } - // latest, so that it is popped first - container.push_node(*expr); - }, - Filter { input, by } => { - container.push_node(*by); - // latest, so that it is popped first - container.push_node(*input); - }, - Agg(agg_e) => match agg_e.get_input() { - NodeInputs::Single(node) => container.push_node(node), - NodeInputs::Many(nodes) => container.extend_from_slice(&nodes), - NodeInputs::Leaf => {}, - }, - Ternary { - truthy, - falsy, - predicate, - } => { - container.push_node(*predicate); - container.push_node(*falsy); - // latest, so that it is popped first - container.push_node(*truthy); - }, - AnonymousFunction { input, .. } | Function { input, .. } => - // we iterate in reverse order, so that the lhs is popped first and will be found - // as the root columns/ input columns by `_suffix` and `_keep_name` etc. 
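At the Python level the `verify_scalar` check surfaces as follows; a sketch mirroring the `test_invalid_broadcast` test added at the end of this patch:

```python
import polars as pl

df = pl.DataFrame({"a": [100, 103], "group": [0, 1]})

# `filter` is not a scalar expression, so this now raises
# InvalidOperationError even though it happens to yield one row:
# df.select(pl.col("group").filter(pl.col("group") == 0), "a")

# Reducing it to a scalar states the broadcast intent explicitly:
out = df.select(pl.col("group").filter(pl.col("group") == 0).first(), "a")
```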
- { - input - .iter() - .rev() - .for_each(|e| container.push_node(e.node())) - }, - Explode(e) => container.push_node(*e), - Window { - function, - partition_by, - order_by, - options: _, - } => { - if let Some((n, _)) = order_by { - container.push_node(*n); - } - for e in partition_by.iter().rev() { - container.push_node(*e); - } - // latest so that it is popped first - container.push_node(*function); - }, - Slice { - input, - offset, - length, - } => { - container.push_node(*length); - container.push_node(*offset); - // latest so that it is popped first - container.push_node(*input); - }, - } - } - - pub(crate) fn replace_inputs(mut self, inputs: &[Node]) -> Self { - use AExpr::*; - let input = match &mut self { - Column(_) | Literal(_) | Len => return self, - Alias(input, _) => input, - Cast { expr, .. } => expr, - Explode(input) => input, - BinaryExpr { left, right, .. } => { - *right = inputs[0]; - *left = inputs[1]; - return self; - }, - Gather { expr, idx, .. } => { - *idx = inputs[0]; - *expr = inputs[1]; - return self; - }, - Sort { expr, .. } => expr, - SortBy { expr, by, .. } => { - *expr = *inputs.last().unwrap(); - by.clear(); - by.extend_from_slice(&inputs[..inputs.len() - 1]); - return self; - }, - Filter { input, by, .. } => { - *by = inputs[0]; - *input = inputs[1]; - return self; - }, - Agg(a) => { - match a { - IRAggExpr::Quantile { expr, quantile, .. } => { - *expr = inputs[0]; - *quantile = inputs[1]; - }, - _ => { - a.set_input(inputs[0]); - }, - } - return self; - }, - Ternary { - truthy, - falsy, - predicate, - } => { - *predicate = inputs[0]; - *falsy = inputs[1]; - *truthy = inputs[2]; - return self; - }, - AnonymousFunction { input, .. } | Function { input, .. } => { - debug_assert_eq!(input.len(), inputs.len()); - - // Assign in reverse order as that was the order in which nodes were extracted. - for (e, node) in input.iter_mut().zip(inputs.iter().rev()) { - e.set_node(*node); - } - return self; - }, - Slice { - input, - offset, - length, - } => { - *length = inputs[0]; - *offset = inputs[1]; - *input = inputs[2]; - return self; - }, - Window { - function, - partition_by, - order_by, - .. - } => { - let offset = order_by.is_some() as usize; - *function = *inputs.last().unwrap(); - partition_by.clear(); - partition_by.extend_from_slice(&inputs[offset..inputs.len() - 1]); - - if let Some((_, options)) = order_by { - *order_by = Some((inputs[0], *options)); - } - - return self; - }, - }; - *input = inputs[0]; - self - } - pub(crate) fn is_leaf(&self) -> bool { matches!(self, AExpr::Column(_) | AExpr::Literal(_) | AExpr::Len) } } - -impl IRAggExpr { - pub fn get_input(&self) -> NodeInputs { - use IRAggExpr::*; - use NodeInputs::*; - match self { - Min { input, .. } => Single(*input), - Max { input, .. } => Single(*input), - Median(input) => Single(*input), - NUnique(input) => Single(*input), - First(input) => Single(*input), - Last(input) => Single(*input), - Mean(input) => Single(*input), - Implode(input) => Single(*input), - Quantile { expr, quantile, .. } => Many(vec![*expr, *quantile]), - Sum(input) => Single(*input), - Count(input, _) => Single(*input), - Std(input, _) => Single(*input), - Var(input, _) => Single(*input), - AggGroups(input) => Single(*input), - } - } - pub fn set_input(&mut self, input: Node) { - use IRAggExpr::*; - let node = match self { - Min { input, .. } => input, - Max { input, .. 
} => input, - Median(input) => input, - NUnique(input) => input, - First(input) => input, - Last(input) => input, - Mean(input) => input, - Implode(input) => input, - Quantile { expr, .. } => expr, - Sum(input) => input, - Count(input, _) => input, - Std(input, _) => input, - Var(input, _) => input, - AggGroups(input) => input, - }; - *node = input; - } -} - -pub enum NodeInputs { - Leaf, - Single(Node), - Many(Vec), -} - -impl NodeInputs { - pub fn first(&self) -> Node { - match self { - NodeInputs::Single(node) => *node, - NodeInputs::Many(nodes) => nodes[0], - NodeInputs::Leaf => panic!(), - } - } -} diff --git a/crates/polars-plan/src/plans/aexpr/scalar.rs b/crates/polars-plan/src/plans/aexpr/scalar.rs new file mode 100644 index 000000000000..f7d681b407d4 --- /dev/null +++ b/crates/polars-plan/src/plans/aexpr/scalar.rs @@ -0,0 +1,27 @@ +use recursive::recursive; + +use super::*; + +#[recursive] +pub fn is_scalar_ae(node: Node, expr_arena: &Arena) -> bool { + match expr_arena.get(node) { + AExpr::Literal(lv) => lv.is_scalar(), + AExpr::Function { options, input, .. } + | AExpr::AnonymousFunction { options, input, .. } => { + if options.is_elementwise() { + input.iter().all(|e| e.is_scalar(expr_arena)) + } else { + options.flags.contains(FunctionFlags::RETURNS_SCALAR) + } + }, + AExpr::BinaryExpr { left, right, .. } => { + is_scalar_ae(*left, expr_arena) && is_scalar_ae(*right, expr_arena) + }, + AExpr::Ternary { truthy, falsy, .. } => { + is_scalar_ae(*truthy, expr_arena) && is_scalar_ae(*falsy, expr_arena) + }, + AExpr::Agg(_) | AExpr::Len => true, + AExpr::Cast { expr, .. } | AExpr::Alias(expr, _) => is_scalar_ae(*expr, expr_arena), + _ => false, + } +} diff --git a/crates/polars-plan/src/plans/aexpr/traverse.rs b/crates/polars-plan/src/plans/aexpr/traverse.rs new file mode 100644 index 000000000000..29999ef6995f --- /dev/null +++ b/crates/polars-plan/src/plans/aexpr/traverse.rs @@ -0,0 +1,238 @@ +use super::*; + +impl AExpr { + /// Push nodes at this level to a pre-allocated stack. + pub(crate) fn nodes(&self, container: &mut C) { + use AExpr::*; + + match self { + Column(_) | Literal(_) | Len => {}, + Alias(e, _) => container.push_node(*e), + BinaryExpr { left, op: _, right } => { + // reverse order so that left is popped first + container.push_node(*right); + container.push_node(*left); + }, + Cast { expr, .. } => container.push_node(*expr), + Sort { expr, .. } => container.push_node(*expr), + Gather { expr, idx, .. } => { + container.push_node(*idx); + // latest, so that it is popped first + container.push_node(*expr); + }, + SortBy { expr, by, .. } => { + for node in by { + container.push_node(*node) + } + // latest, so that it is popped first + container.push_node(*expr); + }, + Filter { input, by } => { + container.push_node(*by); + // latest, so that it is popped first + container.push_node(*input); + }, + Agg(agg_e) => match agg_e.get_input() { + NodeInputs::Single(node) => container.push_node(node), + NodeInputs::Many(nodes) => container.extend_from_slice(&nodes), + NodeInputs::Leaf => {}, + }, + Ternary { + truthy, + falsy, + predicate, + } => { + container.push_node(*predicate); + container.push_node(*falsy); + // latest, so that it is popped first + container.push_node(*truthy); + }, + AnonymousFunction { input, .. } | Function { input, .. } => + // we iterate in reverse order, so that the lhs is popped first and will be found + // as the root columns/ input columns by `_suffix` and `_keep_name` etc. 
+ { + input + .iter() + .rev() + .for_each(|e| container.push_node(e.node())) + }, + Explode(e) => container.push_node(*e), + Window { + function, + partition_by, + order_by, + options: _, + } => { + if let Some((n, _)) = order_by { + container.push_node(*n); + } + for e in partition_by.iter().rev() { + container.push_node(*e); + } + // latest so that it is popped first + container.push_node(*function); + }, + Slice { + input, + offset, + length, + } => { + container.push_node(*length); + container.push_node(*offset); + // latest so that it is popped first + container.push_node(*input); + }, + } + } + + pub(crate) fn replace_inputs(mut self, inputs: &[Node]) -> Self { + use AExpr::*; + let input = match &mut self { + Column(_) | Literal(_) | Len => return self, + Alias(input, _) => input, + Cast { expr, .. } => expr, + Explode(input) => input, + BinaryExpr { left, right, .. } => { + *right = inputs[0]; + *left = inputs[1]; + return self; + }, + Gather { expr, idx, .. } => { + *idx = inputs[0]; + *expr = inputs[1]; + return self; + }, + Sort { expr, .. } => expr, + SortBy { expr, by, .. } => { + *expr = *inputs.last().unwrap(); + by.clear(); + by.extend_from_slice(&inputs[..inputs.len() - 1]); + return self; + }, + Filter { input, by, .. } => { + *by = inputs[0]; + *input = inputs[1]; + return self; + }, + Agg(a) => { + match a { + IRAggExpr::Quantile { expr, quantile, .. } => { + *expr = inputs[0]; + *quantile = inputs[1]; + }, + _ => { + a.set_input(inputs[0]); + }, + } + return self; + }, + Ternary { + truthy, + falsy, + predicate, + } => { + *predicate = inputs[0]; + *falsy = inputs[1]; + *truthy = inputs[2]; + return self; + }, + AnonymousFunction { input, .. } | Function { input, .. } => { + debug_assert_eq!(input.len(), inputs.len()); + + // Assign in reverse order as that was the order in which nodes were extracted. + for (e, node) in input.iter_mut().zip(inputs.iter().rev()) { + e.set_node(*node); + } + return self; + }, + Slice { + input, + offset, + length, + } => { + *length = inputs[0]; + *offset = inputs[1]; + *input = inputs[2]; + return self; + }, + Window { + function, + partition_by, + order_by, + .. + } => { + let offset = order_by.is_some() as usize; + *function = *inputs.last().unwrap(); + partition_by.clear(); + partition_by.extend_from_slice(&inputs[offset..inputs.len() - 1]); + + if let Some((_, options)) = order_by { + *order_by = Some((inputs[0], *options)); + } + + return self; + }, + }; + *input = inputs[0]; + self + } +} + +impl IRAggExpr { + pub fn get_input(&self) -> NodeInputs { + use IRAggExpr::*; + use NodeInputs::*; + match self { + Min { input, .. } => Single(*input), + Max { input, .. } => Single(*input), + Median(input) => Single(*input), + NUnique(input) => Single(*input), + First(input) => Single(*input), + Last(input) => Single(*input), + Mean(input) => Single(*input), + Implode(input) => Single(*input), + Quantile { expr, quantile, .. } => Many(vec![*expr, *quantile]), + Sum(input) => Single(*input), + Count(input, _) => Single(*input), + Std(input, _) => Single(*input), + Var(input, _) => Single(*input), + AggGroups(input) => Single(*input), + } + } + pub fn set_input(&mut self, input: Node) { + use IRAggExpr::*; + let node = match self { + Min { input, .. } => input, + Max { input, .. } => input, + Median(input) => input, + NUnique(input) => input, + First(input) => input, + Last(input) => input, + Mean(input) => input, + Implode(input) => input, + Quantile { expr, .. 
} => expr, + Sum(input) => input, + Count(input, _) => input, + Std(input, _) => input, + Var(input, _) => input, + AggGroups(input) => input, + }; + *node = input; + } +} + +pub enum NodeInputs { + Leaf, + Single(Node), + Many(Vec), +} + +impl NodeInputs { + pub fn first(&self) -> Node { + match self { + NodeInputs::Single(node) => *node, + NodeInputs::Many(nodes) => nodes[0], + NodeInputs::Leaf => panic!(), + } + } +} diff --git a/crates/polars-plan/src/plans/expr_ir.rs b/crates/polars-plan/src/plans/expr_ir.rs index 8512fdc8d8ea..b70bca266ee1 100644 --- a/crates/polars-plan/src/plans/expr_ir.rs +++ b/crates/polars-plan/src/plans/expr_ir.rs @@ -205,6 +205,10 @@ impl ExprIR { alias.hash(state) } } + + pub fn is_scalar(&self, expr_arena: &Arena) -> bool { + is_scalar_ae(self.node, expr_arena) + } } impl AsRef for ExprIR { diff --git a/crates/polars-plan/src/plans/lit.rs b/crates/polars-plan/src/plans/lit.rs index 48f2e8aa7e45..c44fc3fe8147 100644 --- a/crates/polars-plan/src/plans/lit.rs +++ b/crates/polars-plan/src/plans/lit.rs @@ -227,6 +227,10 @@ impl LiteralValue { LiteralValue::UInt32(value) } } + + pub fn is_scalar(&self) -> bool { + !matches!(self, LiteralValue::Series(_) | LiteralValue::Range { .. }) + } } pub trait Literal { diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py index 90b7ef485655..1ab05930f5da 100644 --- a/py-polars/polars/_utils/construction/dataframe.py +++ b/py-polars/polars/_utils/construction/dataframe.py @@ -63,7 +63,7 @@ from polars.polars import PyDataFrame if TYPE_CHECKING: - from polars import DataFrame, Series + from polars import DataFrame, Expr, Series from polars._typing import ( Orientation, PolarsDataType, @@ -1212,15 +1212,22 @@ def arrow_to_pydf( if rechunk: pydf = pydf.rechunk() + def broadcastable_s(s: Series, name: str) -> Expr: + if s.len() == 1: + return F.lit(s).first().alias(name) + return F.lit(s).alias(name) + reset_order = False if len(dictionary_cols) > 0: df = wrap_df(pydf) - df = df.with_columns([F.lit(s).alias(s.name) for s in dictionary_cols.values()]) + df = df.with_columns( + [broadcastable_s(s, s.name) for s in dictionary_cols.values()] + ) reset_order = True if len(struct_cols) > 0: df = wrap_df(pydf) - df = df.with_columns([F.lit(s).alias(s.name) for s in struct_cols.values()]) + df = df.with_columns([broadcastable_s(s, s.name) for s in struct_cols.values()]) reset_order = True if reset_order: diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index e24297aa93ee..d94b257b0f1b 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -1723,7 +1723,7 @@ def mode(self) -> Expr: ... "b": [1, 1, 2, 2], ... } ... 
)
-        >>> df.select(pl.all().mode())  # doctest: +IGNORE_RESULT
+        >>> df.select(pl.all().mode().first())  # doctest: +IGNORE_RESULT
shape: (2, 2)
┌─────┬─────┐
│ a ┆ b │
@@ -1731,7 +1731,6 @@ def mode(self) -> Expr:
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 1 │
- │ 1 ┆ 2 │
└─────┴─────┘
"""
return self._from_pyexpr(self._pyexpr.mode())
diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
index c3472cc49d79..ab27d2647395 100644
--- a/py-polars/tests/unit/dataframe/test_df.py
+++ b/py-polars/tests/unit/dataframe/test_df.py
@@ -1999,7 +1999,7 @@ def test_add_string() -> None:

 def test_df_broadcast() -> None:
     df = pl.DataFrame({"a": [1, 2, 3]}, schema_overrides={"a": pl.UInt8})
-    out = df.with_columns(pl.Series("s", [[1, 2]]))
+    out = df.with_columns(pl.lit(pl.Series("s", [[1, 2]])).first())
     assert out.shape == (3, 2)
     assert out.schema == {"a": pl.UInt8, "s": pl.List(pl.Int64)}
     assert out.rows() == [(1, [1, 2]), (2, [1, 2]), (3, [1, 2])]
diff --git a/py-polars/tests/unit/functions/test_lit.py b/py-polars/tests/unit/functions/test_lit.py
index 1f13ba122825..5677f412d5fb 100644
--- a/py-polars/tests/unit/functions/test_lit.py
+++ b/py-polars/tests/unit/functions/test_lit.py
@@ -27,7 +27,7 @@
 )
 def test_lit_list_input(input: list[Any]) -> None:
     df = pl.DataFrame({"a": [1, 2]})
-    result = df.with_columns(pl.lit(input))
+    result = df.with_columns(pl.lit(input).first())
     expected = pl.DataFrame({"a": [1, 2], "literal": [input, input]})
     assert_frame_equal(result, expected)

@@ -41,7 +41,7 @@
 )
 def test_lit_tuple_input(input: tuple[Any, ...]) -> None:
     df = pl.DataFrame({"a": [1, 2]})
-    result = df.with_columns(pl.lit(input))
+    result = df.with_columns(pl.lit(input).first())
     expected = pl.DataFrame({"a": [1, 2], "literal": [list(input), list(input)]})
     assert_frame_equal(result, expected)

diff --git a/py-polars/tests/unit/operations/test_window.py b/py-polars/tests/unit/operations/test_window.py
index ddf293d216c9..8171fd5f9b03 100644
--- a/py-polars/tests/unit/operations/test_window.py
+++ b/py-polars/tests/unit/operations/test_window.py
@@ -511,3 +511,10 @@ def test_window_17308() -> None:
     assert df.select(pl.col("A").sum(), pl.col("B").sum().over("grp")).to_dict(
         as_series=False
     ) == {"A": [3, 3], "B": [3, 4]}
+
+
+def test_lit_window_broadcast() -> None:
+    # the broadcast should happen in the window function
+    assert pl.DataFrame({"a": [1, 1, 2]}).select(pl.lit(0).over("a").alias("a"))[
+        "a"
+    ].to_list() == [0, 0, 0]
diff --git a/py-polars/tests/unit/test_scalar.py b/py-polars/tests/unit/test_scalar.py
new file mode 100644
index 000000000000..68632e7bfdd4
--- /dev/null
+++ b/py-polars/tests/unit/test_scalar.py
@@ -0,0 +1,14 @@
+import pytest
+
+import polars as pl
+
+
+def test_invalid_broadcast() -> None:
+    df = pl.DataFrame(
+        {
+            "a": [100, 103],
+            "group": [0, 1],
+        }
+    )
+    with pytest.raises(pl.exceptions.InvalidOperationError):
+        df.select(pl.col("group").filter(pl.col("group") == 0), "a")

From 149d6b905874c7966ae8dd6f355c79bff8a2ad72 Mon Sep 17 00:00:00 2001
From: Bela Stoyan
Date: Wed, 11 Sep 2024 08:49:53 +0200
Subject: [PATCH 21/28] fix(python): Use IO[bytes] instead of BytesIO in `DataFrame.write_parquet()` (#18652)

---
 py-polars/polars/dataframe/frame.py     |  2 +-
 py-polars/tests/unit/io/test_parquet.py | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index 83e55cfe64b3..c422f8bffdf8 100644
---
a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -3638,7 +3638,7 @@ def write_ipc_stream( def write_parquet( self, - file: str | Path | BytesIO, + file: str | Path | IO[bytes], *, compression: ParquetCompression = "zstd", compression_level: int | None = None, diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 3da465561bd1..170cc2a015be 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -1891,3 +1891,16 @@ def test_concat_multiple_inmem() -> None: assert_frame_equal(pl.read_parquet([fb, gb]), dfs) assert_frame_equal(pl.read_parquet([fb, gb], use_pyarrow=True), dfs) + + +@pytest.mark.write_disk +def test_write_binary_open_file(tmp_path: Path) -> None: + df = pl.DataFrame({"a": [1, 2, 3]}) + + path = tmp_path / "test.parquet" + + with path.open("wb") as f_write: + df.write_parquet(f_write) + + out = pl.read_parquet(path) + assert_frame_equal(out, df) From 120a5e2d97f0baef2025f0f2c5c0c40a712a2c25 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 11 Sep 2024 10:14:39 +0100 Subject: [PATCH 22/28] feat(python): Add tooltip by default to charts (#18625) --- py-polars/polars/dataframe/plotting.py | 73 +++++++++---------- py-polars/polars/series/plotting.py | 33 ++++++--- .../unit/operations/namespaces/test_plot.py | 34 +++++++++ 3 files changed, 90 insertions(+), 50 deletions(-) diff --git a/py-polars/polars/dataframe/plotting.py b/py-polars/polars/dataframe/plotting.py index ed118e504656..ac787afd3882 100644 --- a/py-polars/polars/dataframe/plotting.py +++ b/py-polars/polars/dataframe/plotting.py @@ -6,15 +6,13 @@ import sys import altair as alt - from altair.typing import ( - ChannelColor, - ChannelOrder, - ChannelSize, - ChannelTooltip, - ChannelX, - ChannelY, - EncodeKwds, - ) + from altair.typing import ChannelColor as Color + from altair.typing import ChannelOrder as Order + from altair.typing import ChannelSize as Size + from altair.typing import ChannelTooltip as Tooltip + from altair.typing import ChannelX as X + from altair.typing import ChannelY as Y + from altair.typing import EncodeKwds from polars import DataFrame @@ -29,12 +27,15 @@ Encodings: TypeAlias = Dict[ str, - Union[ - ChannelX, ChannelY, ChannelColor, ChannelOrder, ChannelSize, ChannelTooltip - ], + Union[X, Y, Color, Order, Size, Tooltip], ] +def _add_tooltip(encodings: Encodings, /, **kwargs: Unpack[EncodeKwds]) -> None: + if "tooltip" not in kwargs: + encodings["tooltip"] = [*encodings.values(), *kwargs.values()] # type: ignore[assignment] + + class DataFramePlot: """DataFrame.plot namespace.""" @@ -45,10 +46,9 @@ def __init__(self, df: DataFrame) -> None: def bar( self, - x: ChannelX | None = None, - y: ChannelY | None = None, - color: ChannelColor | None = None, - tooltip: ChannelTooltip | None = None, + x: X | None = None, + y: Y | None = None, + color: Color | None = None, /, **kwargs: Unpack[EncodeKwds], ) -> alt.Chart: @@ -77,8 +77,6 @@ def bar( Column with y-coordinates of bars. color Column to color bars by. - tooltip - Columns to show values of when hovering over bars with pointer. **kwargs Additional keyword arguments passed to Altair. 
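The resulting behaviour, sketched from the Python side (it mirrors the new tooltip tests later in this patch, and assumes Altair is installed):

```python
import polars as pl

df = pl.DataFrame(
    {"length": [1, 4, 6], "width": [4, 5, 6], "species": ["a", "a", "b"]}
)

# The tooltip now defaults to all encoded channels (length, width, species).
chart = df.plot.line(x="length", y="width", color="species")

# An explicit `tooltip=` is passed through to Altair unchanged.
chart = df.plot.line(x="length", y="width", tooltip=["length", "width"])
```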
@@ -102,17 +100,15 @@ def bar( encodings["y"] = y if color is not None: encodings["color"] = color - if tooltip is not None: - encodings["tooltip"] = tooltip + _add_tooltip(encodings, **kwargs) return self._chart.mark_bar().encode(**encodings, **kwargs).interactive() def line( self, - x: ChannelX | None = None, - y: ChannelY | None = None, - color: ChannelColor | None = None, - order: ChannelOrder | None = None, - tooltip: ChannelTooltip | None = None, + x: X | None = None, + y: Y | None = None, + color: Color | None = None, + order: Order | None = None, /, **kwargs: Unpack[EncodeKwds], ) -> alt.Chart: @@ -142,8 +138,6 @@ def line( Column to color lines by. order Column to use for order of data points in lines. - tooltip - Columns to show values of when hovering over lines with pointer. **kwargs Additional keyword arguments passed to Altair. @@ -168,17 +162,15 @@ def line( encodings["color"] = color if order is not None: encodings["order"] = order - if tooltip is not None: - encodings["tooltip"] = tooltip + _add_tooltip(encodings, **kwargs) return self._chart.mark_line().encode(**encodings, **kwargs).interactive() def point( self, - x: ChannelX | None = None, - y: ChannelY | None = None, - color: ChannelColor | None = None, - size: ChannelSize | None = None, - tooltip: ChannelTooltip | None = None, + x: X | None = None, + y: Y | None = None, + color: Color | None = None, + size: Size | None = None, /, **kwargs: Unpack[EncodeKwds], ) -> alt.Chart: @@ -209,8 +201,6 @@ def point( Column to color points by. size Column which determines points' sizes. - tooltip - Columns to show values of when hovering over points with pointer. **kwargs Additional keyword arguments passed to Altair. @@ -234,8 +224,7 @@ def point( encodings["color"] = color if size is not None: encodings["size"] = size - if tooltip is not None: - encodings["tooltip"] = tooltip + _add_tooltip(encodings, **kwargs) return ( self._chart.mark_point() .encode( @@ -253,4 +242,10 @@ def __getattr__(self, attr: str) -> Callable[..., alt.Chart]: if method is None: msg = "Altair has no method 'mark_{attr}'" raise AttributeError(msg) - return lambda **kwargs: method().encode(**kwargs).interactive() + encodings: Encodings = {} + + def func(**kwargs: EncodeKwds) -> alt.Chart: + _add_tooltip(encodings, **kwargs) + return method().encode(**encodings, **kwargs).interactive() + + return func diff --git a/py-polars/polars/series/plotting.py b/py-polars/polars/series/plotting.py index cb5c6c93a1e1..5430d55c6ff3 100644 --- a/py-polars/polars/series/plotting.py +++ b/py-polars/polars/series/plotting.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Callable +from polars.dataframe.plotting import _add_tooltip from polars.dependencies import altair as alt if TYPE_CHECKING: @@ -9,6 +10,8 @@ from altair.typing import EncodeKwds + from polars.dataframe.plotting import Encodings + if sys.version_info >= (3, 11): from typing import Unpack else: @@ -62,11 +65,13 @@ def hist( if self._series_name == "count()": msg = "Cannot use `plot.hist` when Series name is `'count()'`" raise ValueError(msg) + encodings: Encodings = { + "x": alt.X(f"{self._series_name}:Q", bin=True), + "y": "count()", + } + _add_tooltip(encodings, **kwargs) return ( - alt.Chart(self._df) - .mark_bar() - .encode(x=alt.X(f"{self._series_name}:Q", bin=True), y="count()", **kwargs) # type: ignore[misc] - .interactive() + alt.Chart(self._df).mark_bar().encode(**encodings, **kwargs).interactive() ) def kde( @@ -104,11 +109,13 @@ def kde( if self._series_name == "density": msg = "Cannot use 
`plot.kde` when Series name is `'density'`" raise ValueError(msg) + encodings: Encodings = {"x": self._series_name, "y": "density:Q"} + _add_tooltip(encodings, **kwargs) return ( alt.Chart(self._df) .transform_density(self._series_name, as_=[self._series_name, "density"]) .mark_area() - .encode(x=self._series_name, y="density:Q", **kwargs) # type: ignore[misc] + .encode(**encodings, **kwargs) .interactive() ) @@ -147,10 +154,12 @@ def line( if self._series_name == "index": msg = "Cannot call `plot.line` when Series name is 'index'" raise ValueError(msg) + encodings: Encodings = {"x": "index", "y": self._series_name} + _add_tooltip(encodings, **kwargs) return ( alt.Chart(self._df.with_row_index()) .mark_line() - .encode(x="index", y=self._series_name, **kwargs) # type: ignore[misc] + .encode(**encodings, **kwargs) .interactive() ) @@ -165,8 +174,10 @@ def __getattr__(self, attr: str) -> Callable[..., alt.Chart]: if method is None: msg = "Altair has no method 'mark_{attr}'" raise AttributeError(msg) - return ( - lambda **kwargs: method() - .encode(x="index", y=self._series_name, **kwargs) - .interactive() - ) + encodings: Encodings = {"x": "index", "y": self._series_name} + + def func(**kwargs: EncodeKwds) -> alt.Chart: + _add_tooltip(encodings, **kwargs) + return method().encode(**encodings, **kwargs).interactive() + + return func diff --git a/py-polars/tests/unit/operations/namespaces/test_plot.py b/py-polars/tests/unit/operations/namespaces/test_plot.py index fc2fbc02648a..5a4c1c21a596 100644 --- a/py-polars/tests/unit/operations/namespaces/test_plot.py +++ b/py-polars/tests/unit/operations/namespaces/test_plot.py @@ -17,6 +17,29 @@ def test_dataframe_plot() -> None: df.plot.area(x="length", y="width", color="species").to_json() +def test_dataframe_plot_tooltip() -> None: + df = pl.DataFrame( + { + "length": [1, 4, 6], + "width": [4, 5, 6], + "species": ["setosa", "setosa", "versicolor"], + } + ) + result = df.plot.line(x="length", y="width", color="species").to_dict() + assert result["encoding"]["tooltip"] == [ + {"field": "length", "type": "quantitative"}, + {"field": "width", "type": "quantitative"}, + {"field": "species", "type": "nominal"}, + ] + result = df.plot.line( + x="length", y="width", color="species", tooltip=["length", "width"] + ).to_dict() + assert result["encoding"]["tooltip"] == [ + {"field": "length", "type": "quantitative"}, + {"field": "width", "type": "quantitative"}, + ] + + def test_series_plot() -> None: # dry-run, check nothing errors s = pl.Series("a", [1, 4, 4, 4, 7, 2, 5, 3, 6]) @@ -26,6 +49,17 @@ def test_series_plot() -> None: s.plot.point().to_json() +def test_series_plot_tooltip() -> None: + s = pl.Series("a", [1, 4, 4, 4, 7, 2, 5, 3, 6]) + result = s.plot.line().to_dict() + assert result["encoding"]["tooltip"] == [ + {"field": "index", "type": "quantitative"}, + {"field": "a", "type": "quantitative"}, + ] + result = s.plot.line(tooltip=["a"]).to_dict() + assert result["encoding"]["tooltip"] == [{"field": "a", "type": "quantitative"}] + + def test_empty_dataframe() -> None: pl.DataFrame({"a": [], "b": []}).plot.point(x="a", y="b") From f25ca0c14d984bc820c7910d216003edc57d1a3c Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 11 Sep 2024 11:54:50 +0200 Subject: [PATCH 23/28] Rust Polars 0.43.0 (#18678) --- Cargo.lock | 46 ++++++++++++++++----------------- Cargo.toml | 46 ++++++++++++++++----------------- crates/Makefile | 1 + crates/polars-stream/Cargo.toml | 7 ++--- 4 files changed, 51 insertions(+), 49 deletions(-) diff --git a/Cargo.lock 
b/Cargo.lock index 7aca7dc3acc5..1963e248aba0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2953,7 +2953,7 @@ dependencies = [ [[package]] name = "polars" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "apache-avro", @@ -2983,7 +2983,7 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "arrow-array", @@ -3052,7 +3052,7 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.42.0" +version = "0.43.0" dependencies = [ "bytemuck", "either", @@ -3067,7 +3067,7 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "arrow-array", @@ -3102,7 +3102,7 @@ dependencies = [ [[package]] name = "polars-doc-examples" -version = "0.42.0" +version = "0.43.0" dependencies = [ "aws-config", "aws-sdk-s3", @@ -3116,7 +3116,7 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.42.0" +version = "0.43.0" dependencies = [ "avro-schema", "object_store", @@ -3128,7 +3128,7 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "bitflags", @@ -3147,7 +3147,7 @@ dependencies = [ [[package]] name = "polars-ffi" -version = "0.42.0" +version = "0.43.0" dependencies = [ "polars-arrow", "polars-core", @@ -3155,7 +3155,7 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "async-trait", @@ -3203,7 +3203,7 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "chrono", @@ -3223,7 +3223,7 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "bitflags", @@ -3251,7 +3251,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.42.0" +version = "0.43.0" dependencies = [ "futures", "memmap2", @@ -3272,7 +3272,7 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "aho-corasick", @@ -3307,7 +3307,7 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "async-stream", @@ -3338,7 +3338,7 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.42.0" +version = "0.43.0" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -3363,7 +3363,7 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "bitflags", @@ -3400,7 +3400,7 @@ dependencies = [ [[package]] name = "polars-python" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "arboard", @@ -3434,7 +3434,7 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.42.0" +version = "0.43.0" dependencies = [ "bytemuck", "polars-arrow", @@ -3444,7 +3444,7 @@ dependencies = [ [[package]] name = "polars-schema" -version = "0.42.0" +version = "0.43.0" dependencies = [ "indexmap", "polars-error", @@ -3455,7 +3455,7 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.42.0" +version = "0.43.0" dependencies = [ "hex", "once_cell", @@ -3475,7 +3475,7 @@ dependencies = [ [[package]] name = "polars-stream" -version = "0.42.0" +version = "0.43.0" dependencies = [ "atomic-waker", "crossbeam-deque", @@ -3502,7 +3502,7 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.42.0" +version = "0.43.0" dependencies = [ "atoi", "bytemuck", @@ -3521,7 +3521,7 @@ dependencies = [ [[package]] name = 
"polars-utils" -version = "0.42.0" +version = "0.43.0" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index a83488882fb6..11952df77330 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ default-members = [ # ] [workspace.package] -version = "0.42.0" +version = "0.43.0" authors = ["Ritchie Vink "] edition = "2021" homepage = "https://www.pola.rs/" @@ -96,27 +96,27 @@ version_check = "0.9.4" xxhash-rust = { version = "0.8.6", features = ["xxh3"] } zstd = "0.13" -polars = { version = "0.42.0", path = "crates/polars", default-features = false } -polars-compute = { version = "0.42.0", path = "crates/polars-compute", default-features = false } -polars-core = { version = "0.42.0", path = "crates/polars-core", default-features = false } -polars-error = { version = "0.42.0", path = "crates/polars-error", default-features = false } -polars-expr = { version = "0.42.0", path = "crates/polars-expr", default-features = false } -polars-ffi = { version = "0.42.0", path = "crates/polars-ffi", default-features = false } -polars-io = { version = "0.42.0", path = "crates/polars-io", default-features = false } -polars-json = { version = "0.42.0", path = "crates/polars-json", default-features = false } -polars-lazy = { version = "0.42.0", path = "crates/polars-lazy", default-features = false } -polars-mem-engine = { version = "0.42.0", path = "crates/polars-mem-engine", default-features = false } -polars-ops = { version = "0.42.0", path = "crates/polars-ops", default-features = false } -polars-parquet = { version = "0.42.0", path = "crates/polars-parquet", default-features = false } -polars-pipe = { version = "0.42.0", path = "crates/polars-pipe", default-features = false } -polars-plan = { version = "0.42.0", path = "crates/polars-plan", default-features = false } -polars-python = { version = "0.42.0", path = "crates/polars-python", default-features = false } -polars-row = { version = "0.42.0", path = "crates/polars-row", default-features = false } -polars-schema = { version = "0.42.0", path = "crates/polars-schema", default-features = false } -polars-sql = { version = "0.42.0", path = "crates/polars-sql", default-features = false } -polars-stream = { version = "0.42.0", path = "crates/polars-stream", default-features = false } -polars-time = { version = "0.42.0", path = "crates/polars-time", default-features = false } -polars-utils = { version = "0.42.0", path = "crates/polars-utils", default-features = false } +polars = { version = "0.43.0", path = "crates/polars", default-features = false } +polars-compute = { version = "0.43.0", path = "crates/polars-compute", default-features = false } +polars-core = { version = "0.43.0", path = "crates/polars-core", default-features = false } +polars-error = { version = "0.43.0", path = "crates/polars-error", default-features = false } +polars-expr = { version = "0.43.0", path = "crates/polars-expr", default-features = false } +polars-ffi = { version = "0.43.0", path = "crates/polars-ffi", default-features = false } +polars-io = { version = "0.43.0", path = "crates/polars-io", default-features = false } +polars-json = { version = "0.43.0", path = "crates/polars-json", default-features = false } +polars-lazy = { version = "0.43.0", path = "crates/polars-lazy", default-features = false } +polars-mem-engine = { version = "0.43.0", path = "crates/polars-mem-engine", default-features = false } +polars-ops = { version = "0.43.0", path = "crates/polars-ops", default-features = false } +polars-parquet = { version = "0.43.0", path = 
"crates/polars-parquet", default-features = false } +polars-pipe = { version = "0.43.0", path = "crates/polars-pipe", default-features = false } +polars-plan = { version = "0.43.0", path = "crates/polars-plan", default-features = false } +polars-python = { version = "0.43.0", path = "crates/polars-python", default-features = false } +polars-row = { version = "0.43.0", path = "crates/polars-row", default-features = false } +polars-schema = { version = "0.43.0", path = "crates/polars-schema", default-features = false } +polars-sql = { version = "0.43.0", path = "crates/polars-sql", default-features = false } +polars-stream = { version = "0.43.0", path = "crates/polars-stream", default-features = false } +polars-time = { version = "0.43.0", path = "crates/polars-time", default-features = false } +polars-utils = { version = "0.43.0", path = "crates/polars-utils", default-features = false } [workspace.dependencies.arrow-format] package = "polars-arrow-format" @@ -124,7 +124,7 @@ version = "0.1.0" [workspace.dependencies.arrow] package = "polars-arrow" -version = "0.42.0" +version = "0.43.0" path = "crates/polars-arrow" default-features = false features = [ diff --git a/crates/Makefile b/crates/Makefile index e98014380065..28622ee061f5 100644 --- a/crates/Makefile +++ b/crates/Makefile @@ -104,6 +104,7 @@ doctest: ## Check that documentation builds publish: ## Publish Polars crates cargo publish --allow-dirty -p polars-error cargo publish --allow-dirty -p polars-utils + cargo publish --allow-dirty -p polars-schema cargo publish --allow-dirty -p polars-arrow cargo publish --allow-dirty -p polars-compute cargo publish --allow-dirty -p polars-row diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index e2a7d0c45649..4cec6b590e81 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -26,10 +26,11 @@ tokio = { workspace = true } polars-core = { workspace = true } polars-error = { workspace = true } -polars-expr = { workspace = true } -polars-mem-engine = { workspace = true, features = ["parquet"] } +polars-expr = { workspace = true, features = ["dtype-full"] } +# TODO: feature gate +polars-mem-engine = { workspace = true, features = ["parquet", "csv", "json", "ipc", "cloud", "python", "temporal", "dtype-categorical", "dtype-i8", "dtype-i16", "dtype-u8", "dtype-u16", "dtype-decimal", "dtype-struct", "object"] } polars-parquet = { workspace = true } -polars-plan = { workspace = true, features = ["parquet"] } +polars-plan = { workspace = true, features = ["parquet", "csv", "json", "ipc", "cloud", "python", "serde", "temporal", "dtype-categorical", "dtype-i8", "dtype-i16", "dtype-u8", "dtype-u16", "dtype-decimal", "dtype-struct", "object"] } [build-dependencies] version_check = { workspace = true } From db1b15fe6f5bad2c3c86fe0ceaacacf2f7eb3d72 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 11 Sep 2024 14:26:28 +0400 Subject: [PATCH 24/28] feat(python): Update `BytecodeParser` for upcoming Python 3.13 (#18677) --- py-polars/polars/_utils/udfs.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/py-polars/polars/_utils/udfs.py b/py-polars/polars/_utils/udfs.py index 6a94c5bf6b9f..dca868fb9f9f 100644 --- a/py-polars/polars/_utils/udfs.py +++ b/py-polars/polars/_utils/udfs.py @@ -604,10 +604,10 @@ def op(inst: Instruction) -> str: return "replace_strict" else: msg = ( - "unrecognized opname" - "\n\nPlease report a bug to https://github.com/pola-rs/polars/issues" - " with the content of function 
you were passing to `map` and the" - f" following instruction object:\n{inst!r}" + f"unexpected or unrecognised op name ({inst.opname})\n\n" + "Please report a bug to https://github.com/pola-rs/polars/issues " + "with the content of function you were passing to the `map` " + f"expressions and the following instruction object:\n{inst!r}" ) raise AssertionError(msg) @@ -751,7 +751,7 @@ def __init__( self._original_instructions = list(instructions) self._rewritten_instructions = self._rewrite( self._upgrade_instruction(inst) - for inst in self._original_instructions + for inst in self._unpack_superinstructions(self._original_instructions) if inst.opname not in self._ignored_ops ) @@ -1018,6 +1018,22 @@ def _rewrite_methods( return len(matching_instructions) + @staticmethod + def _unpack_superinstructions( + instructions: list[Instruction], + ) -> Iterator[Instruction]: + """Expand known 'superinstructions' into their component parts.""" + for inst in instructions: + if inst.opname == "LOAD_FAST_LOAD_FAST": + for idx in (0, 1): + yield inst._replace( + opname="LOAD_FAST", + argval=inst.argval[idx], + argrepr=inst.argval[idx], + ) + else: + yield inst + @staticmethod def _upgrade_instruction(inst: Instruction) -> Instruction: """Rewrite any older binary opcodes using py 3.11 'BINARY_OP' instead.""" From eac567f62c80b486fac4f5b8c95910da9592cc52 Mon Sep 17 00:00:00 2001 From: Or Yarimi <17513370+yarimiz@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:02:58 +0300 Subject: [PATCH 25/28] fix(rust): Qcut all nulls panics (#18667) Co-authored-by: ritchie --- crates/polars-ops/src/series/ops/cut.rs | 14 +++++++++----- py-polars/tests/unit/operations/test_qcut.py | 9 +++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/crates/polars-ops/src/series/ops/cut.rs b/crates/polars-ops/src/series/ops/cut.rs index 2deb6dfeb52f..cba643cf98e9 100644 --- a/crates/polars-ops/src/series/ops/cut.rs +++ b/crates/polars-ops/src/series/ops/cut.rs @@ -131,15 +131,19 @@ pub fn qcut( ) -> PolarsResult { polars_ensure!(!probs.iter().any(|x| x.is_nan()), ComputeError: "quantiles cannot be NaN"); + if s.null_count() == s.len() { + // If we only have nulls we don't have any breakpoints. + return Ok(Series::full_null( + s.name().clone(), + s.len(), + &DataType::Categorical(None, Default::default()), + )); + } + let s = s.cast(&DataType::Float64)?; let s2 = s.sort(SortOptions::default())?; let ca = s2.f64()?; - if ca.null_count() == ca.len() { - // If we only have nulls we don't have any breakpoints. 
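The user-facing effect of this early return, sketched to mirror the `test_qcut_full_null_with_labels` test added below:

```python
import polars as pl

s = pl.Series("a", [None, None, None, None])

# Previously this path could panic once labels were involved; with the
# early return it yields an all-null Categorical of the same length.
out = s.qcut([0.25, 0.50], labels=["1", "2", "3"])
assert out.dtype == pl.Categorical
assert out.null_count() == 4
```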
- return cut(&s, vec![], labels, left_closed, include_breaks); - } - let f = |&p| { ca.quantile(p, QuantileInterpolOptions::Linear) .unwrap() diff --git a/py-polars/tests/unit/operations/test_qcut.py b/py-polars/tests/unit/operations/test_qcut.py index e92afba49dda..7b7b1c399a45 100644 --- a/py-polars/tests/unit/operations/test_qcut.py +++ b/py-polars/tests/unit/operations/test_qcut.py @@ -100,6 +100,15 @@ def test_qcut_full_null() -> None: assert_series_equal(result, expected, categorical_as_str=True) +def test_qcut_full_null_with_labels() -> None: + s = pl.Series("a", [None, None, None, None]) + + result = s.qcut([0.25, 0.50], labels=["1", "2", "3"]) + + expected = pl.Series("a", [None, None, None, None], dtype=pl.Categorical) + assert_series_equal(result, expected, categorical_as_str=True) + + def test_qcut_allow_duplicates() -> None: s = pl.Series([1, 2, 2, 3]) From 79b91c39683e0e12bb28132f418ecee18e6bf291 Mon Sep 17 00:00:00 2001 From: Ohan Fillbach <19675187+ohanf@users.noreply.github.com> Date: Wed, 11 Sep 2024 06:23:16 -0500 Subject: [PATCH 26/28] fix(rust): Refactor decompression checks and add support for decompressing JSON (#18536) Co-authored-by: Ohan Fillbach --- crates/polars-io/src/csv/read/parser.rs | 2 +- crates/polars-io/src/csv/read/read_impl.rs | 4 +- crates/polars-io/src/csv/read/utils.rs | 27 ++++--- crates/polars-io/src/json/mod.rs | 11 ++- crates/polars-io/src/utils/compression.rs | 77 +++++++++++++++---- crates/polars-io/src/utils/mod.rs | 1 - crates/polars-io/src/utils/other.rs | 41 ---------- .../src/executors/scan/csv.rs | 1 + .../src/executors/scan/ndjson.rs | 1 + .../polars-plan/src/plans/conversion/scans.rs | 1 + .../polars-plan/src/plans/functions/count.rs | 2 +- py-polars/tests/unit/io/test_json.py | 35 +++++++++ 12 files changed, 130 insertions(+), 73 deletions(-) diff --git a/crates/polars-io/src/csv/read/parser.rs b/crates/polars-io/src/csv/read/parser.rs index 282a304003a3..fa19b7575179 100644 --- a/crates/polars-io/src/csv/read/parser.rs +++ b/crates/polars-io/src/csv/read/parser.rs @@ -14,7 +14,7 @@ use super::options::{CommentPrefix, NullValuesCompiled}; use super::splitfields::SplitFields; use super::utils::get_file_chunks; use crate::path_utils::is_cloud_url; -use crate::utils::maybe_decompress_bytes; +use crate::utils::compression::maybe_decompress_bytes; /// Read the number of rows without parsing columns /// useful for count(*) queries diff --git a/crates/polars-io/src/csv/read/read_impl.rs b/crates/polars-io/src/csv/read/read_impl.rs index eabd1ca6a2f0..048df2dc2fb2 100644 --- a/crates/polars-io/src/csv/read/read_impl.rs +++ b/crates/polars-io/src/csv/read/read_impl.rs @@ -24,7 +24,7 @@ use super::utils::get_file_chunks; use crate::mmap::ReaderBytes; use crate::predicates::PhysicalIoExpr; #[cfg(not(any(feature = "decompress", feature = "decompress-fast")))] -use crate::utils::is_compressed; +use crate::utils::compression::SupportedCompression; use crate::utils::update_row_counts; use crate::RowIndex; @@ -179,7 +179,7 @@ impl<'a> CoreReader<'a> { let mut reader_bytes = reader_bytes; #[cfg(not(any(feature = "decompress", feature = "decompress-fast")))] - if is_compressed(&reader_bytes) { + if SupportedCompression::check(&reader_bytes).is_some() { polars_bail!( ComputeError: "cannot read compressed CSV file; \ compile with feature 'decompress' or 'decompress-fast'" diff --git a/crates/polars-io/src/csv/read/utils.rs b/crates/polars-io/src/csv/read/utils.rs index 33c6a6c8f290..802f60d93d3b 100644 --- a/crates/polars-io/src/csv/read/utils.rs +++ 
b/crates/polars-io/src/csv/read/utils.rs
@@ -129,16 +129,23 @@ pub(crate) fn decompress(
     quote_char: Option<u8>,
     eol_char: u8,
 ) -> Option<Vec<u8>> {
-    use crate::utils::compression::magic::*;
-    if bytes.starts_with(&GZIP) {
-        let mut decoder = flate2::read::MultiGzDecoder::new(bytes);
-        decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
-    } else if bytes.starts_with(&ZLIB0) || bytes.starts_with(&ZLIB1) || bytes.starts_with(&ZLIB2) {
-        let mut decoder = flate2::read::ZlibDecoder::new(bytes);
-        decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
-    } else if bytes.starts_with(&ZSTD) {
-        let mut decoder = zstd::Decoder::new(bytes).ok()?;
-        decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
+    use crate::utils::compression::SupportedCompression;
+
+    if let Some(algo) = SupportedCompression::check(bytes) {
+        match algo {
+            SupportedCompression::GZIP => {
+                let mut decoder = flate2::read::MultiGzDecoder::new(bytes);
+                decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
+            },
+            SupportedCompression::ZLIB => {
+                let mut decoder = flate2::read::ZlibDecoder::new(bytes);
+                decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
+            },
+            SupportedCompression::ZSTD => {
+                let mut decoder = zstd::Decoder::new(bytes).ok()?;
+                decompress_impl(&mut decoder, n_rows, separator, quote_char, eol_char)
+            },
+        }
     } else {
         None
     }
diff --git a/crates/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs
index 1a8f9eb8f5a4..f58a9010ac25 100644
--- a/crates/polars-io/src/json/mod.rs
+++ b/crates/polars-io/src/json/mod.rs
@@ -272,8 +272,15 @@ where
         JsonFormat::Json => {
             polars_ensure!(!self.ignore_errors, InvalidOperation: "'ignore_errors' only supported in ndjson");
             let mut bytes = rb.deref().to_vec();
-            let json_value =
-                simd_json::to_borrowed_value(&mut bytes).map_err(to_compute_err)?;
+            let owned = &mut vec![];
+            compression::maybe_decompress_bytes(&bytes, owned)?;
+            // the easiest way to avoid ownership issues is by implicitly figuring out if
+            // decompression happened (owned is only populated on decompress), then pick which bytes to parse
+            let json_value = if owned.is_empty() {
+                simd_json::to_borrowed_value(&mut bytes).map_err(to_compute_err)?
+            } else {
+                simd_json::to_borrowed_value(owned).map_err(to_compute_err)?
+            };

             // struct type
             let dtype = if let Some(mut schema) = self.schema {
diff --git a/crates/polars-io/src/utils/compression.rs b/crates/polars-io/src/utils/compression.rs
index d771b4c6ca1e..4c8750f4f9f6 100644
--- a/crates/polars-io/src/utils/compression.rs
+++ b/crates/polars-io/src/utils/compression.rs
@@ -1,19 +1,66 @@
-// magic numbers
-pub mod magic {
-    pub const GZIP: [u8; 2] = [31, 139];
-    pub const ZLIB0: [u8; 2] = [0x78, 0x01];
-    pub const ZLIB1: [u8; 2] = [0x78, 0x9C];
-    pub const ZLIB2: [u8; 2] = [0x78, 0xDA];
-    pub const ZSTD: [u8; 4] = [0x28, 0xB5, 0x2F, 0xFD];
+use std::io::Read;
+
+use polars_core::prelude::*;
+use polars_error::to_compute_err;
+
+/// Represents the compression algorithms that we have decoders for
+pub enum SupportedCompression {
+    GZIP,
+    ZLIB,
+    ZSTD,
 }

-/// check if csv file is compressed
-pub fn is_compressed(bytes: &[u8]) -> bool {
-    use magic::*;
+impl SupportedCompression {
+    /// If the given byte slice starts with the "magic" bytes for a supported compression family, return
+    /// that family, for unsupported/uncompressed slices, return None
+    pub fn check(bytes: &[u8]) -> Option<Self> {
+        if bytes.len() < 4 {
+            // not enough bytes to perform prefix checks
+            return None;
+        }
+        match bytes[..4] {
+            [31, 139, _, _] => Some(Self::GZIP),
+            [0x78, 0x01, _, _] | // ZLIB0
+            [0x78, 0x9C, _, _] | // ZLIB1
+            [0x78, 0xDA, _, _] // ZLIB2
+                => Some(Self::ZLIB),
+            [0x28, 0xB5, 0x2F, 0xFD] => Some(Self::ZSTD),
+            _ => None,
+        }
+    }
+}
+
+/// Decompress `bytes` if compression is detected, otherwise simply return it.
+/// An `out` vec must be given for ownership of the decompressed data.
+pub fn maybe_decompress_bytes<'a>(bytes: &'a [u8], out: &'a mut Vec<u8>) -> PolarsResult<&'a [u8]> {
+    assert!(out.is_empty());
+
+    if let Some(algo) = SupportedCompression::check(bytes) {
+        #[cfg(any(feature = "decompress", feature = "decompress-fast"))]
+        {
+            match algo {
+                SupportedCompression::GZIP => {
+                    flate2::read::MultiGzDecoder::new(bytes)
+                        .read_to_end(out)
+                        .map_err(to_compute_err)?;
+                },
+                SupportedCompression::ZLIB => {
+                    flate2::read::ZlibDecoder::new(bytes)
+                        .read_to_end(out)
+                        .map_err(to_compute_err)?;
+                },
+                SupportedCompression::ZSTD => {
+                    zstd::Decoder::new(bytes)?.read_to_end(out)?;
+                },
+            }

-    bytes.starts_with(&ZLIB0)
-        || bytes.starts_with(&ZLIB1)
-        || bytes.starts_with(&ZLIB2)
-        || bytes.starts_with(&GZIP)
-        || bytes.starts_with(&ZSTD)
+            Ok(out)
+        }
+        #[cfg(not(any(feature = "decompress", feature = "decompress-fast")))]
+        {
+            panic!("cannot decompress without 'decompress' or 'decompress-fast' feature")
+        }
+    } else {
+        Ok(bytes)
+    }
 }
diff --git a/crates/polars-io/src/utils/mod.rs b/crates/polars-io/src/utils/mod.rs
index 87c80b1b5c5a..8cae1ab1ef5f 100644
--- a/crates/polars-io/src/utils/mod.rs
+++ b/crates/polars-io/src/utils/mod.rs
@@ -1,7 +1,6 @@
 pub mod compression;
 mod other;

-pub use compression::is_compressed;
 pub use other::*;
 #[cfg(feature = "cloud")]
 pub mod byte_source;
diff --git a/crates/polars-io/src/utils/other.rs b/crates/polars-io/src/utils/other.rs
index 1984e6ad480e..8999ecb657d4 100644
--- a/crates/polars-io/src/utils/other.rs
+++ b/crates/polars-io/src/utils/other.rs
@@ -6,7 +6,6 @@ use once_cell::sync::Lazy;
 use polars_core::prelude::*;
 #[cfg(any(feature = "ipc_streaming", feature = "parquet"))]
 use polars_core::utils::{accumulate_dataframes_vertical_unchecked, split_df_as_ref};
-use polars_error::to_compute_err;
 use polars_utils::mmap::MMapSemaphore;
 use regex::{Regex, RegexBuilder};

@@ -46,46 +45,6 @@ pub fn get_reader_bytes<'a, R: Read + MmapBytesReader + ?Sized>(
     }
 }

-/// Decompress `bytes` if compression is detected, otherwise simply return it.
-/// An `out` vec must be given for ownership of the decompressed data.
-pub fn maybe_decompress_bytes<'a>(bytes: &'a [u8], out: &'a mut Vec<u8>) -> PolarsResult<&'a [u8]> {
-    assert!(out.is_empty());
-    use crate::prelude::is_compressed;
-    let is_compressed = bytes.len() >= 4 && is_compressed(bytes);
-
-    if is_compressed {
-        #[cfg(any(feature = "decompress", feature = "decompress-fast"))]
-        {
-            use crate::utils::compression::magic::*;
-
-            if bytes.starts_with(&GZIP) {
-                flate2::read::MultiGzDecoder::new(bytes)
-                    .read_to_end(out)
-                    .map_err(to_compute_err)?;
-            } else if bytes.starts_with(&ZLIB0)
-                || bytes.starts_with(&ZLIB1)
-                || bytes.starts_with(&ZLIB2)
-            {
-                flate2::read::ZlibDecoder::new(bytes)
-                    .read_to_end(out)
-                    .map_err(to_compute_err)?;
-            } else if bytes.starts_with(&ZSTD) {
-                zstd::Decoder::new(bytes)?.read_to_end(out)?;
-            } else {
-                polars_bail!(ComputeError: "unimplemented compression format")
-            }
-
-            Ok(out)
-        }
-        #[cfg(not(any(feature = "decompress", feature = "decompress-fast")))]
-        {
-            panic!("cannot decompress without 'decompress' or 'decompress-fast' feature")
-        }
-    } else {
-        Ok(bytes)
-    }
-}
-
 #[cfg(any(
     feature = "ipc",
     feature = "ipc_streaming",
diff --git a/crates/polars-mem-engine/src/executors/scan/csv.rs b/crates/polars-mem-engine/src/executors/scan/csv.rs
index 0ebcb7632ae7..fde24b4ef6a0 100644
--- a/crates/polars-mem-engine/src/executors/scan/csv.rs
+++ b/crates/polars-mem-engine/src/executors/scan/csv.rs
@@ -4,6 +4,7 @@ use polars_core::config;
 use polars_core::utils::{
     accumulate_dataframes_vertical, accumulate_dataframes_vertical_unchecked,
 };
+use polars_io::utils::compression::maybe_decompress_bytes;

 use super::*;

diff --git a/crates/polars-mem-engine/src/executors/scan/ndjson.rs b/crates/polars-mem-engine/src/executors/scan/ndjson.rs
index a662760fd54b..90f3e1e9fa0b 100644
--- a/crates/polars-mem-engine/src/executors/scan/ndjson.rs
+++ b/crates/polars-mem-engine/src/executors/scan/ndjson.rs
@@ -1,5 +1,6 @@
 use polars_core::config;
 use polars_core::utils::accumulate_dataframes_vertical;
+use polars_io::utils::compression::maybe_decompress_bytes;

 use super::*;

diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs
index 9fd419f90f63..8e755fe38597 100644
--- a/crates/polars-plan/src/plans/conversion/scans.rs
+++ b/crates/polars-plan/src/plans/conversion/scans.rs
@@ -3,6 +3,7 @@ use polars_io::path_utils::is_cloud_url;
 #[cfg(feature = "cloud")]
 use polars_io::pl_async::get_runtime;
 use polars_io::prelude::*;
+use polars_io::utils::compression::maybe_decompress_bytes;
 use polars_io::RowIndex;

 use super::*;
diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs
index 7375ff47ff31..38709c5c7b8e 100644
--- a/crates/polars-plan/src/plans/functions/count.rs
+++ b/crates/polars-plan/src/plans/functions/count.rs
@@ -219,7 +219,7 @@ pub(super) fn count_rows_ndjson(
     cloud_options: Option<&CloudOptions>,
 ) -> PolarsResult<usize> {
     use polars_core::config;
-    use polars_io::utils::maybe_decompress_bytes;
+    use polars_io::utils::compression::maybe_decompress_bytes;

     if sources.is_empty() {
         return Ok(0);
diff --git a/py-polars/tests/unit/io/test_json.py b/py-polars/tests/unit/io/test_json.py
index 4bce4ee4e0ce..4e24cf3b08cd 100644
--- a/py-polars/tests/unit/io/test_json.py
+++ b/py-polars/tests/unit/io/test_json.py
@@ -1,13 +1,17 @@
 from __future__ import 
annotations +import gzip import io import json import typing +import zlib from collections import OrderedDict from decimal import Decimal as D from io import BytesIO from typing import TYPE_CHECKING +import zstandard + if TYPE_CHECKING: from pathlib import Path @@ -385,3 +389,34 @@ def test_empty_json() -> None: df = pl.read_json(b'{"j":{}}') assert df.dtypes == [pl.Struct([])] assert df.shape == (0, 1) + + +def test_compressed_json() -> None: + # shared setup + json_obj = [ + {"id": 1, "name": "Alice", "trusted": True}, + {"id": 2, "name": "Bob", "trusted": True}, + {"id": 3, "name": "Carol", "trusted": False}, + ] + expected = pl.DataFrame(json_obj, orient="row") + json_bytes = json.dumps(json_obj).encode() + + # gzip + compressed_bytes = gzip.compress(json_bytes) + out = pl.read_json(compressed_bytes) + assert_frame_equal(out, expected) + + # zlib + compressed_bytes = zlib.compress(json_bytes) + out = pl.read_json(compressed_bytes) + assert_frame_equal(out, expected) + + # zstd + compressed_bytes = zstandard.compress(json_bytes) + out = pl.read_json(compressed_bytes) + assert_frame_equal(out, expected) + + # no compression + uncompressed = io.BytesIO(json_bytes) + out = pl.read_json(uncompressed) + assert_frame_equal(out, expected) From 3e2119fe582db4e5e80c86ece657c705f04e2ee8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 11 Sep 2024 13:23:45 +0200 Subject: [PATCH 27/28] Python Polars 1.7.0 (#18680) --- Cargo.lock | 2 +- py-polars/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1963e248aba0..cd3499f949bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3623,7 +3623,7 @@ dependencies = [ [[package]] name = "py-polars" -version = "1.6.0" +version = "1.7.0" dependencies = [ "built", "jemallocator", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 1147cbdde89a..c52111523fd1 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-polars" -version = "1.6.0" +version = "1.7.0" edition = "2021" [lib] From d8acacfadc7059f6acc363a68839ec312910751e Mon Sep 17 00:00:00 2001 From: ritchie Date: Wed, 11 Sep 2024 13:33:43 +0200 Subject: [PATCH 28/28] chore: Fix python release --- Cargo.lock | 1 + py-polars/Cargo.toml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index cd3499f949bf..bd7cad4040f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3629,6 +3629,7 @@ dependencies = [ "jemallocator", "libc", "mimalloc", + "polars", "polars-python", "pyo3", ] diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index c52111523fd1..45b95825b825 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -9,6 +9,8 @@ crate-type = ["cdylib"] [dependencies] libc = { workspace = true } +# Explicit dependency is needed to add bigidx in CI during release +polars = { workspace = true } polars-python = { workspace = true, features = ["pymethods", "iejoin"] } pyo3 = { workspace = true, features = ["abi3-py38", "chrono", "extension-module", "multiple-pymethods"] }