From 58a38af21dccaf3326514494a1db118601c8c2ca Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 8 Dec 2024 10:27:37 +0100 Subject: [PATCH] rust Polars 0.45.0 (#20213) --- Cargo.lock | 48 ++++++++--------- Cargo.toml | 48 ++++++++--------- crates/polars-json/Cargo.toml | 2 +- crates/polars-lazy/Cargo.toml | 10 +++- crates/polars-stream/Cargo.toml | 12 +++-- .../polars-stream/src/nodes/io_sources/csv.rs | 7 ++- .../polars-stream/src/nodes/io_sources/mod.rs | 2 + crates/polars-stream/src/physical_plan/fmt.rs | 2 + .../src/physical_plan/lower_ir.rs | 51 +++++++++++-------- .../src/physical_plan/to_graph.rs | 2 + 10 files changed, 106 insertions(+), 78 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1cd9bacb0392..c50080a41b03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2936,7 +2936,7 @@ dependencies = [ [[package]] name = "polars" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "apache-avro", @@ -2965,7 +2965,7 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "async-stream", @@ -3031,7 +3031,7 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.44.2" +version = "0.45.0" dependencies = [ "atoi_simd", "bytemuck", @@ -3052,7 +3052,7 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "bincode", @@ -3089,7 +3089,7 @@ dependencies = [ [[package]] name = "polars-doc-examples" -version = "0.44.2" +version = "0.45.0" dependencies = [ "aws-config", "aws-sdk-s3", @@ -3103,7 +3103,7 @@ dependencies = [ [[package]] name = "polars-dylib" -version = "0.44.2" +version = "0.45.0" dependencies = [ "polars", "polars-arrow", @@ -3117,7 +3117,7 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.44.2" +version = "0.45.0" dependencies = [ "avro-schema", "object_store", @@ -3129,7 +3129,7 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "bitflags", @@ -3152,7 +3152,7 @@ dependencies = [ [[package]] name = "polars-ffi" -version = "0.44.2" +version = "0.45.0" dependencies = [ "polars-arrow", "polars-core", @@ -3160,7 +3160,7 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "async-trait", @@ -3209,7 +3209,7 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "chrono", @@ -3230,7 +3230,7 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "bitflags", @@ -3258,7 +3258,7 @@ dependencies = [ [[package]] name = "polars-mem-engine" -version = "0.44.2" +version = "0.45.0" dependencies = [ "futures", "memmap2", @@ -3279,7 +3279,7 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "aho-corasick", @@ -3316,7 +3316,7 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "async-stream", @@ -3357,7 +3357,7 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.44.2" +version = "0.45.0" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -3382,7 +3382,7 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "bitflags", @@ -3421,7 +3421,7 @@ dependencies = [ [[package]] name = "polars-python" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "arboard", @@ -3457,7 +3457,7 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.44.2" +version = "0.45.0" dependencies = [ "bitflags", "bytemuck", @@ -3469,7 +3469,7 @@ dependencies = [ [[package]] name = "polars-schema" -version = "0.44.2" +version = "0.45.0" dependencies = [ "indexmap", "polars-error", @@ -3480,7 +3480,7 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.44.2" +version = "0.45.0" dependencies = [ "hex", "once_cell", @@ -3500,7 +3500,7 @@ dependencies = [ [[package]] name = "polars-stream" -version = "0.44.2" +version = "0.45.0" dependencies = [ "atomic-waker", "crossbeam-deque", @@ -3528,7 +3528,7 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.44.2" +version = "0.45.0" dependencies = [ "atoi", "bytemuck", @@ -3549,7 +3549,7 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.44.2" +version = "0.45.0" dependencies = [ "ahash", "bytemuck", diff --git a/Cargo.toml b/Cargo.toml index 4249c5d4b494..3474ab16dbd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ default-members = [ ] [workspace.package] -version = "0.44.2" +version = "0.45.0" authors = ["Ritchie Vink "] edition = "2021" homepage = "https://www.pola.rs/" @@ -92,28 +92,28 @@ version_check = "0.9.4" xxhash-rust = { version = "0.8.6", features = ["xxh3"] } zstd = "0.13" -polars = { version = "0.44.2", path = "crates/polars", default-features = false } -polars-compute = { version = "0.44.2", path = "crates/polars-compute", default-features = false } -polars-core = { version = "0.44.2", path = "crates/polars-core", default-features = false } -polars-dylib = { version = "0.44.2", path = "crates/polars-dylib", default-features = false } -polars-error = { version = "0.44.2", path = "crates/polars-error", default-features = false } -polars-expr = { version = "0.44.2", path = "crates/polars-expr", default-features = false } -polars-ffi = { version = "0.44.2", path = "crates/polars-ffi", default-features = false } -polars-io = { version = "0.44.2", path = "crates/polars-io", default-features = false } -polars-json = { version = "0.44.2", path = "crates/polars-json", default-features = false } -polars-lazy = { version = "0.44.2", path = "crates/polars-lazy", default-features = false } -polars-mem-engine = { version = "0.44.2", path = "crates/polars-mem-engine", default-features = false } -polars-ops = { version = "0.44.2", path = "crates/polars-ops", default-features = false } -polars-parquet = { version = "0.44.2", path = "crates/polars-parquet", default-features = false } -polars-pipe = { version = "0.44.2", path = "crates/polars-pipe", default-features = false } -polars-plan = { version = "0.44.2", path = "crates/polars-plan", default-features = false } -polars-python = { version = "0.44.2", path = "crates/polars-python", default-features = false } -polars-row = { version = "0.44.2", path = "crates/polars-row", default-features = false } -polars-schema = { version = "0.44.2", path = "crates/polars-schema", default-features = false } -polars-sql = { version = "0.44.2", path = "crates/polars-sql", default-features = false } -polars-stream = { version = "0.44.2", path = "crates/polars-stream", default-features = false } -polars-time = { version = "0.44.2", path = "crates/polars-time", default-features = false } -polars-utils = { version = "0.44.2", path = "crates/polars-utils", default-features = false } +polars = { version = "0.45.0", path = "crates/polars", default-features = false } +polars-compute = { version = "0.45.0", path = "crates/polars-compute", default-features = false } +polars-core = { version = "0.45.0", path = "crates/polars-core", default-features = false } +polars-dylib = { version = "0.45.0", path = "crates/polars-dylib", default-features = false } +polars-error = { version = "0.45.0", path = "crates/polars-error", default-features = false } +polars-expr = { version = "0.45.0", path = "crates/polars-expr", default-features = false } +polars-ffi = { version = "0.45.0", path = "crates/polars-ffi", default-features = false } +polars-io = { version = "0.45.0", path = "crates/polars-io", default-features = false } +polars-json = { version = "0.45.0", path = "crates/polars-json", default-features = false } +polars-lazy = { version = "0.45.0", path = "crates/polars-lazy", default-features = false } +polars-mem-engine = { version = "0.45.0", path = "crates/polars-mem-engine", default-features = false } +polars-ops = { version = "0.45.0", path = "crates/polars-ops", default-features = false } +polars-parquet = { version = "0.45.0", path = "crates/polars-parquet", default-features = false } +polars-pipe = { version = "0.45.0", path = "crates/polars-pipe", default-features = false } +polars-plan = { version = "0.45.0", path = "crates/polars-plan", default-features = false } +polars-python = { version = "0.45.0", path = "crates/polars-python", default-features = false } +polars-row = { version = "0.45.0", path = "crates/polars-row", default-features = false } +polars-schema = { version = "0.45.0", path = "crates/polars-schema", default-features = false } +polars-sql = { version = "0.45.0", path = "crates/polars-sql", default-features = false } +polars-stream = { version = "0.45.0", path = "crates/polars-stream", default-features = false } +polars-time = { version = "0.45.0", path = "crates/polars-time", default-features = false } +polars-utils = { version = "0.45.0", path = "crates/polars-utils", default-features = false } [workspace.dependencies.arrow-format] package = "polars-arrow-format" @@ -121,7 +121,7 @@ version = "0.1.0" [workspace.dependencies.arrow] package = "polars-arrow" -version = "0.44.2" +version = "0.45.0" path = "crates/polars-arrow" default-features = false features = [ diff --git a/crates/polars-json/Cargo.toml b/crates/polars-json/Cargo.toml index 3d93c34c3418..99e4af655129 100644 --- a/crates/polars-json/Cargo.toml +++ b/crates/polars-json/Cargo.toml @@ -9,7 +9,7 @@ repository = { workspace = true } description = "JSON related logic for the Polars DataFrame library" [dependencies] -polars-compute = { workspace = true } +polars-compute = { workspace = true, features = ["cast"] } polars-error = { workspace = true } polars-utils = { workspace = true } diff --git a/crates/polars-lazy/Cargo.toml b/crates/polars-lazy/Cargo.toml index 7a8236e4ce1c..e4de540fc7d4 100644 --- a/crates/polars-lazy/Cargo.toml +++ b/crates/polars-lazy/Cargo.toml @@ -112,6 +112,7 @@ dtype-array = [ dtype-categorical = [ "polars-plan/dtype-categorical", "polars-pipe?/dtype-categorical", + "polars-stream?/dtype-categorical", "polars-expr/dtype-categorical", "polars-mem-engine/dtype-categorical", ] @@ -230,7 +231,14 @@ list_eval = [] cumulative_eval = [] list_to_struct = ["polars-plan/list_to_struct"] array_to_struct = ["polars-plan/array_to_struct"] -python = ["pyo3", "polars-plan/python", "polars-core/python", "polars-io/python", "polars-mem-engine/python"] +python = [ + "pyo3", + "polars-plan/python", + "polars-core/python", + "polars-io/python", + "polars-mem-engine/python", + "polars-stream?/python", +] row_hash = ["polars-plan/row_hash"] reinterpret = ["polars-plan/reinterpret", "polars-ops/reinterpret"] string_pad = ["polars-plan/string_pad"] diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index f0b3b1c30e35..7f8eafb2747e 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -16,13 +16,13 @@ futures = { workspace = true } memmap = { workspace = true } parking_lot = { workspace = true } pin-project-lite = { workspace = true } -polars-io = { workspace = true } +polars-io = { workspace = true, features = ["async"] } polars-utils = { workspace = true } rand = { workspace = true } rayon = { workspace = true } recursive = { workspace = true } slotmap = { workspace = true } -tokio = { workspace = true } +tokio = { workspace = true, features = ["sync"] } polars-core = { workspace = true } polars-error = { workspace = true } @@ -41,11 +41,13 @@ bitwise = ["polars-core/bitwise", "polars-plan/bitwise", "polars-expr/bitwise"] merge_sorted = ["polars-plan/merge_sorted"] dynamic_group_by = [] strings = [] -ipc = ["polars-mem-engine/ipc", "polars-plan/ipc"] +ipc = ["polars-mem-engine/ipc", "polars-plan/ipc", "polars-io/ipc"] parquet = ["polars-mem-engine/parquet", "polars-plan/parquet"] -csv = ["polars-mem-engine/csv", "polars-plan/csv"] -json = ["polars-mem-engine/json", "polars-plan/json"] +csv = ["polars-mem-engine/csv", "polars-plan/csv", "polars-io/csv"] +json = ["polars-mem-engine/json", "polars-plan/json", "polars-io/json"] cloud = ["polars-mem-engine/cloud", "polars-plan/cloud", "polars-io/cloud"] +dtype-categorical = ["polars-core/dtype-categorical"] +python = ["polars-plan/python"] # We need to specify default features here to match workspace defaults. # Otherwise we get warnings with cargo check/clippy. diff --git a/crates/polars-stream/src/nodes/io_sources/csv.rs b/crates/polars-stream/src/nodes/io_sources/csv.rs index 43023991e97c..1c8b66ab2353 100644 --- a/crates/polars-stream/src/nodes/io_sources/csv.rs +++ b/crates/polars-stream/src/nodes/io_sources/csv.rs @@ -4,10 +4,12 @@ use std::sync::Arc; use futures::stream::FuturesUnordered; use futures::StreamExt; +use polars_core::config; use polars_core::prelude::{AnyValue, DataType, Field}; use polars_core::scalar::Scalar; use polars_core::schema::{SchemaExt, SchemaRef}; -use polars_core::{config, StringCacheHolder}; +#[cfg(feature = "dtype-categorical")] +use polars_core::StringCacheHolder; use polars_error::{polars_bail, PolarsResult}; use polars_io::prelude::_csv_read_internal::{ cast_columns, find_starting_point, prepare_csv_schema, read_chunk, CountLines, @@ -529,6 +531,7 @@ impl CsvSourceNode { struct ChunkReader { reader_schema: SchemaRef, fields_to_cast: Vec, + #[cfg(feature = "dtype-categorical")] _cat_lock: Option, separator: u8, ignore_errors: bool, @@ -568,6 +571,7 @@ impl ChunkReader { let has_categorical = prepare_csv_schema(&mut reader_schema, &mut fields_to_cast)?; + #[cfg(feature = "dtype-categorical")] let _cat_lock = has_categorical.then(polars_core::StringCacheHolder::hold); let parse_options = &*options.parse_options; @@ -602,6 +606,7 @@ impl ChunkReader { Ok(Self { reader_schema, fields_to_cast, + #[cfg(feature = "dtype-categorical")] _cat_lock, separator, ignore_errors: options.ignore_errors, diff --git a/crates/polars-stream/src/nodes/io_sources/mod.rs b/crates/polars-stream/src/nodes/io_sources/mod.rs index 100f2f2be8af..88e855dee72a 100644 --- a/crates/polars-stream/src/nodes/io_sources/mod.rs +++ b/crates/polars-stream/src/nodes/io_sources/mod.rs @@ -1,4 +1,6 @@ +#[cfg(feature = "csv")] pub mod csv; +#[cfg(feature = "ipc")] pub mod ipc; #[cfg(feature = "parquet")] pub mod parquet; diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs index 7ef74d5b0ad9..5a131110f3d9 100644 --- a/crates/polars-stream/src/physical_plan/fmt.rs +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -107,6 +107,8 @@ fn visualize_plan_rec( FileType::Csv(_) => ("csv-sink".to_string(), from_ref(input)), #[cfg(feature = "json")] FileType::Json(_) => ("json-sink".to_string(), from_ref(input)), + #[allow(unreachable_patterns)] + _ => todo!(), }, PhysNodeKind::InMemoryMap { input, map: _ } => { ("in-memory-map".to_string(), from_ref(input)) diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 488162d75849..31175f58ba7e 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -4,6 +4,7 @@ use polars_core::frame::DataFrame; use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; use polars_core::schema::Schema; use polars_error::{polars_ensure, PolarsResult}; +use polars_io::RowIndex; use polars_plan::plans::expr_ir::{ExprIR, OutputName}; use polars_plan::plans::{AExpr, FileScan, FunctionIR, IRAggExpr, IR}; use polars_plan::prelude::{FileType, SinkType}; @@ -335,6 +336,7 @@ pub fn lower_ir( df: Arc::new(DataFrame::empty_with_schema(output_schema.as_ref())), } } else { + #[cfg(feature = "ipc")] if matches!(scan_type, FileScan::Ipc { .. }) { // @TODO: All the things the IPC source does not support yet. if hive_parts.is_some() @@ -350,28 +352,32 @@ pub fn lower_ir( // * with_row_index() -> slice() -> filter() // Some scans have built-in support for applying these operations in an optimized manner. - let opt_rewrite_to_nodes = match &scan_type { - FileScan::Parquet { .. } => (None, None, None), - FileScan::Ipc { .. } => (None, None, predicate.take()), - FileScan::Csv { options, .. } => { - if options.parse_options.comment_prefix.is_none() - && std::env::var("POLARS_DISABLE_EXPERIMENTAL_CSV_SLICE").as_deref() - != Ok("1") - { - // Note: This relies on `CountLines` being exact. - (None, None, predicate.take()) - } else { - // There can be comments in the middle of the file, then `CountLines` won't - // return an accurate line count :'(. - ( - file_options.row_index.take(), - file_options.slice.take(), - predicate.take(), - ) - } - }, - _ => todo!(), - }; + let opt_rewrite_to_nodes: (Option, Option<(i64, usize)>, Option) = + match &scan_type { + #[cfg(feature = "parquet")] + FileScan::Parquet { .. } => (None, None, None), + #[cfg(feature = "ipc")] + FileScan::Ipc { .. } => (None, None, predicate.take()), + #[cfg(feature = "csv")] + FileScan::Csv { options, .. } => { + if options.parse_options.comment_prefix.is_none() + && std::env::var("POLARS_DISABLE_EXPERIMENTAL_CSV_SLICE").as_deref() + != Ok("1") + { + // Note: This relies on `CountLines` being exact. + (None, None, predicate.take()) + } else { + // There can be comments in the middle of the file, then `CountLines` won't + // return an accurate line count :'(. + ( + file_options.row_index.take(), + file_options.slice.take(), + predicate.take(), + ) + } + }, + _ => todo!(), + }; let phys_node = PhysNodeKind::FileScan { scan_sources, @@ -427,6 +433,7 @@ pub fn lower_ir( } }, + #[cfg(feature = "python")] IR::PythonScan { .. } => todo!(), IR::Reduce { .. } => todo!(), IR::Cache { .. } => todo!(), diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs index 6a942f427af1..933a717bd960 100644 --- a/crates/polars-stream/src/physical_plan/to_graph.rs +++ b/crates/polars-stream/src/physical_plan/to_graph.rs @@ -368,6 +368,7 @@ fn to_graph_rec<'a>( todo!() } }, + #[cfg(feature = "ipc")] FileScan::Ipc { options, cloud_options, @@ -385,6 +386,7 @@ fn to_graph_rec<'a>( )?, [], ), + #[cfg(feature = "csv")] FileScan::Csv { options, .. } => { assert!(predicate.is_none());