diff --git a/crates/polars-core/src/frame/group_by/mod.rs b/crates/polars-core/src/frame/group_by/mod.rs
index 08ca7f5e9f47..fc457c8bfdba 100644
--- a/crates/polars-core/src/frame/group_by/mod.rs
+++ b/crates/polars-core/src/frame/group_by/mod.rs
@@ -60,16 +60,21 @@ impl DataFrame {
             !by.is_empty(),
             ComputeError: "at least one key is required in a group_by operation"
         );
-        let by_len = by[0].len();
+        let minimal_by_len = by.iter().map(|s| s.len()).min().expect("at least 1 key");
+        let df_height = self.height();

         // we only throw this error if self.width > 0
         // so that we can still call this on a dummy dataframe where we provide the keys
-        if (by_len != self.height()) && (self.width() > 0) {
+        if (minimal_by_len != df_height) && (self.width() > 0) {
             polars_ensure!(
-                by_len == 1,
+                minimal_by_len == 1,
                 ShapeMismatch: "series used as keys should have the same length as the dataframe"
             );
-            by[0] = by[0].new_from_index(0, self.height())
+            for by_key in by.iter_mut() {
+                if by_key.len() == minimal_by_len {
+                    *by_key = by_key.new_from_index(0, df_height)
+                }
+            }
         };

         let n_partitions = _set_partition_size();
diff --git a/crates/polars-core/src/frame/group_by/proxy.rs b/crates/polars-core/src/frame/group_by/proxy.rs
index 70a478f2da53..d8bc3b4c60e3 100644
--- a/crates/polars-core/src/frame/group_by/proxy.rs
+++ b/crates/polars-core/src/frame/group_by/proxy.rs
@@ -367,19 +367,16 @@ impl GroupsProxy {
         }
     }

-    pub fn take_group_lasts(self) -> Vec<IdxSize> {
+    /// # Safety
+    /// This will not do any bounds checks. The caller must ensure
+    /// all groups have members.
+    pub unsafe fn take_group_lasts(self) -> Vec<IdxSize> {
         match self {
-            GroupsProxy::Idx(groups) => {
-                groups
-                    .all
-                    .iter()
-                    .map(|idx| {
-                        // safety:
-                        // idx has at least one eletment, so -1 is always in bounds
-                        unsafe { *idx.get_unchecked(idx.len() - 1) }
-                    })
-                    .collect()
-            },
+            GroupsProxy::Idx(groups) => groups
+                .all
+                .iter()
+                .map(|idx| *idx.get_unchecked(idx.len() - 1))
+                .collect(),
             GroupsProxy::Slice { groups, .. } => groups
                 .into_iter()
                 .map(|[first, len]| first + len - 1)
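For illustration, the `group_by` change above broadcasts every length-1 key to the frame's height instead of special-casing `by[0]` as the only key. A minimal sketch of what this enables from Python (assumed API usage, mirroring `test_group_by_multiple_keys_one_literal` further down in this diff):

```python
import polars as pl

df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})

# pl.lit(1) contributes a length-1 key series; it is broadcast to the
# frame's height rather than rejected next to the full-length key "a".
out = df.lazy().group_by("a", pl.lit(1)).agg(pl.col("b").max()).collect()
print(out.sort("a"))
```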
diff --git a/crates/polars-ops/src/series/ops/is_last_distinct.rs b/crates/polars-ops/src/series/ops/is_last_distinct.rs
index 9cb00799dacf..57c388f2c5fc 100644
--- a/crates/polars-ops/src/series/ops/is_last_distinct.rs
+++ b/crates/polars-ops/src/series/ops/is_last_distinct.rs
@@ -149,7 +149,8 @@ where
 #[cfg(feature = "dtype-struct")]
 fn is_last_distinct_struct(s: &Series) -> PolarsResult<BooleanChunked> {
     let groups = s.group_tuples(true, false)?;
-    let last = groups.take_group_lasts();
+    // SAFETY: all groups have at least a single member
+    let last = unsafe { groups.take_group_lasts() };

     let mut out = MutableBitmap::with_capacity(s.len());
     out.extend_constant(s.len(), false);
@@ -165,7 +166,8 @@ fn is_last_distinct_struct(s: &Series) -> PolarsResult<BooleanChunked> {
 #[cfg(feature = "group_by_list")]
 fn is_last_distinct_list(ca: &ListChunked) -> PolarsResult<BooleanChunked> {
     let groups = ca.group_tuples(true, false)?;
-    let last = groups.take_group_lasts();
+    // SAFETY: all groups have at least a single member
+    let last = unsafe { groups.take_group_lasts() };

     let mut out = MutableBitmap::with_capacity(ca.len());
     out.extend_constant(ca.len(), false);
diff --git a/crates/polars-plan/src/dsl/functions/mod.rs b/crates/polars-plan/src/dsl/functions/mod.rs
index fd8d8247e339..f95be89c0af6 100644
--- a/crates/polars-plan/src/dsl/functions/mod.rs
+++ b/crates/polars-plan/src/dsl/functions/mod.rs
@@ -21,7 +21,7 @@ pub use correlation::*;
 pub use horizontal::*;
 pub use index::*;
 #[cfg(feature = "temporal")]
-use polars_core::export::arrow::temporal_conversions::NANOSECONDS;
+use polars_core::export::arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS};
 #[cfg(feature = "temporal")]
 use polars_core::utils::arrow::temporal_conversions::SECONDS_IN_DAY;
 #[cfg(feature = "dtype-struct")]
diff --git a/crates/polars-plan/src/dsl/functions/temporal.rs b/crates/polars-plan/src/dsl/functions/temporal.rs
index e6b866c2ac10..5d702059150b 100644
--- a/crates/polars-plan/src/dsl/functions/temporal.rs
+++ b/crates/polars-plan/src/dsl/functions/temporal.rs
@@ -177,6 +177,7 @@ pub struct DurationArgs {
     pub milliseconds: Expr,
     pub microseconds: Expr,
     pub nanoseconds: Expr,
+    pub time_unit: TimeUnit,
 }

 impl Default for DurationArgs {
@@ -190,6 +191,7 @@
             milliseconds: lit(0),
             microseconds: lit(0),
             nanoseconds: lit(0),
+            time_unit: TimeUnit::Microseconds,
         }
     }
 }
@@ -258,15 +260,15 @@ pub fn duration(args: DurationArgs) -> Expr {
         if s.iter().any(|s| s.is_empty()) {
             return Ok(Some(Series::new_empty(
                 s[0].name(),
-                &DataType::Duration(TimeUnit::Nanoseconds),
+                &DataType::Duration(args.time_unit),
             )));
         }

         let days = s[0].cast(&DataType::Int64).unwrap();
         let seconds = s[1].cast(&DataType::Int64).unwrap();
         let mut nanoseconds = s[2].cast(&DataType::Int64).unwrap();
-        let microseconds = s[3].cast(&DataType::Int64).unwrap();
-        let milliseconds = s[4].cast(&DataType::Int64).unwrap();
+        let mut microseconds = s[3].cast(&DataType::Int64).unwrap();
+        let mut milliseconds = s[4].cast(&DataType::Int64).unwrap();
         let minutes = s[5].cast(&DataType::Int64).unwrap();
         let hours = s[6].cast(&DataType::Int64).unwrap();
         let weeks = s[7].cast(&DataType::Int64).unwrap();
@@ -278,34 +280,59 @@
             (s.len() != max_len && s.get(0).unwrap() != AnyValue::Int64(0)) || s.len() == max_len
         };

-        if nanoseconds.len() != max_len {
-            nanoseconds = nanoseconds.new_from_index(0, max_len);
-        }
-        if condition(&microseconds) {
-            nanoseconds = nanoseconds + (microseconds * 1_000);
-        }
-        if condition(&milliseconds) {
-            nanoseconds = nanoseconds + (milliseconds * 1_000_000);
-        }
+        let multiplier = match args.time_unit {
+            TimeUnit::Nanoseconds => NANOSECONDS,
+            TimeUnit::Microseconds => MICROSECONDS,
+            TimeUnit::Milliseconds => MILLISECONDS,
+        };
+
+        let mut duration = match args.time_unit {
+            TimeUnit::Nanoseconds => {
+                if nanoseconds.len() != max_len {
+                    nanoseconds = nanoseconds.new_from_index(0, max_len);
+                }
+                if condition(&microseconds) {
+                    nanoseconds = nanoseconds + (microseconds * 1_000);
+                }
+                if condition(&milliseconds) {
+                    nanoseconds = nanoseconds + (milliseconds * 1_000_000);
+                }
+                nanoseconds
+            },
+            TimeUnit::Microseconds => {
+                if microseconds.len() != max_len {
+                    microseconds = microseconds.new_from_index(0, max_len);
+                }
+                if condition(&milliseconds) {
+                    microseconds = microseconds + (milliseconds * 1_000);
+                }
+                microseconds
+            },
+            TimeUnit::Milliseconds => {
+                if milliseconds.len() != max_len {
+                    milliseconds = milliseconds.new_from_index(0, max_len);
+                }
+                milliseconds
+            },
+        };
+
         if condition(&seconds) {
-            nanoseconds = nanoseconds + (seconds * NANOSECONDS);
+            duration = duration + (seconds * multiplier);
         }
         if condition(&days) {
-            nanoseconds = nanoseconds + (days * NANOSECONDS * SECONDS_IN_DAY);
+            duration = duration + (days * multiplier * SECONDS_IN_DAY);
         }
         if condition(&minutes) {
-            nanoseconds = nanoseconds + minutes * NANOSECONDS * 60;
+            duration = duration + minutes * multiplier * 60;
         }
         if condition(&hours) {
-            nanoseconds = nanoseconds + hours * NANOSECONDS * 60 * 60;
+            duration = duration + hours * multiplier * 60 * 60;
         }
         if condition(&weeks) {
-            nanoseconds = nanoseconds + weeks * NANOSECONDS * SECONDS_IN_DAY * 7;
+            duration = duration + weeks * multiplier * SECONDS_IN_DAY * 7;
         }

-        nanoseconds
-            .cast(&DataType::Duration(TimeUnit::Nanoseconds))
-            .map(Some)
+        duration.cast(&DataType::Duration(args.time_unit)).map(Some)
     }) as Arc<dyn SeriesUdf>);

     Expr::AnonymousFunction {
@@ -320,7 +347,7 @@
             args.weeks,
         ],
         function,
-        output_type: GetOutput::from_type(DataType::Duration(TimeUnit::Nanoseconds)),
+        output_type: GetOutput::from_type(DataType::Duration(args.time_unit)),
         options: FunctionOptions {
             collect_groups: ApplyOptions::ApplyFlat,
             input_wildcard_expansion: true,
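For illustration, the `time_unit` plumbing above means components finer than the chosen unit are simply never added (see the `match` arms: the `Milliseconds` arm ignores `microseconds` and `nanoseconds` entirely). A minimal sketch of the resulting front-end behavior, assuming the Python `time_unit` keyword added later in this diff:

```python
import polars as pl

out = pl.select(
    pl.duration(milliseconds=4, microseconds=5, time_unit="ms").alias("ms"),
    pl.duration(milliseconds=4, microseconds=5, time_unit="us").alias("us"),
)
# The "ms" column drops the 5 microseconds entirely; the "us" column keeps
# them. The dtypes are Duration("ms") and Duration("us") respectively.
print(out.schema)
```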
diff --git a/docs/src/python/user-guide/expressions/structs.py b/docs/src/python/user-guide/expressions/structs.py
index f209420a37ab..ee034a362bc6 100644
--- a/docs/src/python/user-guide/expressions/structs.py
+++ b/docs/src/python/user-guide/expressions/structs.py
@@ -26,24 +26,24 @@
 # --8<-- [end:struct_unnest]

 # --8<-- [start:series_struct]
-rating_Series = pl.Series(
+rating_series = pl.Series(
     "ratings",
     [
         {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5},
         {"Movie": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9},
     ],
 )
-print(rating_Series)
+print(rating_series)
 # --8<-- [end:series_struct]

 # --8<-- [start:series_struct_extract]
-out = rating_Series.struct.field("Movie")
+out = rating_series.struct.field("Movie")
 print(out)
 # --8<-- [end:series_struct_extract]

 # --8<-- [start:series_struct_rename]
 out = (
-    rating_Series.to_frame()
+    rating_series.to_frame()
     .select(pl.col("ratings").struct.rename_fields(["Film", "State", "Value"]))
     .unnest("ratings")
 )
diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md
index ee0012fe4db4..61978bbc25e7 100644
--- a/docs/user-guide/expressions/structs.md
+++ b/docs/user-guide/expressions/structs.md
@@ -45,7 +45,7 @@ Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`:

 !!! note "Constructing `Series` objects"

-    Note that `Series` here was constructed with the `name` of the series in the begninng, followed by the `values`. Providing the latter first
+    Note that `Series` here was constructed with the `name` of the series in the beginning, followed by the `values`. Providing the latter first
     is considered an anti-pattern in Polars, and must be avoided.

 ### Extracting individual values of a `Struct`
@@ -60,7 +60,7 @@ Let's say that we needed to obtain just the `movie` value in the `Series` that w

 ### Renaming individual keys of a `Struct`

-What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_Series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method:
+What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method:

 {{code_block('user-guide/expressions/structs','series_struct_rename',['struct.rename_fields'])}}

@@ -84,7 +84,7 @@ We can identify the unique cases at this level also with `is_unique`!

 ### Multi-column ranking

-Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. We define _Count_ of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. We can then do:
+Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. We define `Count` of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. We can then do:

 {{code_block('user-guide/expressions/structs','struct_ranking',['is_duplicated', 'struct'])}}
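For reference, the renamed docs snippet boils down to the following (a minimal sketch of the documented behavior):

```python
import polars as pl

# A list of dicts is interpreted by the Series constructor as a Struct
# column; the name comes first, the values second, as the docs note insists.
rating_series = pl.Series(
    "ratings",
    [{"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5}],
)
print(rating_series.struct.field("Movie"))  # extract a single field
```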
diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index ab62bb81f188..0152e66382d6 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -7408,10 +7408,19 @@ def partition_by(
         ]

         if as_dict:
-            if len(by) == 1:
-                return {df[by][0, 0]: df for df in partitions}
+            df = self._from_pydf(self._df)
+            if include_key:
+                if len(by) == 1:
+                    names = [p[by[0]][0] for p in partitions]
+                else:
+                    names = [p.select(by).row(0) for p in partitions]
             else:
-                return {df[by].row(0): df for df in partitions}
+                if len(by) == 1:
+                    names = df[by[0]].unique(maintain_order=True).to_list()
+                else:
+                    names = df.select(by).unique(maintain_order=True).rows()
+
+            return dict(zip(names, partitions))

         return partitions

diff --git a/py-polars/polars/functions/as_datatype.py b/py-polars/polars/functions/as_datatype.py
index d991131a33d7..14e4d19a23ac 100644
--- a/py-polars/polars/functions/as_datatype.py
+++ b/py-polars/polars/functions/as_datatype.py
@@ -185,6 +185,7 @@ def duration(
     minutes: Expr | str | int | None = None,
     hours: Expr | str | int | None = None,
     weeks: Expr | str | int | None = None,
+    time_unit: TimeUnit = "us",
 ) -> Expr:
     """
     Create polars `Duration` from distinct time components.
@@ -294,6 +295,7 @@ def duration(
             minutes,
             hours,
             weeks,
+            time_unit,
         )
     )
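A sketch of the fixed `partition_by` behavior (hypothetical data): the dictionary keys are now computed from the original frame, so `as_dict=True` works even when `include_key=False` strips the key columns out of each partition.

```python
import polars as pl

df = pl.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})

parts = df.partition_by("g", as_dict=True, include_key=False)
# Single-key names are scalars; multi-key names are tuples of row values.
print(parts["b"])  # the "b" partition, without the "g" column
```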
diff --git a/py-polars/src/functions/lazy.rs b/py-polars/src/functions/lazy.rs
index 574f824569b9..93017aa529ed 100644
--- a/py-polars/src/functions/lazy.rs
+++ b/py-polars/src/functions/lazy.rs
@@ -288,6 +288,7 @@ pub fn dtype_cols(dtypes: Vec<Wrap<DataType>>) -> PyResult<PyExpr> {

 #[allow(clippy::too_many_arguments)]
 #[pyfunction]
+#[pyo3(signature = (days, seconds, nanoseconds, microseconds, milliseconds, minutes, hours, weeks, time_unit))]
 pub fn duration(
     days: Option<PyExpr>,
     seconds: Option<PyExpr>,
@@ -297,6 +298,7 @@
     minutes: Option<PyExpr>,
     hours: Option<PyExpr>,
     weeks: Option<PyExpr>,
+    time_unit: Wrap<TimeUnit>,
 ) -> PyExpr {
     set_unwrapped_or_0!(
         days,
@@ -317,6 +319,7 @@
         minutes,
         hours,
         weeks,
+        time_unit: time_unit.0,
     };
     dsl::duration(args).into()
 }
diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
index cd22687b67e2..46ad72d71a34 100644
--- a/py-polars/tests/unit/dataframe/test_df.py
+++ b/py-polars/tests/unit/dataframe/test_df.py
@@ -2610,6 +2610,19 @@ def test_partition_by() -> None:
         "b": [1, 3],
     }

+    # test with both as_dict and include_key=False
+    df = pl.DataFrame(
+        {
+            "a": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
+            "b": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
+            "c": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
+            "d": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
+        }
+    ).sample(n=100_000, with_replacement=True, shuffle=True)
+
+    partitions = df.partition_by(["a", "b"], as_dict=True, include_key=False)
+    assert all(key == value.row(0) for key, value in partitions.items())
+

 def test_list_of_list_of_struct() -> None:
     expected = [{"list_of_list_of_struct": [[{"a": 1}, {"a": 2}]]}]
diff --git a/py-polars/tests/unit/functions/test_as_datatype.py b/py-polars/tests/unit/functions/test_as_datatype.py
index 6a92f0effd4f..bd5c8250dd9c 100644
--- a/py-polars/tests/unit/functions/test_as_datatype.py
+++ b/py-polars/tests/unit/functions/test_as_datatype.py
@@ -1,6 +1,6 @@
 from __future__ import annotations

-from datetime import date, datetime
+from datetime import date, datetime, timedelta
 from typing import TYPE_CHECKING

 import pytest
@@ -96,10 +96,36 @@ def test_time() -> None:

 def test_empty_duration() -> None:
     s = pl.DataFrame([], {"days": pl.Int32}).select(pl.duration(days="days"))
-    assert s.dtypes == [pl.Duration("ns")]
+    assert s.dtypes == [pl.Duration("us")]
     assert s.shape == (0, 1)


+@pytest.mark.parametrize(
+    ("time_unit", "expected"),
+    [
+        ("ms", timedelta(days=1, minutes=2, seconds=3, milliseconds=4)),
+        ("us", timedelta(days=1, minutes=2, seconds=3, milliseconds=4, microseconds=5)),
+        ("ns", timedelta(days=1, minutes=2, seconds=3, milliseconds=4, microseconds=5)),
+    ],
+)
+def test_duration_time_units(time_unit: TimeUnit, expected: timedelta) -> None:
+    result = pl.LazyFrame().select(
+        pl.duration(
+            days=1,
+            minutes=2,
+            seconds=3,
+            milliseconds=4,
+            microseconds=5,
+            nanoseconds=6,
+            time_unit=time_unit,
+        )
+    )
+    assert result.schema["duration"] == pl.Duration(time_unit)
+    assert result.collect()["duration"].item() == expected
+    if time_unit == "ns":
+        assert result.collect()["duration"].dt.nanoseconds().item() == 86523004005006
+
+
 def test_list_concat() -> None:
     s0 = pl.Series("a", [[1, 2]])
     s1 = pl.Series("b", [[3, 4, 5]])
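The constant in the `ns` branch of the new test can be verified by hand; summing the inputs in nanoseconds:

```python
# 1 day + 2 min + 3 s + 4 ms + 5 us + 6 ns, expressed in nanoseconds:
ns = (
    1 * 86_400 * 1_000_000_000  # days
    + 2 * 60 * 1_000_000_000  # minutes
    + 3 * 1_000_000_000  # seconds
    + 4 * 1_000_000  # milliseconds
    + 5 * 1_000  # microseconds
    + 6  # nanoseconds
)
assert ns == 86_523_004_005_006
```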
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index f69ec44e7b03..2919030a1a1f 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -728,6 +728,7 @@ def test_write_csv_delimiter() -> None:
     df.write_csv(f, delimiter_char="\t")
     f.seek(0)
     assert f.read() == b"a\tb\n1\t1\n2\t2\n3\t3\n"
+    assert_frame_equal(df, pl.read_csv(f, separator="\t"))


 def test_write_csv_line_terminator() -> None:
@@ -736,6 +737,7 @@ def test_write_csv_line_terminator() -> None:
     df.write_csv(f, line_terminator="\r\n")
     f.seek(0)
     assert f.read() == b"a,b\r\n1,1\r\n2,2\r\n3,3\r\n"
+    assert_frame_equal(df, pl.read_csv(f, eol_char="\n"))


 def test_escaped_null_values() -> None:
diff --git a/py-polars/tests/unit/namespaces/conftest.py b/py-polars/tests/unit/namespaces/conftest.py
new file mode 100644
index 000000000000..9aa3a121a35d
--- /dev/null
+++ b/py-polars/tests/unit/namespaces/conftest.py
@@ -0,0 +1,10 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+
+@pytest.fixture()
+def namespace_files_path() -> Path:
+    return Path(__file__).parent / "files"
diff --git a/py-polars/tests/unit/namespaces/test_tree_fmt.txt b/py-polars/tests/unit/namespaces/files/test_tree_fmt.txt
similarity index 100%
rename from py-polars/tests/unit/namespaces/test_tree_fmt.txt
rename to py-polars/tests/unit/namespaces/files/test_tree_fmt.txt
diff --git a/py-polars/tests/unit/namespaces/test_meta.py b/py-polars/tests/unit/namespaces/test_meta.py
index 782035807753..b8c79b056bd3 100644
--- a/py-polars/tests/unit/namespaces/test_meta.py
+++ b/py-polars/tests/unit/namespaces/test_meta.py
@@ -1,11 +1,14 @@
 from __future__ import annotations

-from pathlib import Path
+from typing import TYPE_CHECKING

 import pytest

 import polars as pl

+if TYPE_CHECKING:
+    from pathlib import Path
+

 def test_meta_pop_and_cmp() -> None:
     e = pl.col("foo").alias("bar")
@@ -70,10 +73,8 @@ def test_meta_is_regex_projection() -> None:
     assert e.meta.has_multiple_outputs()


-def test_meta_tree_format() -> None:
-    with Path(__file__).parent.joinpath("test_tree_fmt.txt").open(
-        "r", encoding="utf-8"
-    ) as f:
+def test_meta_tree_format(namespace_files_path: Path) -> None:
+    with (namespace_files_path / "test_tree_fmt.txt").open("r", encoding="utf-8") as f:
         test_sets = f.read().split("---")
     for test_set in test_sets:
         expression = test_set.strip().split("\n")[0]
diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py
index 956e6cc5d994..aa6b10d8f0f4 100644
--- a/py-polars/tests/unit/operations/test_group_by.py
+++ b/py-polars/tests/unit/operations/test_group_by.py
@@ -898,3 +898,19 @@ def test_groupby_dynamic_deprecated() -> None:
     expected = df.group_by_dynamic("date", every="2d").agg(pl.sum("value"))
     assert_frame_equal(result, expected, check_row_order=False)
     assert_frame_equal(result_lazy, expected, check_row_order=False)
+
+
+def test_group_by_multiple_keys_one_literal() -> None:
+    df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
+
+    expected = {"a": [1, 2], "literal": [1, 1], "b": [5, 6]}
+    for streaming in [True, False]:
+        assert (
+            df.lazy()
+            .group_by("a", pl.lit(1))
+            .agg(pl.col("b").max())
+            .sort(["a", "b"])
+            .collect(streaming=streaming)
+            .to_dict(False)
+            == expected
+        )
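How the new `conftest.py` fixture is consumed (a sketch; `test_some_namespace_file` is a hypothetical name — pytest injects any fixture from a parent `conftest.py` by argument name, exactly as the rewritten `test_meta_tree_format` above does):

```python
from pathlib import Path


def test_some_namespace_file(namespace_files_path: Path) -> None:
    # Resolves to py-polars/tests/unit/namespaces/files via the fixture.
    assert (namespace_files_path / "test_tree_fmt.txt").exists()
```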