From 99b4a6989cff19d16b0af61775d4d3bde325a5f8 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 4 Nov 2024 07:41:40 +0000 Subject: [PATCH] fix: Sortedness was incorrectly being preserved in dt.offset_by when offsetting by non-constant durations in the timezone-naive case (#19616) --- crates/polars-time/src/offset_by.rs | 44 +++++++++---------- .../namespaces/temporal/test_datetime.py | 6 +-- .../namespaces/temporal/test_offset_by.py | 29 +++++++++++- 3 files changed, 53 insertions(+), 26 deletions(-) diff --git a/crates/polars-time/src/offset_by.rs b/crates/polars-time/src/offset_by.rs index 00485a59d50d..dcdd1e1666ff 100644 --- a/crates/polars-time/src/offset_by.rs +++ b/crates/polars-time/src/offset_by.rs @@ -58,20 +58,35 @@ fn apply_offsets_to_datetime( } pub fn impl_offset_by(ts: &Series, offsets: &Series) -> PolarsResult { - let preserve_sortedness: bool; let offsets = offsets.str()?; - let out = match ts.dtype() { + let dtype = ts.dtype(); + + // Sortedness may not be preserved for non-constant durations, + // see https://github.com/pola-rs/polars/issues/19608 for a counterexample. + // Constant durations (e.g. 2 hours) always preserve sortedness. + let tz = match dtype { + DataType::Date => None, + DataType::Datetime(_, tz) => tz.clone(), + _ => polars_bail!(InvalidOperation: "expected Date or Datetime, got {}", dtype), + }; + let preserve_sortedness = match offsets.len() { + 1 => match offsets.get(0) { + Some(offset) => { + let offset = Duration::parse(offset); + offset.is_constant_duration(tz.as_deref()) + }, + None => false, + }, + _ => false, + }; + + let out = match dtype { DataType::Date => { let ts = ts .cast(&DataType::Datetime(TimeUnit::Milliseconds, None)) .unwrap(); let datetime = ts.datetime().unwrap(); let out = apply_offsets_to_datetime(datetime, offsets, None)?; - // sortedness is only guaranteed to be preserved if a constant offset is being added to every datetime - preserve_sortedness = match offsets.len() { - 1 => offsets.get(0).is_some(), - _ => false, - }; out.cast(&DataType::Datetime(TimeUnit::Milliseconds, None)) .unwrap() .cast(&DataType::Date) @@ -86,21 +101,6 @@ pub fn impl_offset_by(ts: &Series, offsets: &Series) -> PolarsResult { }, _ => apply_offsets_to_datetime(datetime, offsets, None)?, }; - // Sortedness may not be preserved when crossing daylight savings time boundaries - // for calendar-aware durations. - // Constant durations (e.g. 2 hours) always preserve sortedness. - preserve_sortedness = match offsets.len() { - 1 => match offsets.get(0) { - Some(offset) => { - let offset = Duration::parse(offset); - tz.is_none() - || tz.as_deref() == Some("UTC") - || offset.is_constant_duration(tz.as_deref()) - }, - None => false, - }, - _ => false, - }; out.cast(&DataType::Datetime(*tu, tz.clone())) }, dt => polars_bail!( diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py index 6bbd10828496..5852d31aed52 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py @@ -178,9 +178,9 @@ def test_local_time_before_epoch(time_unit: TimeUnit) -> None: (None, "1d", True), ("Europe/London", "1d", False), ("UTC", "1d", True), - (None, "1mo", True), - ("Europe/London", "1mo", False), - ("UTC", "1mo", True), + (None, "1m", True), + ("Europe/London", "1m", True), + ("UTC", "1m", True), (None, "1w", True), ("Europe/London", "1w", False), ("UTC", "1w", True), diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_offset_by.py b/py-polars/tests/unit/operations/namespaces/temporal/test_offset_by.py index 79adde1e7b81..478025cf9a83 100644 --- a/py-polars/tests/unit/operations/namespaces/temporal/test_offset_by.py +++ b/py-polars/tests/unit/operations/namespaces/temporal/test_offset_by.py @@ -6,7 +6,7 @@ import pytest import polars as pl -from polars.testing import assert_series_equal +from polars.testing import assert_frame_equal, assert_series_equal if TYPE_CHECKING: from polars._typing import TimeUnit @@ -116,3 +116,30 @@ def test_datetime_offset_by( time_zone ) assert_series_equal(result, expected) + + +def test_offset_by_unique_29_feb_19608() -> None: + df20 = pl.select( + t=pl.datetime_range( + pl.datetime(2020, 2, 28), + pl.datetime(2020, 3, 1), + closed="left", + time_unit="ms", + interval="8h", + time_zone="UTC", + ), + ).with_columns(x=pl.int_range(pl.len())) + df19 = df20.with_columns(pl.col("t").dt.offset_by("-1y")) + result = df19.unique("t", keep="first").sort("t") + expected = pl.DataFrame( + { + "t": [ + datetime(2019, 2, 28), + datetime(2019, 2, 28, 8), + datetime(2019, 2, 28, 16), + ], + "x": [0, 1, 2], + }, + schema_overrides={"t": pl.Datetime("ms", "UTC")}, + ) + assert_frame_equal(result, expected)