Skip to content

Commit

Permalink
feat: convert to given time zone in .str.to_datetime when values are…
Browse files Browse the repository at this point in the history
… offset-aware (#16742)

Co-authored-by: Stijn de Gooijer <[email protected]>
  • Loading branch information
MarcoGorelli and stinodego authored Jun 10, 2024
1 parent eaedf74 commit c9cad0a
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 53 deletions.
20 changes: 17 additions & 3 deletions crates/polars-core/src/chunked_array/temporal/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,31 @@ impl DatetimeChunked {
}

/// Change the underlying [`TimeUnit`]. This does not modify the data.
pub fn set_time_unit(&mut self, tu: TimeUnit) {
self.2 = Some(Datetime(tu, self.time_zone().clone()))
pub fn set_time_unit(&mut self, time_unit: TimeUnit) {
self.2 = Some(Datetime(time_unit, self.time_zone().clone()))
}

/// Change the underlying [`TimeZone`]. This does not modify the data.
///
/// The time zone is NOT validated here - it's up to the caller to verify that it
/// has already been validated (e.g. with `validate_time_zone`).
///
/// The `PolarsResult` return type is kept so that existing call sites using `?`
/// keep compiling; this method itself always returns `Ok(())`.
#[cfg(feature = "timezones")]
pub fn set_time_zone(&mut self, time_zone: TimeZone) -> PolarsResult<()> {
    // Keep the current time unit; only the time zone part of the metadata changes.
    self.2 = Some(Datetime(self.time_unit(), Some(time_zone)));
    Ok(())
}

/// Set both the [`TimeUnit`] and the [`TimeZone`] in a single step. This does not
/// modify the data.
///
/// No time zone validation is performed here - the caller is responsible for having
/// validated it beforehand. This method always returns `Ok(())`.
#[cfg(feature = "timezones")]
pub fn set_time_unit_and_time_zone(
    &mut self,
    time_unit: TimeUnit,
    time_zone: TimeZone,
) -> PolarsResult<()> {
    // Build the replacement metadata first, then store it in one assignment.
    let dtype = Datetime(time_unit, Some(time_zone));
    self.2 = Some(dtype);
    Ok(())
}
}

#[cfg(test)]
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/temporal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ static FIXED_OFFSET_PATTERN: &str = r#"(?x)
static FIXED_OFFSET_RE: Lazy<Regex> = Lazy::new(|| Regex::new(FIXED_OFFSET_PATTERN).unwrap());

#[cfg(feature = "timezones")]
pub(crate) fn validate_time_zone(tz: &str) -> PolarsResult<()> {
pub fn validate_time_zone(tz: &str) -> PolarsResult<()> {
match tz.parse::<Tz>() {
Ok(_) => Ok(()),
Err(_) => {
Expand Down
3 changes: 3 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS, SECON
#[cfg(feature = "timezones")]
use chrono_tz::Tz;
#[cfg(feature = "timezones")]
use polars_core::chunked_array::temporal::validate_time_zone;
#[cfg(feature = "timezones")]
use polars_time::base_utc_offset as base_utc_offset_fn;
#[cfg(feature = "timezones")]
use polars_time::dst_offset as dst_offset_fn;
Expand Down Expand Up @@ -343,6 +345,7 @@ pub(super) fn convert_time_zone(s: &Series, time_zone: &TimeZone) -> PolarsResul
match s.dtype() {
DataType::Datetime(_, _) => {
let mut ca = s.datetime()?.clone();
validate_time_zone(time_zone)?;
ca.set_time_zone(time_zone.clone())?;
Ok(ca.into_series())
},
Expand Down
30 changes: 12 additions & 18 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,22 @@ use std::borrow::Cow;
use arrow::legacy::utils::CustomIterTools;
#[cfg(feature = "timezones")]
use once_cell::sync::Lazy;
#[cfg(feature = "regex")]
use regex::{escape, Regex};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "timezones")]
static TZ_AWARE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap());

use polars_core::chunked_array::temporal::validate_time_zone;
use polars_core::utils::handle_casting_failures;
#[cfg(feature = "dtype-struct")]
use polars_utils::format_smartstring;
use regex::{escape, Regex};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use super::*;
use crate::{map, map_as_slice};

#[cfg(feature = "timezones")]
static TZ_AWARE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(%z)|(%:z)|(%::z)|(%:::z)|(%#z)|(^%\+$)").unwrap());

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub enum StringFunction {
Expand Down Expand Up @@ -652,16 +652,10 @@ fn to_datetime(
Some(format) => TZ_AWARE_RE.is_match(format),
_ => false,
};
if let (Some(tz), true) = (time_zone, tz_aware) {
if tz != "UTC" {
polars_bail!(
ComputeError:
"if using strftime/to_datetime with a time-zone-aware format, the output will be in UTC. Please either drop the time zone from the function call, or set it to UTC. \
If you are trying to convert the output to a different time zone, please use `convert_time_zone`."
)
}
};

#[cfg(feature = "timezones")]
if let Some(time_zone) = time_zone {
validate_time_zone(time_zone)?;
}
let out = if options.exact {
datetime_strings
.as_datetime(
Expand Down
21 changes: 6 additions & 15 deletions crates/polars-time/src/chunkedarray/string/infer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -452,25 +452,16 @@ pub(crate) fn to_datetime(
.find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
.ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;
if pattern == Pattern::DatetimeYMDZ
&& tz.is_some()
&& tz.map(|x| x.as_str()) != Some("UTC")
{
polars_bail!(ComputeError: "offset-aware datetimes are converted to UTC. \
Please either drop the time zone from the function call, or set it to UTC. \
To convert to a different time zone, please use `convert_time_zone`.")
}
match pattern {
#[cfg(feature = "timezones")]
Pattern::DatetimeYMDZ => infer.coerce_string(ca).datetime().map(|ca| {
let mut ca = ca.clone();
ca.set_time_unit(tu);
polars_ops::prelude::replace_time_zone(
&ca,
Some("UTC"),
_ambiguous,
NonExistent::Raise,
)
// `tz` has already been validated.
ca.set_time_unit_and_time_zone(
tu,
tz.cloned().unwrap_or_else(|| "UTC".to_string()),
)?;
Ok(ca)
})?,
_ => infer.coerce_string(ca).datetime().map(|ca| {
let mut ca = ca.clone();
Expand Down
7 changes: 5 additions & 2 deletions crates/polars-time/src/chunkedarray/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ pub trait StringMethods: AsString {
NonExistent::Raise,
),
#[cfg(feature = "timezones")]
(true, _) => Ok(ca.into_datetime(tu, Some("UTC".to_string()))),
(true, tz) => Ok(ca.into_datetime(tu, tz.cloned().or_else(|| Some("UTC".to_string())))),
_ => Ok(ca.into_datetime(tu, None)),
}
}
Expand Down Expand Up @@ -305,7 +305,10 @@ pub trait StringMethods: AsString {
Ok(string_ca
.apply_generic(|opt_s| convert.eval(opt_s?, use_cache))
.with_name(string_ca.name())
.into_datetime(tu, Some("UTC".to_string())))
.into_datetime(
tu,
Some(tz.map(|x| x.to_string()).unwrap_or("UTC".to_string())),
))
}
#[cfg(not(feature = "timezones"))]
{
Expand Down
12 changes: 11 additions & 1 deletion py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,17 @@ def to_datetime(
`"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
found, the default is `"us"`.
time_zone
Time zone for the resulting Datetime column.
Time zone for the resulting Datetime column. Rules are:
- If inputs are tz-naive and `time_zone` is None, the result time zone is
`None`.
- If inputs are offset-aware and `time_zone` is None, inputs are converted
to `'UTC'` and the result time zone is `'UTC'`.
- If inputs are offset-aware and `time_zone` is given, inputs are converted
to `time_zone` and the result time zone is `time_zone`.
- If inputs are tz-naive and `time_zone` is given, the values are interpreted as
local times in `time_zone` (the time zone is set, not converted to!), and the
result time zone is `time_zone`.
strict
Raise an error if any conversion fails.
exact
Expand Down
12 changes: 11 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,17 @@ def to_datetime(
`"%F %T%.3f"` => `Datetime("ms")`. If no fractional second component is
found, the default is `"us"`.
time_zone
Time zone for the resulting Datetime column.
Time zone for the resulting Datetime column. Rules are:
- If inputs are tz-naive and `time_zone` is None, the result time zone is
`None`.
- If inputs are offset-aware and `time_zone` is None, inputs are converted
to `'UTC'` and the result time zone is `'UTC'`.
- If inputs are offset-aware and `time_zone` is given, inputs are converted
to `time_zone` and the result time zone is `time_zone`.
- If inputs are tz-naive and `time_zone` is given, the values are interpreted as
local times in `time_zone` (the time zone is set, not converted to!), and the
result time zone is `time_zone`.
strict
Raise an error if any conversion fails.
exact
Expand Down
12 changes: 7 additions & 5 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import polars.selectors as cs
from polars._utils.construction import iterable_to_pydf
from polars.datatypes import DTYPE_TEMPORAL_UNITS, INTEGER_DTYPES
from polars.exceptions import ComputeError, TimeZoneAwareConstructorWarning
from polars.exceptions import TimeZoneAwareConstructorWarning
from polars.testing import (
assert_frame_equal,
assert_frame_not_equal,
Expand Down Expand Up @@ -2502,10 +2502,12 @@ def test_init_vs_strptime_consistency_raises() -> None:
[datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],
dtype=pl.Datetime("us", "US/Pacific"),
)
with pytest.raises(ComputeError, match=msg):
pl.Series(["2020-01-01 00:00-08:00"]).str.strptime(
pl.Datetime("us", "US/Pacific")
)
result = (
pl.Series(["2020-01-01 00:00-08:00"])
.str.strptime(pl.Datetime("us", "US/Pacific"))
.item()
)
assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))


def test_init_physical_with_timezone() -> None:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/datatypes/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,7 +1205,7 @@ def test_strptime_with_invalid_tz() -> None:
pl.Series(["2020-01-01 03:00:00"]).str.strptime(pl.Datetime("us", "foo"))
with pytest.raises(
ComputeError,
match="Please either drop the time zone from the function call, or set it to UTC",
match="unable to parse time zone: 'foo'",
):
pl.Series(["2020-01-01 03:00:00+01:00"]).str.strptime(
pl.Datetime("us", "foo"), "%Y-%m-%d %H:%M:%S%z"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import sys
from datetime import datetime
from typing import TYPE_CHECKING

Expand All @@ -8,7 +9,16 @@
from hypothesis import given

import polars as pl
from polars.dependencies import _ZONEINFO_AVAILABLE
from polars.exceptions import ComputeError
from polars.testing import assert_series_equal

if sys.version_info >= (3, 9):
from zoneinfo import ZoneInfo
elif _ZONEINFO_AVAILABLE:
# Import from submodule due to typing issue with backports.zoneinfo package:
# https://github.com/pganssle/zoneinfo/issues/125
from backports.zoneinfo._zoneinfo import ZoneInfo

if TYPE_CHECKING:
from hypothesis.strategies import DrawFn
Expand Down Expand Up @@ -152,3 +162,34 @@ def test_cast_to_time_and_combine(d: datetime, tu: TimeUnit) -> None:
assert [d.date() for d in datetimes] == res["dt"].to_list()
assert [d.time() for d in datetimes] == res["tm"].to_list()
assert datetimes == res["dtm"].to_list()


def test_to_datetime_aware_values_aware_dtype() -> None:
s = pl.Series(["2020-01-01T01:12:34+01:00"])
expected = pl.Series([datetime(2020, 1, 1, 5, 57, 34)]).dt.replace_time_zone(
"Asia/Kathmandu"
)

# When Polars infers the format
result = s.str.to_datetime(time_zone="Asia/Kathmandu")
assert_series_equal(result, expected)

# When the format is provided
result = s.str.to_datetime(format="%Y-%m-%dT%H:%M:%S%z", time_zone="Asia/Kathmandu")
assert_series_equal(result, expected)

# With `exact=False`
result = s.str.to_datetime(
format="%Y-%m-%dT%H:%M:%S%z", time_zone="Asia/Kathmandu", exact=False
)
assert_series_equal(result, expected)

# Check consistency with Series constructor
# TODO: remove `raises`, after https://github.com/pola-rs/polars/pull/16828.
with pytest.raises(ValueError, match="Please either drop"):
result = pl.Series(
[datetime(2020, 1, 1, 5, 57, 34, tzinfo=ZoneInfo("Asia/Kathmandu"))],
dtype=pl.Datetime("us", "Asia/Kathmandu"),
)
# TODO: uncomment, after https://github.com/pola-rs/polars/pull/16828.
# assert_series_equal(result, expected)
13 changes: 7 additions & 6 deletions py-polars/tests/unit/operations/namespaces/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,12 +299,13 @@ def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None:
assert result.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)


def test_infer_tz_aware_raises() -> None:
msg = "Please either drop the time zone from the function call, or set it to UTC"
with pytest.raises(ComputeError, match=msg):
pl.Series(["2020-01-02T04:00:00+02:00"]).str.to_datetime(
time_unit="us", time_zone="Europe/Vienna"
)
def test_str_to_datetime_infer_tz_aware() -> None:
result = (
pl.Series(["2020-01-02T04:00:00+02:00"])
.str.to_datetime(time_unit="us", time_zone="Europe/Vienna")
.item()
)
assert result == datetime(2020, 1, 2, 3, tzinfo=ZoneInfo("Europe/Vienna"))


@pytest.mark.parametrize(
Expand Down

0 comments on commit c9cad0a

Please sign in to comment.