Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Datetime(time_unit, time_zone) and Duration(time_unit) types #960

Merged
merged 39 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
121f6f8
WIP
FBruzzesi Sep 12, 2024
4896df2
order matters?
FBruzzesi Sep 12, 2024
cd2ed40
datetime test and polars fix
FBruzzesi Sep 13, 2024
eb1468e
rm NoneType
FBruzzesi Sep 13, 2024
e71f9c3
pandas pre 1.5
FBruzzesi Sep 13, 2024
32385d0
no cover backend version branch
FBruzzesi Sep 13, 2024
3abeaf8
add pytz to dev requirements for testing
FBruzzesi Sep 13, 2024
c5b7635
merge main
FBruzzesi Sep 13, 2024
4415e3c
xfail pyarrow table on windows
FBruzzesi Sep 13, 2024
5309d4f
Duration(time_unit)
FBruzzesi Sep 14, 2024
85fdd80
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 14, 2024
91bfb7a
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 14, 2024
20e36a1
add Datetime and Duration methods, as in polars
FBruzzesi Sep 14, 2024
ec1cb5e
downstream?
FBruzzesi Sep 14, 2024
2147ec6
revert
FBruzzesi Sep 14, 2024
0f69ec1
hash class only
FBruzzesi Sep 14, 2024
22836a0
else case no cover
FBruzzesi Sep 14, 2024
a1f56bc
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 15, 2024
a84480d
merge main
FBruzzesi Sep 21, 2024
80a574d
trigger ci
FBruzzesi Sep 23, 2024
916eac5
try making stable dtypes
MarcoGorelli Sep 25, 2024
e94b517
Merge remote-tracking branch 'upstream/main' into feat/time-zone-awar…
MarcoGorelli Sep 28, 2024
180b86e
broken, but getting there?
MarcoGorelli Sep 28, 2024
da884e8
Merge remote-tracking branch 'upstream/main' into feat/time-zone-awar…
MarcoGorelli Sep 29, 2024
114be74
fixup
MarcoGorelli Sep 29, 2024
587d917
reduce diff
MarcoGorelli Sep 29, 2024
dd050a8
stableify duration too
MarcoGorelli Sep 29, 2024
b4de1f7
test duration too
MarcoGorelli Sep 29, 2024
458f2a2
try removing pytz
MarcoGorelli Sep 29, 2024
34c27ef
try fix ci
MarcoGorelli Sep 29, 2024
0de71a6
try fix ci
MarcoGorelli Sep 29, 2024
d105911
try fix ci
MarcoGorelli Sep 29, 2024
a773d85
try fix ci
MarcoGorelli Sep 29, 2024
0149431
try fix ci
MarcoGorelli Sep 29, 2024
2249af0
allow s time unit
MarcoGorelli Sep 30, 2024
942a77b
test second resolution
MarcoGorelli Sep 30, 2024
ad38667
override duration time unit for pandas pre 2.0
MarcoGorelli Sep 30, 2024
38898a8
:label:
MarcoGorelli Sep 30, 2024
43da4c3
pre-2.0 pandas
MarcoGorelli Sep 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def translate_dtype(dtype: Any) -> dtypes.DType:
if pa.types.is_date32(dtype):
return dtypes.Date()
if pa.types.is_timestamp(dtype):
return dtypes.Datetime()
return dtypes.Datetime(time_unit=dtype.unit, time_zone=dtype.tz)
if pa.types.is_duration(dtype):
return dtypes.Duration()
if pa.types.is_dictionary(dtype):
Expand Down Expand Up @@ -88,8 +88,10 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any:
# with Polars for now
return pa.dictionary(pa.uint32(), pa.string())
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# Use Polars' default
return pa.timestamp("us")
time_unit = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)
return pa.timestamp(time_unit, tz=time_zone)

if isinstance_or_issubclass(dtype, dtypes.Duration):
# Use Polars' default
return pa.duration("us")
Expand Down
4 changes: 3 additions & 1 deletion narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,9 @@ def cast(
dtype: Any,
) -> Self:
ser = self._native_series
dtype = narwhals_to_native_dtype(dtype, ser.dtype, self._implementation)
dtype = narwhals_to_native_dtype(
dtype, ser.dtype, self._implementation, self._backend_version
)
return self._from_native_series(ser.astype(dtype))

def item(self: Self, index: int | None = None) -> Any:
Expand Down
47 changes: 36 additions & 11 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Literal
from typing import TypeVar

from narwhals.dependencies import get_cudf
Expand Down Expand Up @@ -221,6 +223,12 @@ def translate_dtype(column: Any) -> DType:
from narwhals import dtypes

dtype = column.dtype

pd_datetime_rgx = (
r"^datetime64\[(?P<time_unit>ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please try to break these 🙈


if str(dtype) in ("int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"):
return dtypes.Int64()
if str(dtype) in ("int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"):
Expand Down Expand Up @@ -264,16 +272,15 @@ def translate_dtype(column: Any) -> DType:
return dtypes.Boolean()
if str(dtype) in ("category",) or str(dtype).startswith("dictionary<"):
return dtypes.Categorical()
if str(dtype).startswith("datetime64"):
# TODO(Unassigned): different time units and time zones
return dtypes.Datetime()
if (match_ := re.match(pd_datetime_rgx, str(dtype))) or (
match_ := re.match(pa_datetime_rgx, str(dtype))
):
time_unit: Literal["us", "ns", "ms"] = match_.group("time_unit") # type: ignore[assignment]
time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(time_unit, time_zone)
if str(dtype).startswith("timedelta64") or str(dtype).startswith("duration"):
# TODO(Unassigned): different time units
return dtypes.Duration()
if str(dtype).startswith("timestamp["):
# pyarrow-backed datetime
# TODO(Unassigned): different time units and time zones
return dtypes.Datetime()
if str(dtype) == "date32[day][pyarrow]":
return dtypes.Date()
if str(dtype) == "object":
Expand Down Expand Up @@ -321,7 +328,10 @@ def get_dtype_backend(dtype: Any, implementation: Implementation) -> str:


def narwhals_to_native_dtype( # noqa: PLR0915
dtype: DType | type[DType], starting_dtype: Any, implementation: Implementation
dtype: DType | type[DType],
starting_dtype: Any,
implementation: Implementation,
backend_version: tuple[int, ...],
) -> Any:
from narwhals import dtypes

Expand Down Expand Up @@ -425,10 +435,25 @@ def narwhals_to_native_dtype( # noqa: PLR0915
# convert to it?
return "category"
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# TODO(Unassigned): different time units and time zones
time_unit = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)

# Pandas does not support "ms" or "us" time units before version 1.5.0
# Let's overwrite with "ns"
if implementation is Implementation.PANDAS and backend_version < (
1,
5,
0,
): # pragma: no cover
time_unit = "ns"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we can do much else here


if dtype_backend == "pyarrow-nullable":
return "timestamp[ns][pyarrow]"
return "datetime64[ns]"
tz_part = f", tz={time_zone}" if time_zone else ""
return f"timestamp[{time_unit}{tz_part}][pyarrow]"
else:
tz_part = f", {time_zone}" if time_zone else ""
return f"datetime64[{time_unit}{tz_part}]"

if isinstance_or_issubclass(dtype, dtypes.Duration):
# TODO(Unassigned): different time units and time zones
if dtype_backend == "pyarrow-nullable":
Expand Down
14 changes: 10 additions & 4 deletions narwhals/_polars/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import Any
from typing import Literal

from narwhals import dtypes
from narwhals.dependencies import get_polars
Expand Down Expand Up @@ -59,12 +60,14 @@ def translate_dtype(dtype: Any) -> dtypes.DType:
return dtypes.Categorical()
if dtype == pl.Enum:
return dtypes.Enum()
if dtype == pl.Datetime:
return dtypes.Datetime()
if dtype == pl.Duration:
return dtypes.Duration()
if dtype == pl.Date:
return dtypes.Date()
if dtype == pl.Datetime or isinstance(dtype, pl.Datetime):
time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)
return dtypes.Datetime(time_unit=time_unit, time_zone=time_zone)
return dtypes.Unknown()


Expand Down Expand Up @@ -103,10 +106,13 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any:
if dtype == dtypes.Enum:
msg = "Converting to Enum is not (yet) supported"
raise NotImplementedError(msg)
if dtype == dtypes.Datetime:
return pl.Datetime()
if dtype == dtypes.Duration:
return pl.Duration()
if dtype == dtypes.Date:
return pl.Date()
if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime):
time_unit = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)
return pl.Datetime(time_unit, time_zone)

return pl.Unknown() # pragma: no cover
36 changes: 35 additions & 1 deletion narwhals/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from datetime import timezone
from typing import TYPE_CHECKING
from typing import Literal

if TYPE_CHECKING:
from typing_extensions import Self
Expand Down Expand Up @@ -71,7 +73,39 @@ class Object(DType): ...
class Unknown(DType): ...


class Datetime(TemporalType): ...
class Datetime(TemporalType):
    """
    Data type for a calendar date combined with a time of day.

    Arguments:
        time_unit: Resolution of the stored values, one of `'ns'`, `'us'` or
            `'ms'`. Defaults to `'us'` (microseconds).
        time_zone: Optional time zone. Strings are stored unchanged (valid names
            are the zoneinfo ones, see `zoneinfo.available_timezones()`);
            `datetime.timezone` instances are stored as `str(time_zone)`.
            `None` (the default) is stored unchanged.

    Raises:
        ValueError: If `time_unit` is not one of the supported units.

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457

        NOTE(review): the upstream docstring also describes a "*" wildcard
        time zone for dtype matching; nothing in this class implements it, so
        confirm the comparison logic elsewhere before relying on it.
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms"] = "us",
        time_zone: str | timezone | None = None,
    ) -> None:
        # Reject unsupported resolutions before any state is set.
        is_supported = time_unit in {"ms", "us", "ns"}
        if not is_supported:
            raise ValueError(
                "invalid `time_unit`"
                f"\n\nExpected one of {{'ns','us','ms'}}, got {time_unit!r}."
            )

        self.time_unit = time_unit
        # Only `datetime.timezone` objects are converted; strings and None
        # pass through untouched.
        self.time_zone = (
            str(time_zone) if isinstance(time_zone, timezone) else time_zone
        )


class Duration(TemporalType): ...
Expand Down
9 changes: 7 additions & 2 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals.dataframe import LazyFrame
from narwhals.translate import from_native
from narwhals.utils import Implementation
from narwhals.utils import parse_version
from narwhals.utils import validate_laziness

# Missing type parameters for generic type "DataFrame"
Expand Down Expand Up @@ -215,7 +216,10 @@ def new_series(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

dtype = pandas_like_narwhals_to_native_dtype(dtype, None, implementation)
backend_version = parse_version(native_namespace.__version__)
dtype = pandas_like_narwhals_to_native_dtype(
dtype, None, implementation, backend_version
)
native_series = native_namespace.Series(values, name=name, dtype=dtype)

elif implementation is Implementation.PYARROW:
Expand Down Expand Up @@ -332,9 +336,10 @@ def from_dict(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

backend_version = parse_version(native_namespace.__version__)
schema = {
name: pandas_like_narwhals_to_native_dtype(
schema[name], native_type, implementation
schema[name], native_type, implementation, backend_version
)
for name, native_type in native_frame.dtypes.items()
}
Expand Down
25 changes: 25 additions & 0 deletions tests/dtypes_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from __future__ import annotations

from datetime import timezone
from typing import Literal

import pytest

import narwhals.stable.v1 as nw


@pytest.mark.parametrize("time_unit", ["us", "ns", "ms"])
@pytest.mark.parametrize("time_zone", ["Europe/Rome", timezone.utc, None])
def test_datetime_valid(
    time_unit: Literal["us", "ns", "ms"], time_zone: str | timezone | None
) -> None:
    """Each supported unit / time-zone combination builds a Datetime dtype."""
    created = nw.Datetime(time_unit=time_unit, time_zone=time_zone)

    # The unit is stored as given.
    assert created.time_unit == time_unit

    # The stored zone is either a string or None, never a timezone object.
    stored_zone = created.time_zone
    assert isinstance(stored_zone, str) or stored_zone is None


@pytest.mark.parametrize("time_unit", ["abc", "s"])
def test_datetime_invalid(time_unit: str) -> None:
    """Constructing a Datetime with an unsupported unit raises ValueError."""
    expected_message = "invalid `time_unit`"
    with pytest.raises(ValueError, match=expected_message):
        nw.Datetime(time_unit=time_unit)  # type: ignore[arg-type]
30 changes: 30 additions & 0 deletions tests/expr_and_series/cast_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
from __future__ import annotations

from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import Any

import pandas as pd
Expand All @@ -6,6 +11,7 @@

import narwhals.stable.v1 as nw
from narwhals.utils import parse_version
from tests.utils import compare_dicts

data = {
"a": [1],
Expand Down Expand Up @@ -179,3 +185,27 @@ class Banana:

with pytest.raises(AssertionError, match=r"Unknown dtype"):
df.select(nw.col("a").cast(Banana))


def test_cast_datetime_tz_aware(constructor: Any, request: Any) -> None:
if "dask" in str(constructor):
request.applymarker(pytest.mark.xfail)

data = {
"date": [
datetime(2024, 1, 1, tzinfo=timezone.utc) + timedelta(days=i)
for i in range(3)
]
}
expected = {
"date": ["2024-01-01 01:00:00", "2024-01-02 01:00:00", "2024-01-03 01:00:00"]
}

df = nw.from_native(constructor(data))
result = df.select(
nw.col("date")
.cast(nw.Datetime("ms", time_zone="Europe/Rome"))
.cast(nw.String())
.str.slice(offset=0, length=19)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

19: number of characters of "2024-01-01 01:00:00". The format right after that is different for each backend

)
compare_dicts(result, expected)
6 changes: 3 additions & 3 deletions tests/series_only/cast_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ def test_cast_date_datetime_pandas() -> None:
df = df.select(nw.col("a").cast(nw.Datetime))
result = nw.to_native(df)
expected = pd.DataFrame({"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}).astype(
{"a": "timestamp[ns][pyarrow]"}
{"a": "timestamp[us][pyarrow]"}
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changes the default to the polars one

)
pd.testing.assert_frame_equal(result, expected)

# pandas: pyarrow datetime to date
# pandas: pyarrow datetime to date
dfpd = pd.DataFrame({"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}).astype(
{"a": "timestamp[ns][pyarrow]"}
{"a": "timestamp[us][pyarrow]"}
)
df = nw.from_native(dfpd)
df = df.select(nw.col("a").cast(nw.Date))
Expand Down
Loading