Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Datetime(time_unit, time_zone) and Duration(time_unit) types #960

Merged
merged 39 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
121f6f8
WIP
FBruzzesi Sep 12, 2024
4896df2
order matters?
FBruzzesi Sep 12, 2024
cd2ed40
datetime test and polars fix
FBruzzesi Sep 13, 2024
eb1468e
rm NoneType
FBruzzesi Sep 13, 2024
e71f9c3
pandas pre 1.5
FBruzzesi Sep 13, 2024
32385d0
no cover backend version branch
FBruzzesi Sep 13, 2024
3abeaf8
add pytz to dev requirements for testing
FBruzzesi Sep 13, 2024
c5b7635
merge main
FBruzzesi Sep 13, 2024
4415e3c
xfail pyarrow table on windows
FBruzzesi Sep 13, 2024
5309d4f
Duration(time_unit)
FBruzzesi Sep 14, 2024
85fdd80
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 14, 2024
91bfb7a
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 14, 2024
20e36a1
add Datetime and Duration methods, as in polars
FBruzzesi Sep 14, 2024
ec1cb5e
downstream?
FBruzzesi Sep 14, 2024
2147ec6
revert
FBruzzesi Sep 14, 2024
0f69ec1
hash class only
FBruzzesi Sep 14, 2024
22836a0
else case no cover
FBruzzesi Sep 14, 2024
a1f56bc
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 15, 2024
a84480d
merge main
FBruzzesi Sep 21, 2024
80a574d
trigger ci
FBruzzesi Sep 23, 2024
916eac5
try making stable dtypes
MarcoGorelli Sep 25, 2024
e94b517
Merge remote-tracking branch 'upstream/main' into feat/time-zone-awar…
MarcoGorelli Sep 28, 2024
180b86e
broken, but getting there?
MarcoGorelli Sep 28, 2024
da884e8
Merge remote-tracking branch 'upstream/main' into feat/time-zone-awar…
MarcoGorelli Sep 29, 2024
114be74
fixup
MarcoGorelli Sep 29, 2024
587d917
reduce diff
MarcoGorelli Sep 29, 2024
dd050a8
stableify duration too
MarcoGorelli Sep 29, 2024
b4de1f7
test duration too
MarcoGorelli Sep 29, 2024
458f2a2
try removing pytz
MarcoGorelli Sep 29, 2024
34c27ef
try fix ci
MarcoGorelli Sep 29, 2024
0de71a6
try fix ci
MarcoGorelli Sep 29, 2024
d105911
try fix ci
MarcoGorelli Sep 29, 2024
a773d85
try fix ci
MarcoGorelli Sep 29, 2024
0149431
try fix ci
MarcoGorelli Sep 29, 2024
2249af0
allow s time unit
MarcoGorelli Sep 30, 2024
942a77b
test second resolution
MarcoGorelli Sep 30, 2024
ad38667
override duration time unit for pandas pre 2.0
MarcoGorelli Sep 30, 2024
38898a8
:label:
MarcoGorelli Sep 30, 2024
43da4c3
pre-2.0 pandas
MarcoGorelli Sep 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def translate_dtype(dtype: Any) -> dtypes.DType:
if pa.types.is_date32(dtype):
return dtypes.Date()
if pa.types.is_timestamp(dtype):
return dtypes.Datetime()
return dtypes.Datetime(time_unit=dtype.unit, time_zone=dtype.tz)
if pa.types.is_duration(dtype):
return dtypes.Duration()
return dtypes.Duration(time_unit=dtype.unit)
if pa.types.is_dictionary(dtype):
return dtypes.Categorical()
return dtypes.Unknown() # pragma: no cover
Expand Down Expand Up @@ -86,11 +86,12 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any:
if isinstance_or_issubclass(dtype, dtypes.Categorical):
return pa.dictionary(pa.uint32(), pa.string())
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# Use Polars' default
return pa.timestamp("us")
time_unit = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)
return pa.timestamp(time_unit, tz=time_zone)
if isinstance_or_issubclass(dtype, dtypes.Duration):
# Use Polars' default
return pa.duration("us")
time_unit = getattr(dtype, "time_unit", "us")
return pa.duration(time_unit)
if isinstance_or_issubclass(dtype, dtypes.Date):
return pa.date32()
msg = f"Unknown dtype: {dtype}" # pragma: no cover
Expand Down
4 changes: 3 additions & 1 deletion narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,9 @@ def cast(
dtype: Any,
) -> Self:
ser = self._native_series
dtype = narwhals_to_native_dtype(dtype, ser.dtype, self._implementation)
dtype = narwhals_to_native_dtype(
dtype, ser.dtype, self._implementation, self._backend_version
)
return self._from_native_series(ser.astype(dtype))

def item(self: Self, index: int | None = None) -> Any:
Expand Down
68 changes: 50 additions & 18 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Literal
from typing import TypeVar

from narwhals.dependencies import get_cudf
Expand Down Expand Up @@ -221,6 +223,15 @@ def translate_dtype(column: Any) -> DType:
from narwhals import dtypes

dtype = column.dtype

pd_datetime_rgx = (
r"^datetime64\[(?P<time_unit>ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please try to break these 🙈


pd_duration_rgx = r"^timedelta64\[(?P<time_unit>ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>ms|us|ns)\]\[pyarrow\]$"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pandas / pyarrow support 'second' time unit, I think that should be allowed to pass through

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just by passing it along or doing manipulation for the user?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we can just pass it through - adding a commit soon


if str(dtype) in ("int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"):
return dtypes.Int64()
if str(dtype) in ("int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"):
Expand Down Expand Up @@ -264,16 +275,17 @@ def translate_dtype(column: Any) -> DType:
return dtypes.Boolean()
if str(dtype) in ("category",) or str(dtype).startswith("dictionary<"):
return dtypes.Categorical()
if str(dtype).startswith("datetime64"):
# TODO(Unassigned): different time units and time zones
return dtypes.Datetime()
if str(dtype).startswith("timedelta64") or str(dtype).startswith("duration"):
# TODO(Unassigned): different time units
return dtypes.Duration()
if str(dtype).startswith("timestamp["):
# pyarrow-backed datetime
# TODO(Unassigned): different time units and time zones
return dtypes.Datetime()
if (match_ := re.match(pd_datetime_rgx, str(dtype))) or (
match_ := re.match(pa_datetime_rgx, str(dtype))
):
dt_time_unit: Literal["us", "ns", "ms"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, str(dtype))) or (
match_ := re.match(pa_duration_rgx, str(dtype))
):
du_time_unit: Literal["us", "ns", "ms"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
if str(dtype) == "date32[day][pyarrow]":
return dtypes.Date()
if str(dtype) == "object":
Expand Down Expand Up @@ -321,7 +333,10 @@ def get_dtype_backend(dtype: Any, implementation: Implementation) -> str:


def narwhals_to_native_dtype( # noqa: PLR0915
dtype: DType | type[DType], starting_dtype: Any, implementation: Implementation
dtype: DType | type[DType],
starting_dtype: Any,
implementation: Implementation,
backend_version: tuple[int, ...],
) -> Any:
from narwhals import dtypes

Expand Down Expand Up @@ -425,15 +440,32 @@ def narwhals_to_native_dtype( # noqa: PLR0915
# convert to it?
return "category"
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# TODO(Unassigned): different time units and time zones
dt_time_unit = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)

# Pandas does not support "ms" or "us" time units before version 1.5.0
# Let's overwrite with "ns"
if implementation is Implementation.PANDAS and backend_version < (
1,
5,
0,
): # pragma: no cover
dt_time_unit = "ns"
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved

if dtype_backend == "pyarrow-nullable":
return "timestamp[ns][pyarrow]"
return "datetime64[ns]"
tz_part = f", tz={dt_time_zone}" if dt_time_zone else ""
return f"timestamp[{dt_time_unit}{tz_part}][pyarrow]"
else:
tz_part = f", {dt_time_zone}" if dt_time_zone else ""
return f"datetime64[{dt_time_unit}{tz_part}]"
if isinstance_or_issubclass(dtype, dtypes.Duration):
# TODO(Unassigned): different time units and time zones
if dtype_backend == "pyarrow-nullable":
return "duration[ns][pyarrow]"
return "timedelta64[ns]"
du_time_unit = getattr(dtype, "time_unit", "us")
return (
f"duration[{du_time_unit}][pyarrow]"
if dtype_backend == "pyarrow-nullable"
else f"timedelta64[{du_time_unit}]"
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to do the same pre-1.5.0 check here


if isinstance_or_issubclass(dtype, dtypes.Date):
if dtype_backend == "pyarrow-nullable":
return "date32[pyarrow]"
Expand Down
24 changes: 16 additions & 8 deletions narwhals/_polars/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import Any
from typing import Literal

from narwhals import dtypes
from narwhals.dependencies import get_polars
Expand Down Expand Up @@ -59,12 +60,15 @@ def translate_dtype(dtype: Any) -> dtypes.DType:
return dtypes.Categorical()
if dtype == pl.Enum:
return dtypes.Enum()
if dtype == pl.Datetime:
return dtypes.Datetime()
if dtype == pl.Duration:
return dtypes.Duration()
if dtype == pl.Date:
return dtypes.Date()
if dtype == pl.Datetime or isinstance(dtype, pl.Datetime):
dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)
return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone)
if dtype == pl.Duration or isinstance(dtype, pl.Duration):
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return dtypes.Duration(time_unit=du_time_unit)
return dtypes.Unknown()


Expand Down Expand Up @@ -103,10 +107,14 @@ def narwhals_to_native_dtype(dtype: dtypes.DType | type[dtypes.DType]) -> Any:
if dtype == dtypes.Enum:
msg = "Converting to Enum is not (yet) supported"
raise NotImplementedError(msg)
if dtype == dtypes.Datetime:
return pl.Datetime()
if dtype == dtypes.Duration:
return pl.Duration()
if dtype == dtypes.Date:
return pl.Date()
if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime):
dt_time_unit = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)
return pl.Datetime(dt_time_unit, dt_time_zone)
if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration):
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return pl.Duration(time_unit=du_time_unit)

return pl.Unknown() # pragma: no cover
97 changes: 93 additions & 4 deletions narwhals/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from datetime import timezone
from typing import TYPE_CHECKING
from typing import Literal

if TYPE_CHECKING:
from typing_extensions import Self
Expand Down Expand Up @@ -71,10 +73,97 @@ class Object(DType): ...
class Unknown(DType): ...


class Datetime(TemporalType): ...


class Duration(TemporalType): ...
class Datetime(TemporalType):
    """
    Data type for a point in time (calendar date plus time of day).

    Arguments:
        time_unit: Resolution of the stored values; one of `'ns'`, `'us'` or
            `'ms'`. Defaults to `'us'` (microseconds).
        time_zone: Optional time zone. A `datetime.timezone` instance is
            converted to its string form; a string or `None` is stored as given.

    Raises:
        ValueError: If `time_unit` is not one of the accepted units.

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457

        NOTE(review): the Polars docstring this was adapted from mentions a
        `"*"` wildcard time zone for dtype matching. `__eq__` below compares
        time zones literally - confirm whether wildcard matching is handled
        elsewhere before documenting it as supported.
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms"] = "us",
        time_zone: str | timezone | None = None,
    ) -> None:
        if time_unit not in {"ms", "us", "ns"}:
            msg = (
                "invalid `time_unit`"
                f"\n\nExpected one of {{'ns','us','ms'}}, got {time_unit!r}."
            )
            raise ValueError(msg)

        self.time_unit = time_unit
        # Normalise `datetime.timezone` objects to their string representation
        # so that the stored attribute is always `str | None`.
        self.time_zone = (
            str(time_zone) if isinstance(time_zone, timezone) else time_zone
        )

    def __eq__(self: Self, other: object) -> bool:
        # Instance vs. instance: every parameter has to agree.
        if isinstance(other, Datetime):
            same_unit = self.time_unit == other.time_unit
            return same_unit and self.time_zone == other.time_zone
        # Instance vs. class: any Datetime instance equals the bare
        # `Datetime` class (or a subclass of it); everything else is unequal.
        return type(other) is type and issubclass(other, Datetime)

    def __hash__(self: Self) -> int:  # pragma: no cover
        key = (self.__class__, self.time_unit, self.time_zone)
        return hash(key)

    def __repr__(self: Self) -> str:  # pragma: no cover
        return (
            f"{self.__class__.__name__}"
            f"(time_unit={self.time_unit!r}, time_zone={self.time_zone!r})"
        )


class Duration(TemporalType):
    """
    Data type for an elapsed amount of time.

    Arguments:
        time_unit: Resolution of the stored values; one of `'ns'`, `'us'` or
            `'ms'`. Defaults to `'us'` (microseconds).

    Raises:
        ValueError: If `time_unit` is not one of the accepted units.

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms"] = "us",
    ) -> None:
        if time_unit not in ("ms", "us", "ns"):
            msg = (
                "invalid `time_unit`"
                f"\n\nExpected one of {{'ns','us','ms'}}, got {time_unit!r}."
            )
            raise ValueError(msg)

        self.time_unit = time_unit

    def __eq__(self: Self, other: object) -> bool:
        # Instance vs. instance: the time units have to agree.
        if isinstance(other, Duration):
            return self.time_unit == other.time_unit
        # Instance vs. class: any Duration instance equals the bare
        # `Duration` class (or a subclass of it); everything else is unequal.
        return type(other) is type and issubclass(other, Duration)

    def __hash__(self: Self) -> int:  # pragma: no cover
        key = (self.__class__, self.time_unit)
        return hash(key)

    def __repr__(self: Self) -> str:  # pragma: no cover
        return f"{self.__class__.__name__}(time_unit={self.time_unit!r})"


class Categorical(DType): ...
Expand Down
9 changes: 7 additions & 2 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals.dataframe import LazyFrame
from narwhals.translate import from_native
from narwhals.utils import Implementation
from narwhals.utils import parse_version
from narwhals.utils import validate_laziness

# Missing type parameters for generic type "DataFrame"
Expand Down Expand Up @@ -215,7 +216,10 @@ def new_series(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

dtype = pandas_like_narwhals_to_native_dtype(dtype, None, implementation)
backend_version = parse_version(native_namespace.__version__)
dtype = pandas_like_narwhals_to_native_dtype(
dtype, None, implementation, backend_version
)
native_series = native_namespace.Series(values, name=name, dtype=dtype)

elif implementation is Implementation.PYARROW:
Expand Down Expand Up @@ -332,9 +336,10 @@ def from_dict(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

backend_version = parse_version(native_namespace.__version__)
schema = {
name: pandas_like_narwhals_to_native_dtype(
schema[name], native_type, implementation
schema[name], native_type, implementation, backend_version
)
for name, native_type in native_frame.dtypes.items()
}
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pyarrow
pytest
pytest-cov
pytest-env
pytz
hypothesis
scikit-learn
typing_extensions
Expand Down
47 changes: 47 additions & 0 deletions tests/dtypes_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from __future__ import annotations

from datetime import timezone
from typing import Literal

import pytest

import narwhals.stable.v1 as nw


@pytest.mark.parametrize("time_unit", ["us", "ns", "ms"])
@pytest.mark.parametrize("time_zone", ["Europe/Rome", timezone.utc, None])
def test_datetime_valid(
    time_unit: Literal["us", "ns", "ms"], time_zone: str | timezone | None
) -> None:
    """Datetime equality holds for identical parameters and for the bare class,
    and fails when the time zone or time unit differs."""
    subject = nw.Datetime(time_unit=time_unit, time_zone=time_zone)
    rebuilt = nw.Datetime(time_unit=time_unit, time_zone=time_zone)

    assert subject == rebuilt
    assert subject == nw.Datetime

    if time_zone:
        # A zone-aware dtype must differ from the zone-less one of the same unit.
        without_zone = nw.Datetime(time_unit=time_unit)
        assert subject != without_zone
    if time_unit != "ms":
        # A different time unit must break equality.
        millisecond = nw.Datetime(time_unit="ms")
        assert subject != millisecond


@pytest.mark.parametrize("time_unit", ["abc", "s"])
def test_datetime_invalid(time_unit: str) -> None:
    """Constructing Datetime with an unsupported time unit raises ValueError."""
    rejects_unit = pytest.raises(ValueError, match="invalid `time_unit`")
    with rejects_unit:
        nw.Datetime(time_unit=time_unit)  # type: ignore[arg-type]


@pytest.mark.parametrize("time_unit", ["us", "ns", "ms"])
def test_duration_valid(time_unit: Literal["us", "ns", "ms"]) -> None:
    """Duration equality holds for identical time units and for the bare class,
    and fails when the time unit differs."""
    subject = nw.Duration(time_unit=time_unit)
    rebuilt = nw.Duration(time_unit=time_unit)

    assert subject == rebuilt
    assert subject == nw.Duration

    if time_unit != "ms":
        # A different time unit must break equality.
        millisecond = nw.Duration(time_unit="ms")
        assert subject != millisecond


@pytest.mark.parametrize("time_unit", ["abc", "s"])
def test_duration_invalid(time_unit: str) -> None:
    """Constructing Duration with an unsupported time unit raises ValueError."""
    rejects_unit = pytest.raises(ValueError, match="invalid `time_unit`")
    with rejects_unit:
        nw.Duration(time_unit=time_unit)  # type: ignore[arg-type]
Loading
Loading