Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Datetime(time_unit, time_zone) and Duration(time_unit) types #960

Merged
merged 39 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
121f6f8
WIP
FBruzzesi Sep 12, 2024
4896df2
order matters?
FBruzzesi Sep 12, 2024
cd2ed40
datetime test and polars fix
FBruzzesi Sep 13, 2024
eb1468e
rm NoneType
FBruzzesi Sep 13, 2024
e71f9c3
pandas pre 1.5
FBruzzesi Sep 13, 2024
32385d0
no cover backend version branch
FBruzzesi Sep 13, 2024
3abeaf8
add pytz to dev requirements for testing
FBruzzesi Sep 13, 2024
c5b7635
merge main
FBruzzesi Sep 13, 2024
4415e3c
xfail pyarrow table on windows
FBruzzesi Sep 13, 2024
5309d4f
Duration(time_unit)
FBruzzesi Sep 14, 2024
85fdd80
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 14, 2024
91bfb7a
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 14, 2024
20e36a1
add Datetime and Duration methods, as in polars
FBruzzesi Sep 14, 2024
ec1cb5e
downstream?
FBruzzesi Sep 14, 2024
2147ec6
revert
FBruzzesi Sep 14, 2024
0f69ec1
hash class only
FBruzzesi Sep 14, 2024
22836a0
else case no cover
FBruzzesi Sep 14, 2024
a1f56bc
Merge branch 'main' into feat/time-zone-aware-datetime
FBruzzesi Sep 15, 2024
a84480d
merge main
FBruzzesi Sep 21, 2024
80a574d
trigger ci
FBruzzesi Sep 23, 2024
916eac5
try making stable dtypes
MarcoGorelli Sep 25, 2024
e94b517
Merge remote-tracking branch 'upstream/main' into feat/time-zone-awar…
MarcoGorelli Sep 28, 2024
180b86e
broken, but getting there?
MarcoGorelli Sep 28, 2024
da884e8
Merge remote-tracking branch 'upstream/main' into feat/time-zone-awar…
MarcoGorelli Sep 29, 2024
114be74
fixup
MarcoGorelli Sep 29, 2024
587d917
reduce diff
MarcoGorelli Sep 29, 2024
dd050a8
stableify duration too
MarcoGorelli Sep 29, 2024
b4de1f7
test duration too
MarcoGorelli Sep 29, 2024
458f2a2
try removing pytz
MarcoGorelli Sep 29, 2024
34c27ef
try fix ci
MarcoGorelli Sep 29, 2024
0de71a6
try fix ci
MarcoGorelli Sep 29, 2024
d105911
try fix ci
MarcoGorelli Sep 29, 2024
a773d85
try fix ci
MarcoGorelli Sep 29, 2024
0149431
try fix ci
MarcoGorelli Sep 29, 2024
2249af0
allow s time unit
MarcoGorelli Sep 30, 2024
942a77b
test second resolution
MarcoGorelli Sep 30, 2024
ad38667
override duration time unit for pandas pre 2.0
MarcoGorelli Sep 30, 2024
38898a8
:label:
MarcoGorelli Sep 30, 2024
43da4c3
pre-2.0 pandas
MarcoGorelli Sep 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/extremes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ jobs:
kaggle kernels output "marcogorelli/variable-brink-glacier"
- name: install-polars
run: python -m pip install *.whl
- name: install-pandas-nightly
run: pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas
- name: install-reqs
run: uv pip install --upgrade tox virtualenv setuptools pip -r requirements-dev.txt --system
- name: uninstall pyarrow
Expand All @@ -127,8 +129,8 @@ jobs:
# run: uv pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --pre pyarrow --system
- name: uninstall pandas
run: uv pip uninstall pandas --system
- name: install-pandas-nightly
run: uv pip install --prerelease=allow --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system
- name: show-deps
run: uv pip freeze
- name: uninstall numpy
run: uv pip uninstall numpy --system
- name: install numpy nightly
Expand Down
13 changes: 7 additions & 6 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType:
if pa.types.is_date32(dtype):
return dtypes.Date()
if pa.types.is_timestamp(dtype):
return dtypes.Datetime()
return dtypes.Datetime(time_unit=dtype.unit, time_zone=dtype.tz)
if pa.types.is_duration(dtype):
return dtypes.Duration()
return dtypes.Duration(time_unit=dtype.unit)
if pa.types.is_dictionary(dtype):
return dtypes.Categorical()
if pa.types.is_struct(dtype):
Expand Down Expand Up @@ -94,11 +94,12 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any:
if isinstance_or_issubclass(dtype, dtypes.Categorical):
return pa.dictionary(pa.uint32(), pa.string())
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# Use Polars' default
return pa.timestamp("us")
time_unit = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)
return pa.timestamp(time_unit, tz=time_zone)
if isinstance_or_issubclass(dtype, dtypes.Duration):
# Use Polars' default
return pa.duration("us")
time_unit = getattr(dtype, "time_unit", "us")
return pa.duration(time_unit)
if isinstance_or_issubclass(dtype, dtypes.Date):
return pa.date32()
if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def cast(
) -> Self:
ser = self._native_series
dtype = narwhals_to_native_dtype(
dtype, ser.dtype, self._implementation, self._dtypes
dtype, ser.dtype, self._implementation, self._backend_version, self._dtypes
)
return self._from_native_series(ser.astype(dtype))

Expand Down
62 changes: 49 additions & 13 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Literal
from typing import TypeVar

from narwhals.utils import Implementation
Expand Down Expand Up @@ -213,6 +215,15 @@ def set_axis(

def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType:
dtype = str(column.dtype)

# Patterns for the string form of pandas-like temporal dtypes, e.g.
# "datetime64[ns, UTC]" (numpy-backed) or "timestamp[us, tz=UTC][pyarrow]".
# The time-zone group accepts any characters up to the closing bracket, so zone
# names containing underscores, digits or offsets ("America/New_York",
# "Etc/GMT+1", "UTC+01:00") are captured instead of failing to match.
pd_datetime_rgx = (
    r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[^\]]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[^\]]+))?\]\[pyarrow\]$"

# Durations carry a time unit only (no time zone).
pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"

if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
return dtypes.Int64()
if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
Expand Down Expand Up @@ -251,12 +262,17 @@ def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType:
return dtypes.Boolean()
if dtype == "category" or dtype.startswith("dictionary<"):
return dtypes.Categorical()
if dtype.startswith(("datetime64", "timestamp[")):
# TODO(Unassigned): different time units and time zones
return dtypes.Datetime()
if dtype.startswith(("timedelta64", "duration")):
# TODO(Unassigned): different time units
return dtypes.Duration()
if (match_ := re.match(pd_datetime_rgx, dtype)) or (
match_ := re.match(pa_datetime_rgx, dtype)
):
dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, dtype)) or (
match_ := re.match(pa_duration_rgx, dtype)
):
du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
if dtype == "date32[day][pyarrow]":
return dtypes.Date()
if dtype.startswith(("large_list", "list")):
Expand Down Expand Up @@ -314,6 +330,7 @@ def narwhals_to_native_dtype( # noqa: PLR0915
dtype: DType | type[DType],
starting_dtype: Any,
implementation: Implementation,
backend_version: tuple[int, ...],
dtypes: DTypes,
) -> Any:
if "polars" in str(type(dtype)):
Expand Down Expand Up @@ -416,15 +433,34 @@ def narwhals_to_native_dtype( # noqa: PLR0915
# convert to it?
return "category"
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# TODO(Unassigned): different time units and time zones
dt_time_unit = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)

# Pandas does not support "ms" or "us" time units before version 2.0
# Let's overwrite with "ns"
if implementation is Implementation.PANDAS and backend_version < (
2,
): # pragma: no cover
dt_time_unit = "ns"

if dtype_backend == "pyarrow-nullable":
return "timestamp[ns][pyarrow]"
return "datetime64[ns]"
tz_part = f", tz={dt_time_zone}" if dt_time_zone else ""
return f"timestamp[{dt_time_unit}{tz_part}][pyarrow]"
else:
tz_part = f", {dt_time_zone}" if dt_time_zone else ""
return f"datetime64[{dt_time_unit}{tz_part}]"
if isinstance_or_issubclass(dtype, dtypes.Duration):
# TODO(Unassigned): different time units and time zones
if dtype_backend == "pyarrow-nullable":
return "duration[ns][pyarrow]"
return "timedelta64[ns]"
du_time_unit = getattr(dtype, "time_unit", "us")
if implementation is Implementation.PANDAS and backend_version < (
2,
): # pragma: no cover
dt_time_unit = "ns"
return (
f"duration[{du_time_unit}][pyarrow]"
if dtype_backend == "pyarrow-nullable"
else f"timedelta64[{du_time_unit}]"
)

if isinstance_or_issubclass(dtype, dtypes.Date):
if dtype_backend == "pyarrow-nullable":
return "date32[pyarrow]"
Expand Down
24 changes: 16 additions & 8 deletions narwhals/_polars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Literal

if TYPE_CHECKING:
from narwhals.dtypes import DType
Expand Down Expand Up @@ -62,12 +63,15 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType:
return dtypes.Categorical()
if dtype == pl.Enum:
return dtypes.Enum()
if dtype == pl.Datetime:
return dtypes.Datetime()
if dtype == pl.Duration:
return dtypes.Duration()
if dtype == pl.Date:
return dtypes.Date()
if dtype == pl.Datetime or isinstance(dtype, pl.Datetime):
dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)
return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone)
if dtype == pl.Duration or isinstance(dtype, pl.Duration):
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return dtypes.Duration(time_unit=du_time_unit)
if dtype == pl.Struct:
return dtypes.Struct()
if dtype == pl.List:
Expand Down Expand Up @@ -111,12 +115,16 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any:
if dtype == dtypes.Enum:
msg = "Converting to Enum is not (yet) supported"
raise NotImplementedError(msg)
if dtype == dtypes.Datetime:
return pl.Datetime()
if dtype == dtypes.Duration:
return pl.Duration()
if dtype == dtypes.Date:
return pl.Date()
if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime):
dt_time_unit = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)
return pl.Datetime(dt_time_unit, dt_time_zone) # type: ignore[arg-type]
if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration):
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return pl.Duration(time_unit=du_time_unit)

if dtype == dtypes.List:  # pragma: no cover
    # Raise rather than return: returning the exception object would hand it
    # back to the caller as if it were a dtype. The unsupported Enum
    # conversion in this function is reported the same way.
    msg = "Converting to List dtype is not supported yet"
    raise NotImplementedError(msg)
Expand Down
95 changes: 91 additions & 4 deletions narwhals/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from datetime import timezone
from typing import TYPE_CHECKING
from typing import Literal

if TYPE_CHECKING:
from typing_extensions import Self
Expand Down Expand Up @@ -71,10 +73,95 @@ class Object(DType): ...
class Unknown(DType): ...


class Datetime(TemporalType): ...


class Duration(TemporalType): ...
class Datetime(TemporalType):
    """
    Temporal data type holding a calendar date together with a time of day.

    Arguments:
        time_unit: Resolution of the values; one of `'s'`, `'ms'`, `'us'` or
            `'ns'`. Defaults to `'us'` (microseconds).
        time_zone: Time zone string, as defined in zoneinfo (run
            `import zoneinfo; zoneinfo.available_timezones()` for the valid
            strings). A `datetime.timezone` object is also accepted and is
            stored as `str(time_zone)`.

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms", "s"] = "us",
        time_zone: str | timezone | None = None,
    ) -> None:
        if time_unit not in {"s", "ms", "us", "ns"}:
            error_message = (
                "invalid `time_unit`"
                f"\n\nExpected one of {{'ns','us','ms', 's'}}, got {time_unit!r}."
            )
            raise ValueError(error_message)

        self.time_unit = time_unit
        # Normalise `datetime.timezone` objects to their string form.
        self.time_zone = (
            str(time_zone) if isinstance(time_zone, timezone) else time_zone
        )

    def __eq__(self: Self, other: object) -> bool:
        if isinstance(other, self.__class__):
            # Two instances are equal only when every parameter agrees.
            return self.time_unit == other.time_unit and self.time_zone == other.time_zone
        # An instance also compares equal to the class itself (or a subclass),
        # regardless of its parameters; anything else is unequal.
        return type(other) is type and issubclass(other, self.__class__)

    def __hash__(self: Self) -> int:  # pragma: no cover
        identity = (self.__class__, self.time_unit, self.time_zone)
        return hash(identity)

    def __repr__(self: Self) -> str:  # pragma: no cover
        return (
            f"{self.__class__.__name__}"
            f"(time_unit={self.time_unit!r}, time_zone={self.time_zone!r})"
        )


class Duration(TemporalType):
    """
    Temporal data type holding a span of time.

    Arguments:
        time_unit: Resolution of the values; one of `'s'`, `'ms'`, `'us'` or
            `'ns'`. Defaults to `'us'` (microseconds).

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms", "s"] = "us",
    ) -> None:
        if time_unit not in ("s", "ms", "us", "ns"):
            error_message = (
                "invalid `time_unit`"
                f"\n\nExpected one of {{'ns','us','ms', 's'}}, got {time_unit!r}."
            )
            raise ValueError(error_message)

        self.time_unit = time_unit

    def __eq__(self: Self, other: object) -> bool:
        if isinstance(other, self.__class__):
            # Two instances are equal only when their units agree.
            return self.time_unit == other.time_unit
        # An instance also compares equal to the class itself (or a subclass),
        # regardless of its unit; anything else is unequal.
        return type(other) is type and issubclass(other, self.__class__)

    def __hash__(self: Self) -> int:  # pragma: no cover
        identity = (self.__class__, self.time_unit)
        return hash(identity)

    def __repr__(self: Self) -> str:  # pragma: no cover
        return f"{self.__class__.__name__}(time_unit={self.time_unit!r})"


class Categorical(DType): ...
Expand Down
13 changes: 5 additions & 8 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals.dataframe import LazyFrame
from narwhals.translate import from_native
from narwhals.utils import Implementation
from narwhals.utils import parse_version
from narwhals.utils import validate_laziness

# Missing type parameters for generic type "DataFrame"
Expand Down Expand Up @@ -235,11 +236,9 @@ def _new_series_impl(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

backend_version = parse_version(native_namespace.__version__)
dtype = pandas_like_narwhals_to_native_dtype(
dtype,
None,
implementation,
dtypes,
dtype, None, implementation, backend_version, dtypes
)
native_series = native_namespace.Series(values, name=name, dtype=dtype)

Expand Down Expand Up @@ -374,12 +373,10 @@ def _from_dict_impl(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

backend_version = parse_version(native_namespace.__version__)
schema = {
name: pandas_like_narwhals_to_native_dtype(
schema[name],
native_type,
implementation,
dtypes,
schema[name], native_type, implementation, backend_version, dtypes
)
for name, native_type in native_frame.dtypes.items()
}
Expand Down
Loading
Loading