Skip to content

Commit

Permalink
feat: Datetime(time_unit, time_zone) and Duration(time_unit) types (
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi authored Sep 30, 2024
1 parent e3d2a4b commit 93d2fc7
Show file tree
Hide file tree
Showing 13 changed files with 332 additions and 50 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/extremes.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ jobs:
kaggle kernels output "marcogorelli/variable-brink-glacier"
- name: install-polars
run: python -m pip install *.whl
- name: install-pandas-nightly
run: pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas
- name: install-reqs
run: uv pip install --upgrade tox virtualenv setuptools pip -r requirements-dev.txt --system
- name: uninstall pyarrow
Expand All @@ -127,8 +129,8 @@ jobs:
# run: uv pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --pre pyarrow --system
- name: uninstall pandas
run: uv pip uninstall pandas --system
- name: install-pandas-nightly
run: uv pip install --prerelease=allow --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas --system
- name: show-deps
run: uv pip freeze
- name: uninstall numpy
run: uv pip uninstall numpy --system
- name: install numpy nightly
Expand Down
13 changes: 7 additions & 6 deletions narwhals/_arrow/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType:
if pa.types.is_date32(dtype):
return dtypes.Date()
if pa.types.is_timestamp(dtype):
return dtypes.Datetime()
return dtypes.Datetime(time_unit=dtype.unit, time_zone=dtype.tz)
if pa.types.is_duration(dtype):
return dtypes.Duration()
return dtypes.Duration(time_unit=dtype.unit)
if pa.types.is_dictionary(dtype):
return dtypes.Categorical()
if pa.types.is_struct(dtype):
Expand Down Expand Up @@ -94,11 +94,12 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any:
if isinstance_or_issubclass(dtype, dtypes.Categorical):
return pa.dictionary(pa.uint32(), pa.string())
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# Use Polars' default
return pa.timestamp("us")
time_unit = getattr(dtype, "time_unit", "us")
time_zone = getattr(dtype, "time_zone", None)
return pa.timestamp(time_unit, tz=time_zone)
if isinstance_or_issubclass(dtype, dtypes.Duration):
# Use Polars' default
return pa.duration("us")
time_unit = getattr(dtype, "time_unit", "us")
return pa.duration(time_unit)
if isinstance_or_issubclass(dtype, dtypes.Date):
return pa.date32()
if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover
Expand Down
2 changes: 1 addition & 1 deletion narwhals/_pandas_like/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def cast(
) -> Self:
ser = self._native_series
dtype = narwhals_to_native_dtype(
dtype, ser.dtype, self._implementation, self._dtypes
dtype, ser.dtype, self._implementation, self._backend_version, self._dtypes
)
return self._from_native_series(ser.astype(dtype))

Expand Down
62 changes: 49 additions & 13 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING
from typing import Any
from typing import Iterable
from typing import Literal
from typing import TypeVar

from narwhals.utils import Implementation
Expand Down Expand Up @@ -213,6 +215,15 @@ def set_axis(

def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType:
dtype = str(column.dtype)

pd_datetime_rgx = (
r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$"

pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"

if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
return dtypes.Int64()
if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
Expand Down Expand Up @@ -251,12 +262,17 @@ def native_to_narwhals_dtype(column: Any, dtypes: DTypes) -> DType:
return dtypes.Boolean()
if dtype == "category" or dtype.startswith("dictionary<"):
return dtypes.Categorical()
if dtype.startswith(("datetime64", "timestamp[")):
# TODO(Unassigned): different time units and time zones
return dtypes.Datetime()
if dtype.startswith(("timedelta64", "duration")):
# TODO(Unassigned): different time units
return dtypes.Duration()
if (match_ := re.match(pd_datetime_rgx, dtype)) or (
match_ := re.match(pa_datetime_rgx, dtype)
):
dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, dtype)) or (
match_ := re.match(pa_duration_rgx, dtype)
):
du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
if dtype == "date32[day][pyarrow]":
return dtypes.Date()
if dtype.startswith(("large_list", "list")):
Expand Down Expand Up @@ -314,6 +330,7 @@ def narwhals_to_native_dtype( # noqa: PLR0915
dtype: DType | type[DType],
starting_dtype: Any,
implementation: Implementation,
backend_version: tuple[int, ...],
dtypes: DTypes,
) -> Any:
if "polars" in str(type(dtype)):
Expand Down Expand Up @@ -416,15 +433,34 @@ def narwhals_to_native_dtype( # noqa: PLR0915
# convert to it?
return "category"
if isinstance_or_issubclass(dtype, dtypes.Datetime):
# TODO(Unassigned): different time units and time zones
dt_time_unit = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)

# Pandas does not support "ms" or "us" time units before version 2.0
# Let's overwrite with "ns"
if implementation is Implementation.PANDAS and backend_version < (
2,
): # pragma: no cover
dt_time_unit = "ns"

if dtype_backend == "pyarrow-nullable":
return "timestamp[ns][pyarrow]"
return "datetime64[ns]"
tz_part = f", tz={dt_time_zone}" if dt_time_zone else ""
return f"timestamp[{dt_time_unit}{tz_part}][pyarrow]"
else:
tz_part = f", {dt_time_zone}" if dt_time_zone else ""
return f"datetime64[{dt_time_unit}{tz_part}]"
if isinstance_or_issubclass(dtype, dtypes.Duration):
# TODO(Unassigned): different time units and time zones
if dtype_backend == "pyarrow-nullable":
return "duration[ns][pyarrow]"
return "timedelta64[ns]"
du_time_unit = getattr(dtype, "time_unit", "us")
if implementation is Implementation.PANDAS and backend_version < (
2,
): # pragma: no cover
dt_time_unit = "ns"
return (
f"duration[{du_time_unit}][pyarrow]"
if dtype_backend == "pyarrow-nullable"
else f"timedelta64[{du_time_unit}]"
)

if isinstance_or_issubclass(dtype, dtypes.Date):
if dtype_backend == "pyarrow-nullable":
return "date32[pyarrow]"
Expand Down
24 changes: 16 additions & 8 deletions narwhals/_polars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Literal

if TYPE_CHECKING:
from narwhals.dtypes import DType
Expand Down Expand Up @@ -62,12 +63,15 @@ def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType:
return dtypes.Categorical()
if dtype == pl.Enum:
return dtypes.Enum()
if dtype == pl.Datetime:
return dtypes.Datetime()
if dtype == pl.Duration:
return dtypes.Duration()
if dtype == pl.Date:
return dtypes.Date()
if dtype == pl.Datetime or isinstance(dtype, pl.Datetime):
dt_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)
return dtypes.Datetime(time_unit=dt_time_unit, time_zone=dt_time_zone)
if dtype == pl.Duration or isinstance(dtype, pl.Duration):
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return dtypes.Duration(time_unit=du_time_unit)
if dtype == pl.Struct:
return dtypes.Struct()
if dtype == pl.List:
Expand Down Expand Up @@ -111,12 +115,16 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], dtypes: DTypes) -> Any:
if dtype == dtypes.Enum:
msg = "Converting to Enum is not (yet) supported"
raise NotImplementedError(msg)
if dtype == dtypes.Datetime:
return pl.Datetime()
if dtype == dtypes.Duration:
return pl.Duration()
if dtype == dtypes.Date:
return pl.Date()
if dtype == dtypes.Datetime or isinstance(dtype, dtypes.Datetime):
dt_time_unit = getattr(dtype, "time_unit", "us")
dt_time_zone = getattr(dtype, "time_zone", None)
return pl.Datetime(dt_time_unit, dt_time_zone) # type: ignore[arg-type]
if dtype == dtypes.Duration or isinstance(dtype, dtypes.Duration):
du_time_unit: Literal["us", "ns", "ms"] = getattr(dtype, "time_unit", "us")
return pl.Duration(time_unit=du_time_unit)

if dtype == dtypes.List: # pragma: no cover
msg = "Converting to List dtype is not supported yet"
return NotImplementedError(msg)
Expand Down
95 changes: 91 additions & 4 deletions narwhals/dtypes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations

from datetime import timezone
from typing import TYPE_CHECKING
from typing import Literal

if TYPE_CHECKING:
from typing_extensions import Self
Expand Down Expand Up @@ -71,10 +73,95 @@ class Object(DType): ...
class Unknown(DType): ...


class Datetime(TemporalType): ...


class Duration(TemporalType): ...
class Datetime(TemporalType):
    """
    Data type representing a calendar date and time of day.

    Arguments:
        time_unit: Unit of time. Defaults to `'us'` (microseconds).
        time_zone: Time zone string, as defined in zoneinfo (to see valid strings run
            `import zoneinfo; zoneinfo.available_timezones()` for a full list).
            A `datetime.timezone` instance is also accepted and is stored as
            its `str()` representation.

    Raises:
        ValueError: If `time_unit` is not one of `'s'`, `'ms'`, `'us'`, `'ns'`.

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L398-L457
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms", "s"] = "us",
        time_zone: str | timezone | None = None,
    ) -> None:
        # Tuple (rather than set) membership, matching Duration: an unhashable
        # argument then reaches the ValueError below instead of failing with a
        # TypeError from the hash lookup.
        if time_unit not in ("s", "ms", "us", "ns"):
            msg = (
                "invalid `time_unit`"
                f"\n\nExpected one of {{'ns','us','ms', 's'}}, got {time_unit!r}."
            )
            raise ValueError(msg)

        # Normalise `datetime.timezone` objects to their string form so that
        # `time_zone` is always `str | None` once stored.
        if isinstance(time_zone, timezone):
            time_zone = str(time_zone)

        self.time_unit = time_unit
        self.time_zone = time_zone

    def __eq__(self: Self, other: object) -> bool:
        # allow comparing object instances to class
        if type(other) is type and issubclass(other, self.__class__):
            return True
        elif isinstance(other, self.__class__):
            # Two instances are equal only when both parameters agree.
            return self.time_unit == other.time_unit and self.time_zone == other.time_zone
        else:  # pragma: no cover
            return False

    def __hash__(self: Self) -> int:  # pragma: no cover
        # Hash over the same fields the instance-to-instance comparison uses.
        return hash((self.__class__, self.time_unit, self.time_zone))

    def __repr__(self: Self) -> str:  # pragma: no cover
        class_name = self.__class__.__name__
        return f"{class_name}(time_unit={self.time_unit!r}, time_zone={self.time_zone!r})"


class Duration(TemporalType):
    """
    Data type representing a time duration.

    Arguments:
        time_unit: Unit of time. Defaults to `'us'` (microseconds).

    Raises:
        ValueError: If `time_unit` is not one of `'s'`, `'ms'`, `'us'`, `'ns'`.

    Notes:
        Adapted from Polars implementation at:
        https://github.com/pola-rs/polars/blob/py-1.7.1/py-polars/polars/datatypes/classes.py#L460-L502
    """

    def __init__(
        self: Self,
        time_unit: Literal["us", "ns", "ms", "s"] = "us",
    ) -> None:
        # Happy path first: store a recognised unit and finish.
        if time_unit in ("s", "ms", "us", "ns"):
            self.time_unit = time_unit
            return

        msg = (
            "invalid `time_unit`"
            f"\n\nExpected one of {{'ns','us','ms', 's'}}, got {time_unit!r}."
        )
        raise ValueError(msg)

    def __eq__(self: Self, other: object) -> bool:
        own_class = self.__class__
        # A bare class (e.g. `Duration`) compares equal to any instance of it,
        # regardless of the instance's time unit.
        if type(other) is type and issubclass(other, own_class):
            return True
        # Instance-to-instance comparison hinges on the time unit alone.
        if isinstance(other, own_class):
            return self.time_unit == other.time_unit
        return False  # pragma: no cover

    def __hash__(self: Self) -> int:  # pragma: no cover
        identity = (self.__class__, self.time_unit)
        return hash(identity)

    def __repr__(self: Self) -> str:  # pragma: no cover
        return f"{self.__class__.__name__}(time_unit={self.time_unit!r})"


class Categorical(DType): ...
Expand Down
13 changes: 5 additions & 8 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from narwhals.dataframe import LazyFrame
from narwhals.translate import from_native
from narwhals.utils import Implementation
from narwhals.utils import parse_version
from narwhals.utils import validate_laziness

# Missing type parameters for generic type "DataFrame"
Expand Down Expand Up @@ -235,11 +236,9 @@ def _new_series_impl(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

backend_version = parse_version(native_namespace.__version__)
dtype = pandas_like_narwhals_to_native_dtype(
dtype,
None,
implementation,
dtypes,
dtype, None, implementation, backend_version, dtypes
)
native_series = native_namespace.Series(values, name=name, dtype=dtype)

Expand Down Expand Up @@ -374,12 +373,10 @@ def _from_dict_impl(
narwhals_to_native_dtype as pandas_like_narwhals_to_native_dtype,
)

backend_version = parse_version(native_namespace.__version__)
schema = {
name: pandas_like_narwhals_to_native_dtype(
schema[name],
native_type,
implementation,
dtypes,
schema[name], native_type, implementation, backend_version, dtypes
)
for name, native_type in native_frame.dtypes.items()
}
Expand Down
Loading

0 comments on commit 93d2fc7

Please sign in to comment.