Skip to content

Commit

Permalink
fix: fix parsing of fixed-offset timezones
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Nov 2, 2024
1 parent 5c3db5b commit d2725cb
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 12 deletions.
61 changes: 49 additions & 12 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,51 @@
Implementation.CUDF,
Implementation.MODIN,
}
# Regular expressions used to recognise pandas-like temporal dtype strings.
# Every pattern is compiled with ``re.VERBOSE``, so whitespace and ``#``
# comments inside the pattern text are ignored by the regex engine.
#
# Time-zone names may contain letters, digits, "_", "-", "+" and "/", so that
# names such as "America/New_York", "America/Port-au-Prince" or "Etc/GMT+5"
# are accepted in addition to plain "UTC". An optional trailing fixed offset
# ("+HH:MM" / "-HH:MM") covers dtypes such as "datetime64[ns, UTC+01:00]".
PD_DATETIME_RGX = r"""^
    datetime64\[
        (?P<time_unit>s|ms|us|ns)      # Match time unit: s, ms, us, or ns
        (?:,                           # Begin non-capturing group for optional timezone
            \s?                        # Optional whitespace after comma
            (?P<time_zone>             # Start named group for timezone
                [a-zA-Z0-9_\/+\-]+     # Match timezone name, e.g., UTC, America/New_York
                (?:                    # Begin optional non-capturing group for offset
                    [+-]\d{2}:\d{2}    # Match offset in format +HH:MM or -HH:MM
                )?                     # End optional offset group
            )                          # End time_zone group
        )?                             # End optional timezone group
    \]                                 # Closing bracket
$"""
PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE)

# The name part is optional here ("*") because a bare fixed offset such as
# "tz=+01:00" has no name in front of the offset.
PA_DATETIME_RGX = r"""^
    timestamp\[
        (?P<time_unit>s|ms|us|ns)      # Match time unit: s, ms, us, or ns
        (?:,                           # Begin non-capturing group for optional timezone
            \s?tz=                     # Match "tz=" prefix
            (?P<time_zone>             # Start named group for timezone
                [a-zA-Z0-9_\/+\-]*     # Match timezone name (e.g., UTC, America/New_York)
                (?:                    # Begin optional non-capturing group for offset
                    [+-]\d{2}:\d{2}    # Match offset in format +HH:MM or -HH:MM
                )?                     # End optional offset group
            )                          # End time_zone group
        )?                             # End optional timezone group
    \]                                 # Closing bracket for timestamp
    \[pyarrow\]                        # Literal string "[pyarrow]"
$"""
PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE)

PD_DURATION_RGX = r"""^
    timedelta64\[
        (?P<time_unit>s|ms|us|ns)      # Match time unit: s, ms, us, or ns
    \]                                 # Closing bracket for timedelta64
$"""
PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE)

PA_DURATION_RGX = r"""^
    duration\[
        (?P<time_unit>s|ms|us|ns)      # Match time unit: s, ms, us, or ns
    \]                                 # Closing bracket for duration
    \[pyarrow\]                        # Literal string "[pyarrow]"
$"""
PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE)


def validate_column_comparand(index: Any, other: Any) -> Any:
Expand Down Expand Up @@ -223,14 +268,6 @@ def native_to_narwhals_dtype(
) -> DType:
dtype = str(native_column.dtype)

pd_datetime_rgx = (
r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+))?\]$"
)
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]+))?\]\[pyarrow\]$"

pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"

if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
return dtypes.Int64()
if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
Expand Down Expand Up @@ -269,14 +306,14 @@ def native_to_narwhals_dtype(
return dtypes.Boolean()
if dtype == "category" or dtype.startswith("dictionary<"):
return dtypes.Categorical()
if (match_ := re.match(pd_datetime_rgx, dtype)) or (
match_ := re.match(pa_datetime_rgx, dtype)
if (match_ := PATTERN_PD_DATETIME.match(dtype)) or (
match_ := PATTERN_PA_DATETIME.match(dtype)
):
dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, dtype)) or (
match_ := re.match(pa_duration_rgx, dtype)
if (match_ := PATTERN_PD_DURATION.match(dtype)) or (
match_ := PATTERN_PA_DURATION.match(dtype)
):
du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
Expand Down
15 changes: 15 additions & 0 deletions tests/dtypes_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,18 @@ def test_pandas_inplace_modification_1267(request: pytest.FixtureRequest) -> Non
assert snw.dtype == nw.Int64
s[0] = 999.5
assert snw.dtype == nw.Float64


def test_pandas_fixed_offset_1302() -> None:
    """Check that a fixed-offset time zone is reported in the narwhals dtype.

    The same timestamp is checked twice: once as produced by
    ``pd.to_datetime`` and once after ``convert_dtypes(dtype_backend="pyarrow")``.
    The expected time-zone string differs between the two ("UTC+01:00" versus
    "+01:00").
    """
    timestamps = ["2020-01-01T00:00:00.000000000+01:00"]

    # Series exactly as returned by pd.to_datetime.
    plain_series = pd.Series(pd.to_datetime(timestamps))
    plain_dtype = nw.from_native(plain_series, series_only=True).dtype
    assert plain_dtype == nw.Datetime("ns", "UTC+01:00")

    # Same data after conversion to the pyarrow dtype backend.
    arrow_series = pd.Series(pd.to_datetime(timestamps)).convert_dtypes(
        dtype_backend="pyarrow"
    )
    arrow_dtype = nw.from_native(arrow_series, series_only=True).dtype
    assert arrow_dtype == nw.Datetime("ns", "+01:00")

0 comments on commit d2725cb

Please sign in to comment.