Skip to content

Commit

Permalink
verbose regex
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Nov 2, 2024
1 parent 7e3300e commit 2709c6b
Showing 1 changed file with 49 additions and 10 deletions.
59 changes: 49 additions & 10 deletions narwhals/_pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,51 @@
Implementation.CUDF,
Implementation.MODIN,
}
# Verbose-mode pattern for dtype strings of the form "datetime64[<unit>]" or
# "datetime64[<unit>, <time zone>]", e.g. "datetime64[ns, America/New_York]".
# Named groups: "time_unit" (always set) and "time_zone" (None when absent).
# The timezone character class includes "_" because zone names such as
# America/New_York contain underscores; without it those dtypes fail to match.
PD_DATETIME_RGX = r"""^
    datetime64\[
        (?P<time_unit>s|ms|us|ns)        # Match time unit: s, ms, us, or ns
        (?:,                             # Begin non-capturing group for optional timezone
            \s?                          # Optional whitespace after comma
            (?P<time_zone>               # Start named group for timezone
                [a-zA-Z_\/]+             # Match timezone name, e.g., UTC, America/New_York
                (?:                      # Begin optional non-capturing group for offset
                    [+-]\d{2}:\d{2}      # Match offset in format +HH:MM or -HH:MM
                )?                       # End optional offset group
            )                            # End time_zone group
        )?                               # End optional timezone group
    \]                                   # Closing bracket
$"""
# re.VERBOSE makes the layout whitespace and "#" comments above insignificant.
PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE)
# Verbose-mode pattern for dtype strings of the form "timestamp[<unit>][pyarrow]"
# or "timestamp[<unit>, tz=<time zone>][pyarrow]".
# Named groups: "time_unit" (always set) and "time_zone" (None when absent).
# The timezone character class includes "_" because zone names such as
# America/New_York contain underscores; without it those dtypes fail to match.
PA_DATETIME_RGX = r"""^
    timestamp\[
        (?P<time_unit>s|ms|us|ns)        # Match time unit: s, ms, us, or ns
        (?:,                             # Begin non-capturing group for optional timezone
            \s?tz=                       # Match "tz=" prefix
            (?P<time_zone>               # Start named group for timezone
                [a-zA-Z_\/]*             # Match timezone name (e.g., UTC, America/New_York)
                (?:                      # Begin optional non-capturing group for offset
                    [+-]\d{2}:\d{2}      # Match offset in format +HH:MM or -HH:MM
                )?                       # End optional offset group
            )                            # End time_zone group
        )?                               # End optional timezone group
    \]                                   # Closing bracket for timestamp
    \[pyarrow\]                          # Literal string "[pyarrow]"
$"""
# re.VERBOSE makes the layout whitespace and "#" comments above insignificant.
PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE)
# Verbose-mode pattern for dtype strings of the form "timedelta64[<unit>]",
# where <unit> is one of s, ms, us, ns and is captured as the "time_unit" group.
# The pattern is anchored at both ends, so the whole string must match.
PD_DURATION_RGX = r"""^
timedelta64\[
(?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns
\] # Closing bracket for timedelta64
$"""

# Compiled with re.VERBOSE so the whitespace and "#" comments inside the
# pattern text are ignored by the regex engine.
PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE)
# Verbose-mode pattern for dtype strings of the form "duration[<unit>][pyarrow]",
# where <unit> is one of s, ms, us, ns and is captured as the "time_unit" group.
# The pattern is anchored at both ends, so the whole string must match.
PA_DURATION_RGX = r"""^
duration\[
(?P<time_unit>s|ms|us|ns) # Match time unit: s, ms, us, or ns
\] # Closing bracket for duration
\[pyarrow\] # Literal string "[pyarrow]"
$"""
# Compiled with re.VERBOSE so the whitespace and "#" comments inside the
# pattern text are ignored by the regex engine.
PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE)


def validate_column_comparand(index: Any, other: Any) -> Any:
Expand Down Expand Up @@ -223,12 +268,6 @@ def native_to_narwhals_dtype(
) -> DType:
dtype = str(native_column.dtype)

pd_datetime_rgx = r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+(?:[+-]\d{2}:\d{2})?))?\]$"
pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]*(?:[+-]\d{2}:\d{2})?))?\]\[pyarrow\]$"

pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"

if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
return dtypes.Int64()
if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
Expand Down Expand Up @@ -267,14 +306,14 @@ def native_to_narwhals_dtype(
return dtypes.Boolean()
if dtype == "category" or dtype.startswith("dictionary<"):
return dtypes.Categorical()
if (match_ := re.match(pd_datetime_rgx, dtype)) or (
match_ := re.match(pa_datetime_rgx, dtype)
if (match_ := PATTERN_PD_DATETIME.match(dtype)) or (
match_ := PATTERN_PA_DATETIME.match(dtype)
):
dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
dt_time_zone: str | None = match_.group("time_zone")
return dtypes.Datetime(dt_time_unit, dt_time_zone)
if (match_ := re.match(pd_duration_rgx, dtype)) or (
match_ := re.match(pa_duration_rgx, dtype)
if (match_ := PATTERN_PD_DURATION.match(dtype)) or (
match_ := PATTERN_PA_DURATION.match(dtype)
):
du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit") # type: ignore[assignment]
return dtypes.Duration(du_time_unit)
Expand Down

0 comments on commit 2709c6b

Please sign in to comment.