From 2709c6bc614a119aa936bc0ec2358d435ec6164a Mon Sep 17 00:00:00 2001
From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com>
Date: Sat, 2 Nov 2024 10:23:13 +0000
Subject: [PATCH] verbose regex

---
Notes (ignored by `git am`, not part of the commit message):
 * Hoists the four dtype regexes out of native_to_narwhals_dtype into
   module-level constants that are compiled once with re.VERBOSE.
 * Under re.VERBOSE unescaped whitespace inside a pattern is ignored, so the
   literal space that followed the comma in the old datetime patterns is now
   spelled `\s?`; strings without that space (e.g. `datetime64[ns,UTC]`)
   therefore match as well.
 * The group names `time_unit` and `time_zone` must stay in sync with the
   match_.group(...) calls in native_to_narwhals_dtype.

 narwhals/_pandas_like/utils.py | 59 ++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py
index e1a6e7a5f..287f7f878 100644
--- a/narwhals/_pandas_like/utils.py
+++ b/narwhals/_pandas_like/utils.py
@@ -30,6 +30,51 @@
     Implementation.CUDF,
     Implementation.MODIN,
 }
+PD_DATETIME_RGX = r"""^
+    datetime64\[
+        (?P<time_unit>s|ms|us|ns)                 # Match time unit: s, ms, us, or ns
+        (?:,                                      # Begin non-capturing group for optional timezone
+            \s?                                   # Optional whitespace after comma
+            (?P<time_zone>                        # Start named group for timezone
+                [a-zA-Z\/]+                       # Match timezone name, e.g., UTC, America/New_York
+                (?:                               # Begin optional non-capturing group for offset
+                    [+-]\d{2}:\d{2}               # Match offset in format +HH:MM or -HH:MM
+                )?                                # End optional offset group
+            )                                     # End time_zone group
+        )?                                        # End optional timezone group
+    \]                                            # Closing bracket
+$"""
+PATTERN_PD_DATETIME = re.compile(PD_DATETIME_RGX, re.VERBOSE)
+PA_DATETIME_RGX = r"""^
+    timestamp\[
+        (?P<time_unit>s|ms|us|ns)                 # Match time unit: s, ms, us, or ns
+        (?:,                                      # Begin non-capturing group for optional timezone
+            \s?tz=                                # Match "tz=" prefix
+            (?P<time_zone>                        # Start named group for timezone
+                [a-zA-Z\/]*                       # Match timezone name (e.g., UTC, America/New_York)
+                (?:                               # Begin optional non-capturing group for offset
+                    [+-]\d{2}:\d{2}               # Match offset in format +HH:MM or -HH:MM
+                )?                                # End optional offset group
+            )                                     # End time_zone group
+        )?                                        # End optional timezone group
+    \]                                            # Closing bracket for timestamp
+    \[pyarrow\]                                   # Literal string "[pyarrow]"
+$"""
+PATTERN_PA_DATETIME = re.compile(PA_DATETIME_RGX, re.VERBOSE)
+PD_DURATION_RGX = r"""^
+    timedelta64\[
+        (?P<time_unit>s|ms|us|ns)                 # Match time unit: s, ms, us, or ns
+    \]                                            # Closing bracket for timedelta64
+$"""
+
+PATTERN_PD_DURATION = re.compile(PD_DURATION_RGX, re.VERBOSE)
+PA_DURATION_RGX = r"""^
+    duration\[
+        (?P<time_unit>s|ms|us|ns)                 # Match time unit: s, ms, us, or ns
+    \]                                            # Closing bracket for duration
+    \[pyarrow\]                                   # Literal string "[pyarrow]"
+$"""
+PATTERN_PA_DURATION = re.compile(PA_DURATION_RGX, re.VERBOSE)
 
 
 def validate_column_comparand(index: Any, other: Any) -> Any:
@@ -223,12 +268,6 @@ def native_to_narwhals_dtype(
 ) -> DType:
     dtype = str(native_column.dtype)
 
-    pd_datetime_rgx = r"^datetime64\[(?P<time_unit>s|ms|us|ns)(?:, (?P<time_zone>[a-zA-Z\/]+(?:[+-]\d{2}:\d{2})?))?\]$"
-    pa_datetime_rgx = r"^timestamp\[(?P<time_unit>s|ms|us|ns)(?:, tz=(?P<time_zone>[a-zA-Z\/]*(?:[+-]\d{2}:\d{2})?))?\]\[pyarrow\]$"
-
-    pd_duration_rgx = r"^timedelta64\[(?P<time_unit>s|ms|us|ns)\]$"
-    pa_duration_rgx = r"^duration\[(?P<time_unit>s|ms|us|ns)\]\[pyarrow\]$"
-
     if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}:
         return dtypes.Int64()
     if dtype in {"int32", "Int32", "Int32[pyarrow]", "int32[pyarrow]"}:
@@ -267,14 +306,14 @@ def native_to_narwhals_dtype(
         return dtypes.Boolean()
     if dtype == "category" or dtype.startswith("dictionary<"):
         return dtypes.Categorical()
-    if (match_ := re.match(pd_datetime_rgx, dtype)) or (
-        match_ := re.match(pa_datetime_rgx, dtype)
+    if (match_ := PATTERN_PD_DATETIME.match(dtype)) or (
+        match_ := PATTERN_PA_DATETIME.match(dtype)
     ):
         dt_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit")  # type: ignore[assignment]
         dt_time_zone: str | None = match_.group("time_zone")
         return dtypes.Datetime(dt_time_unit, dt_time_zone)
-    if (match_ := re.match(pd_duration_rgx, dtype)) or (
-        match_ := re.match(pa_duration_rgx, dtype)
+    if (match_ := PATTERN_PD_DURATION.match(dtype)) or (
+        match_ := PATTERN_PA_DURATION.match(dtype)
     ):
         du_time_unit: Literal["us", "ns", "ms", "s"] = match_.group("time_unit")  # type: ignore[assignment]
         return dtypes.Duration(du_time_unit)