DEP: Use Cython 3.0 (pandas-dev#55179)

* DEP: Use Cython 3.0
* Cython 3.0.3
* Update to Cython 3.0.4
* Merge pyi updates
* fixup
* Update pyi files and upgrade to Cython 3.0.5
* Remove debug print
* fix typo

---------

Co-authored-by: Thomas Li <[email protected]>
MichaelTiemannOSC · Nov 16, 2023 · e5301a8
1 parent 02e2bae
commit e5301a8
Show file tree

Hide file tree

Showing 24 changed files with 54 additions and 41 deletions.
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -41,7 +41,7 @@
     // pip (with all the conda available packages installed first,
     // followed by the pip installed packages).
     "matrix": {
-        "Cython": ["0.29.33"],
+        "Cython": ["3.0.5"],
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],

diff --git a/environment.yml b/environment.yml
@@ -8,7 +8,7 @@ dependencies:
 
   # build dependencies
   - versioneer[toml]
-  - cython=0.29.33
+  - cython=3.0.5
   - meson[ninja]=1.2.1
   - meson-python=0.13.1
 

diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
@@ -26,7 +26,7 @@ class NDArrayBacked:
     def size(self) -> int: ...
     @property
     def nbytes(self) -> int: ...
-    def copy(self): ...
+    def copy(self, order=...): ...
     def delete(self, loc, axis=...): ...
     def swapaxes(self, axis1, axis2): ...
     def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...

diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -44,7 +44,6 @@ def group_fillna_indexer(
     labels: np.ndarray,  # ndarray[int64_t]
     sorted_labels: npt.NDArray[np.intp],
     mask: npt.NDArray[np.uint8],
-    direction: Literal["ffill", "bfill"],
     limit: int,  # int64_t
     dropna: bool,
 ) -> None: ...
@@ -55,7 +54,7 @@ def group_any_all(
     mask: np.ndarray,  # const uint8_t[::1]
     val_test: Literal["any", "all"],
     skipna: bool,
-    nullable: bool,
+    result_mask: np.ndarray | None,
 ) -> None: ...
 def group_sum(
     out: np.ndarray,  # complexfloatingintuint_t[:, ::1]

diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
@@ -20,7 +20,6 @@ class Factorizer:
     def factorize(
         self,
         values: np.ndarray,
-        sort: bool = ...,
         na_sentinel=...,
         na_value=...,
         mask=...,
@@ -157,9 +156,9 @@ class HashTable:
     def __contains__(self, key: Hashable) -> bool: ...
     def sizeof(self, deep: bool = ...) -> int: ...
     def get_state(self) -> dict[str, int]: ...
-    # TODO: `item` type is subclass-specific
-    def get_item(self, item): ...  # TODO: return type?
-    def set_item(self, item, val) -> None: ...
+    # TODO: `val/key` type is subclass-specific
+    def get_item(self, val): ...  # TODO: return type?
+    def set_item(self, key, val) -> None: ...
     def get_na(self): ...  # TODO: return type?
     def set_na(self, val) -> None: ...
     def map_locations(
@@ -185,6 +184,7 @@ class HashTable:
         self,
         values: np.ndarray,  # np.ndarray[subclass-specific]
         return_inverse: bool = ...,
+        mask=...,
     ) -> (
         tuple[
             np.ndarray,  # np.ndarray[subclass-specific]
@@ -198,6 +198,7 @@ class HashTable:
         na_sentinel: int = ...,
         na_value: object = ...,
         mask=...,
+        ignore_na: bool = True,
     ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...  # np.ndarray[subclass-specific]
 
 class Complex128HashTable(HashTable): ...

diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -1239,9 +1239,10 @@ cdef class StringHashTable(HashTable):
                             na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
+    # Add unused mask parameter for compat with other signatures
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+                   object na_value=None, object mask=None):
         # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,
@@ -1496,9 +1497,10 @@ cdef class PyObjectHashTable(HashTable):
                             na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
+    # Add unused mask parameter for compat with other signatures
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
-                   object na_value=None):
+                   object na_value=None, object mask=None):
         # -> np.ndarray[np.intp]
         _, labels = self._unique(values, uniques, count_prior=count_prior,
                                  na_sentinel=na_sentinel, na_value=na_value,

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -45,22 +45,24 @@ def is_scalar(val: object) -> bool: ...
 def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
 def is_pyarrow_array(obj: object) -> bool: ...
 def is_period(val: object) -> TypeGuard[Period]: ...
-def is_interval(val: object) -> TypeGuard[Interval]: ...
-def is_decimal(val: object) -> TypeGuard[Decimal]: ...
-def is_complex(val: object) -> TypeGuard[complex]: ...
-def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ...
-def is_integer(val: object) -> TypeGuard[int | np.integer]: ...
+def is_interval(obj: object) -> TypeGuard[Interval]: ...
+def is_decimal(obj: object) -> TypeGuard[Decimal]: ...
+def is_complex(obj: object) -> TypeGuard[complex]: ...
+def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ...
+def is_integer(obj: object) -> TypeGuard[int | np.integer]: ...
 def is_int_or_none(obj) -> bool: ...
-def is_float(val: object) -> TypeGuard[float]: ...
+def is_float(obj: object) -> TypeGuard[float]: ...
 def is_interval_array(values: np.ndarray) -> bool: ...
-def is_datetime64_array(values: np.ndarray) -> bool: ...
-def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
+def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ...
+def is_timedelta_or_timedelta64_array(
+    values: np.ndarray, skipna: bool = True
+) -> bool: ...
 def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
 def is_time_array(values: np.ndarray, skipna: bool = ...): ...
 def is_date_array(values: np.ndarray, skipna: bool = ...): ...
 def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
 def is_string_array(values: np.ndarray, skipna: bool = ...): ...
-def is_float_array(values: np.ndarray, skipna: bool = ...): ...
+def is_float_array(values: np.ndarray): ...
 def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
 def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
 def fast_multiget(
@@ -185,7 +187,7 @@ def count_level_2d(
     max_bin: int,
 ) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=2]
 def get_level_sorter(
-    label: np.ndarray,  # const int64_t[:]
+    codes: np.ndarray,  # const int64_t[:]
     starts: np.ndarray,  # const intp_t[:]
 ) -> np.ndarray: ...  # np.ndarray[np.intp, ndim=1]
 def generate_bins_dt64(

diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
@@ -37,8 +37,8 @@ def vec_binop(
 @overload
 def maybe_convert_bool(
     arr: npt.NDArray[np.object_],
-    true_values: Iterable = ...,
-    false_values: Iterable = ...,
+    true_values: Iterable | None = None,
+    false_values: Iterable | None = None,
     convert_to_masked_nullable: Literal[False] = ...,
 ) -> tuple[np.ndarray, None]: ...
 @overload

diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi
@@ -39,6 +39,10 @@ class BlockIndex(SparseIndex):
         self, length: int, blocs: np.ndarray, blengths: np.ndarray
     ) -> None: ...
 
+    # Override to have correct parameters
+    def intersect(self, other: SparseIndex) -> Self: ...
+    def make_union(self, y: SparseIndex) -> Self: ...
+
 def make_mask_object_ndarray(
     arr: npt.NDArray[np.object_], fill_value
 ) -> npt.NDArray[np.bool_]: ...

diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
@@ -9,6 +9,7 @@ DT64NS_DTYPE: np.dtype
 TD64NS_DTYPE: np.dtype
 
 def precision_from_unit(
-    in_reso: int,  # NPY_DATETIMEUNIT
+    in_reso: int,
+    out_reso: int = ...,
 ) -> tuple[int, int]: ...  # (int64_t, _)
 def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
@@ -2,10 +2,10 @@ from enum import Enum
 
 OFFSET_TO_PERIOD_FREQSTR: dict[str, str]
 
-def periods_per_day(reso: int) -> int: ...
+def periods_per_day(reso: int = ...) -> int: ...
 def periods_per_second(reso: int) -> int: ...
 def is_supported_unit(reso: int) -> bool: ...
-def npy_unit_to_abbrev(reso: int) -> str: ...
+def npy_unit_to_abbrev(unit: int) -> str: ...
 def get_supported_reso(reso: int) -> int: ...
 def abbrev_to_npy_unit(abbrev: str) -> int: ...
 def freq_to_period_freqstr(freq_n: int, freq_name: str) -> str: ...

diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi
@@ -9,7 +9,7 @@ class OutOfBoundsTimedelta(ValueError): ...
 def py_get_unit_from_dtype(dtype: np.dtype): ...
 def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
 def astype_overflowsafe(
-    arr: np.ndarray,
+    values: np.ndarray,
     dtype: np.dtype,
     copy: bool = ...,
     round_ok: bool = ...,

diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi
@@ -89,7 +89,7 @@ class Period(PeriodMixin):
     @classmethod
     def _from_ordinal(cls, ordinal: int, freq) -> Period: ...
     @classmethod
-    def now(cls, freq: Frequency = ...) -> Period: ...
+    def now(cls, freq: Frequency) -> Period: ...
     def strftime(self, fmt: str | None) -> str: ...
     def to_timestamp(
         self,

diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi
@@ -8,6 +8,7 @@ def array_strptime(
     exact: bool = ...,
     errors: str = ...,
     utc: bool = ...,
+    creso: int = ...,  # NPY_DATETIMEUNIT
 ) -> tuple[np.ndarray, np.ndarray]: ...
 
 # first ndarray is M8[ns], second is object ndarray of tzinfo | None
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
@@ -71,7 +71,7 @@ _S = TypeVar("_S", bound=timedelta)
 def get_unit_for_round(freq, creso: int) -> int: ...
 def disallow_ambiguous_unit(unit: str | None) -> None: ...
 def ints_to_pytimedelta(
-    arr: npt.NDArray[np.timedelta64],
+    m8values: npt.NDArray[np.timedelta64],
     box: bool = ...,
 ) -> npt.NDArray[np.object_]: ...
 def array_to_timedelta64(
@@ -165,8 +165,10 @@ class Timedelta(timedelta):
     def __gt__(self, other: timedelta) -> bool: ...
     def __hash__(self) -> int: ...
     def isoformat(self) -> str: ...
-    def to_numpy(self) -> np.timedelta64: ...
-    def view(self, dtype: npt.DTypeLike = ...) -> object: ...
+    def to_numpy(
+        self, dtype: npt.DTypeLike = ..., copy: bool = False
+    ) -> np.timedelta64: ...
+    def view(self, dtype: npt.DTypeLike) -> object: ...
     @property
     def unit(self) -> str: ...
     def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ...
diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
@@ -183,7 +183,7 @@ class Timestamp(datetime):
     def is_year_end(self) -> bool: ...
     def to_pydatetime(self, warn: bool = ...) -> datetime: ...
     def to_datetime64(self) -> np.datetime64: ...
-    def to_period(self, freq: BaseOffset | str = ...) -> Period: ...
+    def to_period(self, freq: BaseOffset | str | None = None) -> Period: ...
     def to_julian_date(self) -> np.float64: ...
     @property
     def asm8(self) -> np.datetime64: ...

diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi
@@ -10,7 +10,7 @@ from pandas._typing import npt
 
 # tz_convert_from_utc_single exposed for testing
 def tz_convert_from_utc_single(
-    val: np.int64, tz: tzinfo, creso: int = ...
+    utc_val: np.int64, tz: tzinfo, creso: int = ...
 ) -> np.int64: ...
 def tz_localize_to_utc(
     vals: npt.NDArray[np.int64],

diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi
@@ -31,7 +31,7 @@ def get_resolution(
     reso: int = ...,  # NPY_DATETIMEUNIT
 ) -> Resolution: ...
 def ints_to_pydatetime(
-    arr: npt.NDArray[np.int64],
+    stamps: npt.NDArray[np.int64],
     tz: tzinfo | None = ...,
     box: str = ...,
     reso: int = ...,  # NPY_DATETIMEUNIT

diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
@@ -111,8 +111,8 @@ def ewm(
     com: float,  # float64_t
     adjust: bool,
     ignore_na: bool,
-    deltas: np.ndarray,  # const float64_t[:]
-    normalize: bool,
+    deltas: np.ndarray | None = None,  # const float64_t[:]
+    normalize: bool = True,
 ) -> np.ndarray: ...  # np.ndarray[np.float64]
 def ewmcov(
     input_x: np.ndarray,  # const float64_t[:]

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -2314,8 +2314,7 @@ def _concat_same_type(
         return new_obj
 
     def copy(self, order: str = "C") -> Self:
-        # error: Unexpected keyword argument "order" for "copy"
-        new_obj = super().copy(order=order)  # type: ignore[call-arg]
+        new_obj = super().copy(order=order)
         new_obj._freq = self.freq
         return new_obj
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "meson-python==0.13.1",
     "meson==1.2.1",
     "wheel",
-    "Cython>=0.29.33,<3",  # Note: sync with setup.py, environment.yml and asv.conf.json
+    "Cython==3.0.5",  # Note: sync with setup.py, environment.yml and asv.conf.json
     # Any NumPy version should be fine for compiling.  Users are unlikely
     # to get a NumPy<1.25 so the result will be compatible with all relevant
     # NumPy versions (if not it is presumably compatible with their version).

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -3,7 +3,7 @@
 
 pip
 versioneer[toml]
-cython==0.29.33
+cython==3.0.5
 meson[ninja]==1.2.1
 meson-python==0.13.1
 pytest>=7.3.2

diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py
@@ -47,6 +47,8 @@
     # stubtest might be too sensitive
     "pandas._libs.lib.NoDefault",
     "pandas._libs.lib._NoDefault.no_default",
+    # stubtest/Cython is not recognizing the default value for the dtype parameter
+    "pandas._libs.lib.map_infer_mask",
     # internal type alias (should probably be private)
     "pandas._libs.lib.ndarray_obj_2d",
     # runtime argument "owner" has a default value but stub argument does not

diff --git a/setup.py b/setup.py
@@ -37,7 +37,7 @@ def is_platform_mac():
 
 
 # note: sync with pyproject.toml, environment.yml and asv.conf.json
-min_cython_ver = "0.29.33"
+min_cython_ver = "3.0.5"
 
 try:
     from Cython import (