Use more, cheaper dtype checking utilities in cudf Python (#18139)

Avoids using potentially more expensive dtype checking utilities referenced in #12494:

- `is_string_dtype` -> `== CUDF_STRING_DTYPE`
- `is_decimal_dtype` -> `isinstance`
- `is_numeric_dtype` -> (new) `is_dtype_obj_numeric`

```python
In [1]: import numpy as np

In [2]: from cudf.api.types import is_numeric_dtype

In [3]: from cudf.utils.dtypes import is_dtype_obj_numeric

In [4]: dtype = np.dtype(np.int64)

In [5]: %timeit is_dtype_obj_numeric(dtype)
211 ns ± 2.26 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

In [6]: %timeit is_numeric_dtype(dtype)
1.14 μs ± 2.61 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
```

Also standardizes some imports from `cudf.api.types`

Authors:
- Matthew Roeschke (https://github.com/mroeschke)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: #18139
rapidsai · Mar 4, 2025 · c0c9dfe · c0c9dfe
1 parent 8645992
commit c0c9dfe
Show file tree

Hide file tree

Showing 23 changed files with 155 additions and 128 deletions.
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
@@ -73,19 +73,6 @@ def is_numeric_dtype(obj):
     return pd_types.is_numeric_dtype(obj)
 
 
-# A version of numerical type check that does not include cudf decimals for
-# places where we need to distinguish fixed and floating point numbers.
-def _is_non_decimal_numeric_dtype(obj):
-    if isinstance(obj, _BaseDtype) or isinstance(
-        getattr(obj, "dtype", None), _BaseDtype
-    ):
-        return False
-    try:
-        return pd_types.is_numeric_dtype(obj)
-    except TypeError:
-        return False
-
-
 def is_integer(obj):
     """Return True if given object is integer.
 

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
@@ -7,9 +7,13 @@
 import numpy as np
 
 import cudf
-from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
+from cudf.api.types import is_scalar
 from cudf.core.dtypes import CategoricalDtype
-from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype
+from cudf.utils.dtypes import (
+    find_common_type,
+    is_dtype_obj_numeric,
+    is_mixed_with_object_dtype,
+)
 
 if TYPE_CHECKING:
     from cudf._typing import DtypeObj, ScalarLike
@@ -18,7 +22,7 @@
 
 def _normalize_categorical(input_col, other):
     if isinstance(input_col, cudf.core.column.CategoricalColumn):
-        if cudf.api.types.is_scalar(other):
+        if is_scalar(other):
             try:
                 other = input_col._encode(other)
             except ValueError:
@@ -81,7 +85,7 @@ def _check_and_cast_columns_with_other(
             )
         return _normalize_categorical(source_col, other.astype(source_dtype))
 
-    if _is_non_decimal_numeric_dtype(source_dtype) and as_column(
+    if is_dtype_obj_numeric(source_dtype, include_decimal=False) and as_column(
         other
     ).can_cast_safely(source_dtype):
         common_dtype = source_dtype

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -14,6 +14,7 @@
 import pylibcudf as plc
 
 import cudf
+from cudf.api.types import is_scalar
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
@@ -623,12 +624,10 @@ def ordered(self) -> bool:
         return self.dtype.ordered
 
     def __setitem__(self, key, value):
-        if cudf.api.types.is_scalar(
-            value
-        ) and cudf.utils.utils._is_null_host_scalar(value):
+        if is_scalar(value) and cudf.utils.utils._is_null_host_scalar(value):
             to_add_categories = 0
         else:
-            if cudf.api.types.is_scalar(value):
+            if is_scalar(value):
                 arr = column.as_column(value, length=1, nan_as_null=False)
             else:
                 arr = column.as_column(value, nan_as_null=False)
@@ -644,7 +643,7 @@ def __setitem__(self, key, value):
                 "category, set the categories first"
             )
 
-        if cudf.api.types.is_scalar(value):
+        if is_scalar(value):
             value = self._encode(value) if value is not None else value
         else:
             value = cudf.core.column.as_column(value).astype(self.dtype)
@@ -1045,7 +1044,7 @@ def _validate_fillna_value(
         self, fill_value: ScalarLike | ColumnLike
     ) -> plc.Scalar | ColumnBase:
         """Align fill_value for .fillna based on column type."""
-        if cudf.api.types.is_scalar(fill_value):
+        if is_scalar(fill_value):
             if fill_value != _DEFAULT_CATEGORICAL_VALUE:
                 try:
                     fill_value = self._encode(fill_value)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -23,13 +23,10 @@
 
 import cudf
 from cudf.api.types import (
-    _is_non_decimal_numeric_dtype,
     _is_pandas_nullable_extension_dtype,
     infer_dtype,
-    is_decimal_dtype,
     is_dtype_equal,
     is_scalar,
-    is_string_dtype,
 )
 from cudf.core._compat import PANDAS_GE_210
 from cudf.core._internals import (
@@ -69,6 +66,7 @@
     find_common_type,
     get_time_unit,
     is_column_like,
+    is_dtype_obj_numeric,
     is_mixed_with_object_dtype,
     min_signed_type,
     min_unsigned_type,
@@ -858,7 +856,7 @@ def _fill(
         if end <= begin or begin >= self.size:
             return self if inplace else self.copy()
 
-        if not inplace or is_string_dtype(self.dtype):
+        if not inplace or self.dtype == CUDF_STRING_DTYPE:
             with acquire_spill_lock():
                 result = type(self).from_pylibcudf(
                     plc.filling.fill(
@@ -868,7 +866,7 @@ def _fill(
                         fill_value,
                     )
                 )
-            if is_string_dtype(self.dtype):
+            if self.dtype == CUDF_STRING_DTYPE:
                 return self._mimic_inplace(result, inplace=True)
             return result  # type: ignore[return-value]
 
@@ -1599,7 +1597,10 @@ def cast(self, dtype: Dtype) -> ColumnBase:
                 self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
             )
         )
-        if is_decimal_dtype(result.dtype):
+        if isinstance(
+            result.dtype,
+            (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype),
+        ):
             result.dtype.precision = dtype.precision  # type: ignore[union-attr]
         return result
 
@@ -2993,7 +2994,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     # Notice, we can always cast pure null columns
     not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)]
     if len(not_null_col_dtypes) and all(
-        _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M"
+        is_dtype_obj_numeric(dtype, include_decimal=False)
+        and dtype.kind == "M"
         for dtype in not_null_col_dtypes
     ):
         common_dtype = find_common_type(not_null_col_dtypes)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
@@ -14,15 +14,15 @@
 
 import cudf
 import cudf.core.column.column as column
-from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
+from cudf.api.types import is_scalar
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column.column import ColumnBase, as_column
 from cudf.core.column.methods import ColumnMethods, ParentType
 from cudf.core.column.numerical import NumericalColumn
 from cudf.core.dtypes import ListDtype
 from cudf.core.missing import NA
 from cudf.core.scalar import pa_scalar_to_plc_scalar
-from cudf.utils.dtypes import SIZE_TYPE_DTYPE
+from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -718,8 +718,8 @@ def take(self, lists_indices: ColumnLike) -> ParentType:
                 "lists_indices and list column is of different size."
             )
         if (
-            not _is_non_decimal_numeric_dtype(
-                lists_indices_col.children[1].dtype
+            not is_dtype_obj_numeric(
+                lists_indices_col.children[1].dtype, include_decimal=False
             )
             or lists_indices_col.children[1].dtype.kind not in "iu"
         ):

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -14,7 +14,7 @@
 
 import cudf
 import cudf.core.column.column as column
-from cudf.api.types import is_integer, is_scalar
+from cudf.api.types import infer_dtype, is_integer, is_scalar
 from cudf.core._internals import binaryop
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column.column import ColumnBase, as_column
@@ -439,7 +439,7 @@ def _process_values_for_isin(
         except (MixedTypeError, TypeError) as e:
             # There is a corner where `values` can be of `object` dtype
             # but have values of homogeneous type.
-            inferred_dtype = cudf.api.types.infer_dtype(values)
+            inferred_dtype = infer_dtype(values)
             if (
                 self.dtype.kind in {"i", "u"} and inferred_dtype == "integer"
             ) or (

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -18,7 +18,7 @@
 import cudf
 import cudf.core.column.column as column
 import cudf.core.column.datetime as datetime
-from cudf.api.types import is_integer, is_scalar, is_string_dtype
+from cudf.api.types import is_integer, is_scalar
 from cudf.core._internals import binaryop
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column.column import ColumnBase
@@ -75,7 +75,7 @@ def __init__(self, parent):
             if isinstance(parent.dtype, cudf.ListDtype)
             else parent.dtype
         )
-        if not is_string_dtype(value_type):
+        if value_type != CUDF_STRING_DTYPE:
             raise AttributeError(
                 "Can only use .str accessor with string values"
             )

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -41,10 +41,7 @@
     is_dict_like,
     is_dtype_equal,
     is_list_like,
-    is_numeric_dtype,
-    is_object_dtype,
     is_scalar,
-    is_string_dtype,
 )
 from cudf.core import column, indexing_utils, reshape
 from cudf.core._compat import PANDAS_LT_300
@@ -90,6 +87,7 @@
     cudf_dtype_from_pydata_dtype,
     find_common_type,
     is_column_like,
+    is_dtype_obj_numeric,
     min_signed_type,
 )
 from cudf.utils.performance_tracking import _performance_tracking
@@ -145,7 +143,7 @@ def __setitem__(self, key, value):
         return self._setitem_tuple_arg(key, value)
 
     @_performance_tracking
-    def _can_downcast_to_series(self, df, arg):
+    def _can_downcast_to_series(self, df: DataFrame, arg):
         """
         This method encapsulates the logic used
         to determine whether or not the result of a loc/iloc
@@ -170,8 +168,8 @@ def _can_downcast_to_series(self, df, arg):
                     arg[1], slice
                 ):
                     return True
-            dtypes = df.dtypes.values.tolist()
-            all_numeric = all(is_numeric_dtype(t) for t in dtypes)
+            dtypes = [dtype for _, dtype in df._dtypes]
+            all_numeric = all(is_dtype_obj_numeric(t) for t in dtypes)
             if all_numeric or (
                 len(dtypes) and all(t == dtypes[0] for t in dtypes)
             ):
@@ -348,7 +346,7 @@ def _getitem_tuple_arg(self, arg):
                     df.index.name = columns_df.index.name
                     if not isinstance(
                         df.index, MultiIndex
-                    ) and is_numeric_dtype(df.index.dtype):
+                    ) and is_dtype_obj_numeric(df.index.dtype):
                         # Preserve the original index type.
                         df.index = df.index.astype(self._frame.index.dtype)
                     df = df.sort_values(by=[tmp_col_name, cantor_name])
@@ -3158,7 +3156,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
         # If other was provided, process that next.
         if isinstance(other, DataFrame):
             other_cols = [other._data[col] for col in self._column_names]
-        elif cudf.api.types.is_scalar(other):
+        elif is_scalar(other):
             other_cols = [other] * len(self._column_names)
         elif isinstance(other, cudf.Series):
             other_cols = other.to_pandas()
@@ -3788,14 +3786,14 @@ def agg(self, aggs, axis=None):
             * Not supporting: ``axis``, ``*args``, ``**kwargs``
 
         """
-        dtypes = [self[col].dtype for col in self._column_names]
+        dtypes = [dtype for _, dtype in self._dtypes]
         common_dtype = find_common_type(dtypes)
         if common_dtype.kind != "b" and any(
             dtype.kind == "b" for dtype in dtypes
         ):
             raise MixedTypeError("Cannot create a column with mixed types")
 
-        if any(is_string_dtype(dt) for dt in dtypes):
+        if any(dt == CUDF_STRING_DTYPE for dt in dtypes):
             raise NotImplementedError(
                 "DataFrame.agg() is not supported for "
                 "frames containing string columns"
@@ -4934,7 +4932,7 @@ def apply_rows(
         """
         for col in incols:
             current_col_dtype = self._data[col].dtype
-            if is_string_dtype(current_col_dtype) or isinstance(
+            if current_col_dtype == CUDF_STRING_DTYPE or isinstance(
                 current_col_dtype, cudf.CategoricalDtype
             ):
                 raise TypeError(
@@ -6294,8 +6292,8 @@ def make_false_column_like_self():
                     else:
                         # These checks must happen after the conversions above
                         # since numpy can't handle categorical dtypes.
-                        self_is_str = is_string_dtype(self_col.dtype)
-                        other_is_str = is_string_dtype(other_col.dtype)
+                        self_is_str = self_col.dtype == CUDF_STRING_DTYPE
+                        other_is_str = other_col.dtype == CUDF_STRING_DTYPE
 
                     if self_is_str != other_is_str:
                         # Strings can't compare to anything else.
@@ -6352,8 +6350,8 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
         common_dtype = find_common_type(filtered.dtypes)
         if (
             not numeric_only
-            and is_string_dtype(common_dtype)
-            and any(not is_string_dtype(dt) for dt in filtered.dtypes)
+            and common_dtype == CUDF_STRING_DTYPE
+            and any(dtype != CUDF_STRING_DTYPE for dtype in filtered._dtypes)
         ):
             raise TypeError(
                 f"Cannot perform row-wise {method} across mixed-dtype columns,"
@@ -6476,7 +6474,9 @@ def _reduce(
 
         if numeric_only:
             numeric_cols = (
-                name for name, dtype in self._dtypes if is_numeric_dtype(dtype)
+                name
+                for name, dtype in self._dtypes
+                if is_dtype_obj_numeric(dtype)
             )
             source = self._get_columns_by_label(numeric_cols)
             if source.empty:
@@ -6507,7 +6507,7 @@ def _reduce(
                         raise NotImplementedError(
                             f"Column {col_label} with type {col.dtype} does not support {op}"
                         ) from err
-                    elif not is_numeric_dtype(col.dtype):
+                    elif not is_dtype_obj_numeric(col.dtype):
                         raise TypeError(
                             "Non numeric columns passed with "
                             "`numeric_only=False`, pass `numeric_only=True` "
@@ -6523,9 +6523,9 @@ def _reduce(
                 source_dtypes = [dtype for _, dtype in source._dtypes]
                 common_dtype = find_common_type(source_dtypes)
                 if (
-                    is_object_dtype(common_dtype)
+                    common_dtype == CUDF_STRING_DTYPE
                     and any(
-                        not is_object_dtype(dtype) for dtype in source_dtypes
+                        dtype != CUDF_STRING_DTYPE for dtype in source_dtypes
                     )
                     or common_dtype.kind != "b"
                     and any(dtype.kind == "b" for dtype in source_dtypes)
@@ -8603,7 +8603,7 @@ def _find_common_dtypes_and_categories(
         # default to the first non-null dtype
         dtypes[idx] = cols[0].dtype
         # If all the non-null dtypes are int/float, find a common dtype
-        if all(is_numeric_dtype(col.dtype) for col in cols):
+        if all(is_dtype_obj_numeric(col.dtype) for col in cols):
             dtypes[idx] = find_common_type([col.dtype for col in cols])
         # If all categorical dtypes, combine the categories
         elif all(