Skip to content

Commit

Permalink
Use more, cheaper dtype checking utilities in cudf Python (#18139)
Browse files Browse the repository at this point in the history
Avoids the potentially more expensive dtype checking utilities discussed in #12494 by replacing them with cheaper checks:

`is_string_dtype` -> `== CUDF_STRING_DTYPE`
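`is_decimal_dtype` -> `isinstance` checks against the cudf decimal dtype classes
`is_numeric_dtype` / `_is_non_decimal_numeric_dtype` -> (new) `is_dtype_obj_numeric` (with `include_decimal=False` for the non-decimal case)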

```python
In [1]: import numpy as np

In [2]: from cudf.api.types import is_numeric_dtype

In [3]: from cudf.utils.dtypes import is_dtype_obj_numeric

In [4]: dtype = np.dtype(np.int64)

In [5]: %timeit is_dtype_obj_numeric(dtype)
211 ns ± 2.26 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)

In [6]: %timeit is_numeric_dtype(dtype)
1.14 μs ± 2.61 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
```

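Also standardizes imports from `cudf.api.types`: helpers such as `is_scalar` and `infer_dtype` are now imported by name instead of being accessed as `cudf.api.types.<name>`.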

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #18139
  • Loading branch information
mroeschke authored Mar 4, 2025
1 parent 8645992 commit c0c9dfe
Show file tree
Hide file tree
Showing 23 changed files with 155 additions and 128 deletions.
13 changes: 0 additions & 13 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,19 +73,6 @@ def is_numeric_dtype(obj):
return pd_types.is_numeric_dtype(obj)


# A version of numerical type check that does not include cudf decimals for
# places where we need to distinguish fixed and floating point numbers.
def _is_non_decimal_numeric_dtype(obj):
if isinstance(obj, _BaseDtype) or isinstance(
getattr(obj, "dtype", None), _BaseDtype
):
return False
try:
return pd_types.is_numeric_dtype(obj)
except TypeError:
return False


def is_integer(obj):
"""Return True if given object is integer.
Expand Down
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/_internals/where.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
import numpy as np

import cudf
from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
from cudf.api.types import is_scalar
from cudf.core.dtypes import CategoricalDtype
from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype
from cudf.utils.dtypes import (
find_common_type,
is_dtype_obj_numeric,
is_mixed_with_object_dtype,
)

if TYPE_CHECKING:
from cudf._typing import DtypeObj, ScalarLike
Expand All @@ -18,7 +22,7 @@

def _normalize_categorical(input_col, other):
if isinstance(input_col, cudf.core.column.CategoricalColumn):
if cudf.api.types.is_scalar(other):
if is_scalar(other):
try:
other = input_col._encode(other)
except ValueError:
Expand Down Expand Up @@ -81,7 +85,7 @@ def _check_and_cast_columns_with_other(
)
return _normalize_categorical(source_col, other.astype(source_dtype))

if _is_non_decimal_numeric_dtype(source_dtype) and as_column(
if is_dtype_obj_numeric(source_dtype, include_decimal=False) and as_column(
other
).can_cast_safely(source_dtype):
common_dtype = source_dtype
Expand Down
11 changes: 5 additions & 6 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pylibcudf as plc

import cudf
from cudf.api.types import is_scalar
from cudf.core.column import column
from cudf.core.column.methods import ColumnMethods
from cudf.core.dtypes import CategoricalDtype, IntervalDtype
Expand Down Expand Up @@ -623,12 +624,10 @@ def ordered(self) -> bool:
return self.dtype.ordered

def __setitem__(self, key, value):
if cudf.api.types.is_scalar(
value
) and cudf.utils.utils._is_null_host_scalar(value):
if is_scalar(value) and cudf.utils.utils._is_null_host_scalar(value):
to_add_categories = 0
else:
if cudf.api.types.is_scalar(value):
if is_scalar(value):
arr = column.as_column(value, length=1, nan_as_null=False)
else:
arr = column.as_column(value, nan_as_null=False)
Expand All @@ -644,7 +643,7 @@ def __setitem__(self, key, value):
"category, set the categories first"
)

if cudf.api.types.is_scalar(value):
if is_scalar(value):
value = self._encode(value) if value is not None else value
else:
value = cudf.core.column.as_column(value).astype(self.dtype)
Expand Down Expand Up @@ -1045,7 +1044,7 @@ def _validate_fillna_value(
self, fill_value: ScalarLike | ColumnLike
) -> plc.Scalar | ColumnBase:
"""Align fill_value for .fillna based on column type."""
if cudf.api.types.is_scalar(fill_value):
if is_scalar(fill_value):
if fill_value != _DEFAULT_CATEGORICAL_VALUE:
try:
fill_value = self._encode(fill_value)
Expand Down
16 changes: 9 additions & 7 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,10 @@

import cudf
from cudf.api.types import (
_is_non_decimal_numeric_dtype,
_is_pandas_nullable_extension_dtype,
infer_dtype,
is_decimal_dtype,
is_dtype_equal,
is_scalar,
is_string_dtype,
)
from cudf.core._compat import PANDAS_GE_210
from cudf.core._internals import (
Expand Down Expand Up @@ -69,6 +66,7 @@
find_common_type,
get_time_unit,
is_column_like,
is_dtype_obj_numeric,
is_mixed_with_object_dtype,
min_signed_type,
min_unsigned_type,
Expand Down Expand Up @@ -858,7 +856,7 @@ def _fill(
if end <= begin or begin >= self.size:
return self if inplace else self.copy()

if not inplace or is_string_dtype(self.dtype):
if not inplace or self.dtype == CUDF_STRING_DTYPE:
with acquire_spill_lock():
result = type(self).from_pylibcudf(
plc.filling.fill(
Expand All @@ -868,7 +866,7 @@ def _fill(
fill_value,
)
)
if is_string_dtype(self.dtype):
if self.dtype == CUDF_STRING_DTYPE:
return self._mimic_inplace(result, inplace=True)
return result # type: ignore[return-value]

Expand Down Expand Up @@ -1599,7 +1597,10 @@ def cast(self, dtype: Dtype) -> ColumnBase:
self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype)
)
)
if is_decimal_dtype(result.dtype):
if isinstance(
result.dtype,
(cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype),
):
result.dtype.precision = dtype.precision # type: ignore[union-attr]
return result

Expand Down Expand Up @@ -2993,7 +2994,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
# Notice, we can always cast pure null columns
not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)]
if len(not_null_col_dtypes) and all(
_is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M"
is_dtype_obj_numeric(dtype, include_decimal=False)
and dtype.kind == "M"
for dtype in not_null_col_dtypes
):
common_dtype = find_common_type(not_null_col_dtypes)
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@

import cudf
import cudf.core.column.column as column
from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
from cudf.api.types import is_scalar
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column.column import ColumnBase, as_column
from cudf.core.column.methods import ColumnMethods, ParentType
from cudf.core.column.numerical import NumericalColumn
from cudf.core.dtypes import ListDtype
from cudf.core.missing import NA
from cudf.core.scalar import pa_scalar_to_plc_scalar
from cudf.utils.dtypes import SIZE_TYPE_DTYPE
from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric

if TYPE_CHECKING:
from collections.abc import Sequence
Expand Down Expand Up @@ -718,8 +718,8 @@ def take(self, lists_indices: ColumnLike) -> ParentType:
"lists_indices and list column is of different size."
)
if (
not _is_non_decimal_numeric_dtype(
lists_indices_col.children[1].dtype
not is_dtype_obj_numeric(
lists_indices_col.children[1].dtype, include_decimal=False
)
or lists_indices_col.children[1].dtype.kind not in "iu"
):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

import cudf
import cudf.core.column.column as column
from cudf.api.types import is_integer, is_scalar
from cudf.api.types import infer_dtype, is_integer, is_scalar
from cudf.core._internals import binaryop
from cudf.core.buffer import acquire_spill_lock, as_buffer
from cudf.core.column.column import ColumnBase, as_column
Expand Down Expand Up @@ -439,7 +439,7 @@ def _process_values_for_isin(
except (MixedTypeError, TypeError) as e:
# There is a corner where `values` can be of `object` dtype
# but have values of homogeneous type.
inferred_dtype = cudf.api.types.infer_dtype(values)
inferred_dtype = infer_dtype(values)
if (
self.dtype.kind in {"i", "u"} and inferred_dtype == "integer"
) or (
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import cudf
import cudf.core.column.column as column
import cudf.core.column.datetime as datetime
from cudf.api.types import is_integer, is_scalar, is_string_dtype
from cudf.api.types import is_integer, is_scalar
from cudf.core._internals import binaryop
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column.column import ColumnBase
Expand Down Expand Up @@ -75,7 +75,7 @@ def __init__(self, parent):
if isinstance(parent.dtype, cudf.ListDtype)
else parent.dtype
)
if not is_string_dtype(value_type):
if value_type != CUDF_STRING_DTYPE:
raise AttributeError(
"Can only use .str accessor with string values"
)
Expand Down
40 changes: 20 additions & 20 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,7 @@
is_dict_like,
is_dtype_equal,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
)
from cudf.core import column, indexing_utils, reshape
from cudf.core._compat import PANDAS_LT_300
Expand Down Expand Up @@ -90,6 +87,7 @@
cudf_dtype_from_pydata_dtype,
find_common_type,
is_column_like,
is_dtype_obj_numeric,
min_signed_type,
)
from cudf.utils.performance_tracking import _performance_tracking
Expand Down Expand Up @@ -145,7 +143,7 @@ def __setitem__(self, key, value):
return self._setitem_tuple_arg(key, value)

@_performance_tracking
def _can_downcast_to_series(self, df, arg):
def _can_downcast_to_series(self, df: DataFrame, arg):
"""
This method encapsulates the logic used
to determine whether or not the result of a loc/iloc
Expand All @@ -170,8 +168,8 @@ def _can_downcast_to_series(self, df, arg):
arg[1], slice
):
return True
dtypes = df.dtypes.values.tolist()
all_numeric = all(is_numeric_dtype(t) for t in dtypes)
dtypes = [dtype for _, dtype in df._dtypes]
all_numeric = all(is_dtype_obj_numeric(t) for t in dtypes)
if all_numeric or (
len(dtypes) and all(t == dtypes[0] for t in dtypes)
):
Expand Down Expand Up @@ -348,7 +346,7 @@ def _getitem_tuple_arg(self, arg):
df.index.name = columns_df.index.name
if not isinstance(
df.index, MultiIndex
) and is_numeric_dtype(df.index.dtype):
) and is_dtype_obj_numeric(df.index.dtype):
# Preserve the original index type.
df.index = df.index.astype(self._frame.index.dtype)
df = df.sort_values(by=[tmp_col_name, cantor_name])
Expand Down Expand Up @@ -3158,7 +3156,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
# If other was provided, process that next.
if isinstance(other, DataFrame):
other_cols = [other._data[col] for col in self._column_names]
elif cudf.api.types.is_scalar(other):
elif is_scalar(other):
other_cols = [other] * len(self._column_names)
elif isinstance(other, cudf.Series):
other_cols = other.to_pandas()
Expand Down Expand Up @@ -3788,14 +3786,14 @@ def agg(self, aggs, axis=None):
* Not supporting: ``axis``, ``*args``, ``**kwargs``
"""
dtypes = [self[col].dtype for col in self._column_names]
dtypes = [dtype for _, dtype in self._dtypes]
common_dtype = find_common_type(dtypes)
if common_dtype.kind != "b" and any(
dtype.kind == "b" for dtype in dtypes
):
raise MixedTypeError("Cannot create a column with mixed types")

if any(is_string_dtype(dt) for dt in dtypes):
if any(dt == CUDF_STRING_DTYPE for dt in dtypes):
raise NotImplementedError(
"DataFrame.agg() is not supported for "
"frames containing string columns"
Expand Down Expand Up @@ -4934,7 +4932,7 @@ def apply_rows(
"""
for col in incols:
current_col_dtype = self._data[col].dtype
if is_string_dtype(current_col_dtype) or isinstance(
if current_col_dtype == CUDF_STRING_DTYPE or isinstance(
current_col_dtype, cudf.CategoricalDtype
):
raise TypeError(
Expand Down Expand Up @@ -6294,8 +6292,8 @@ def make_false_column_like_self():
else:
# These checks must happen after the conversions above
# since numpy can't handle categorical dtypes.
self_is_str = is_string_dtype(self_col.dtype)
other_is_str = is_string_dtype(other_col.dtype)
self_is_str = self_col.dtype == CUDF_STRING_DTYPE
other_is_str = other_col.dtype == CUDF_STRING_DTYPE

if self_is_str != other_is_str:
# Strings can't compare to anything else.
Expand Down Expand Up @@ -6352,8 +6350,8 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
common_dtype = find_common_type(filtered.dtypes)
if (
not numeric_only
and is_string_dtype(common_dtype)
and any(not is_string_dtype(dt) for dt in filtered.dtypes)
and common_dtype == CUDF_STRING_DTYPE
and any(dtype != CUDF_STRING_DTYPE for dtype in filtered._dtypes)
):
raise TypeError(
f"Cannot perform row-wise {method} across mixed-dtype columns,"
Expand Down Expand Up @@ -6476,7 +6474,9 @@ def _reduce(

if numeric_only:
numeric_cols = (
name for name, dtype in self._dtypes if is_numeric_dtype(dtype)
name
for name, dtype in self._dtypes
if is_dtype_obj_numeric(dtype)
)
source = self._get_columns_by_label(numeric_cols)
if source.empty:
Expand Down Expand Up @@ -6507,7 +6507,7 @@ def _reduce(
raise NotImplementedError(
f"Column {col_label} with type {col.dtype} does not support {op}"
) from err
elif not is_numeric_dtype(col.dtype):
elif not is_dtype_obj_numeric(col.dtype):
raise TypeError(
"Non numeric columns passed with "
"`numeric_only=False`, pass `numeric_only=True` "
Expand All @@ -6523,9 +6523,9 @@ def _reduce(
source_dtypes = [dtype for _, dtype in source._dtypes]
common_dtype = find_common_type(source_dtypes)
if (
is_object_dtype(common_dtype)
common_dtype == CUDF_STRING_DTYPE
and any(
not is_object_dtype(dtype) for dtype in source_dtypes
dtype != CUDF_STRING_DTYPE for dtype in source_dtypes
)
or common_dtype.kind != "b"
and any(dtype.kind == "b" for dtype in source_dtypes)
Expand Down Expand Up @@ -8603,7 +8603,7 @@ def _find_common_dtypes_and_categories(
# default to the first non-null dtype
dtypes[idx] = cols[0].dtype
# If all the non-null dtypes are int/float, find a common dtype
if all(is_numeric_dtype(col.dtype) for col in cols):
if all(is_dtype_obj_numeric(col.dtype) for col in cols):
dtypes[idx] = find_common_type([col.dtype for col in cols])
# If all categorical dtypes, combine the categories
elif all(
Expand Down
Loading

0 comments on commit c0c9dfe

Please sign in to comment.