diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 625938ca168..d19176c71c2 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -13,7 +13,6 @@
 import pylibcudf
 import rmm
 
-import cudf
 from cudf.core.abc import Serializable
 from cudf.utils.string import format_bytes
 
@@ -504,7 +503,7 @@ def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]:
 
     shape = array_interface["shape"] or (1,)
     strides = array_interface["strides"]
-    itemsize = cudf.dtype(array_interface["typestr"]).itemsize
+    itemsize = numpy.dtype(array_interface["typestr"]).itemsize
     if strides is None or pylibcudf.column.is_c_contiguous(
         shape, strides, itemsize
     ):
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 6e83d4d7051..f7ad49aed9f 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -64,11 +64,6 @@ def dtype(arbitrary):
             raise TypeError(f"Unsupported type {np_dtype}")
         return np_dtype
 
-    if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}:
-        # read_csv only accepts "hex"
-        # e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow
-        return arbitrary
-
     # use `pandas_dtype` to try and interpret
     # `arbitrary` as a Pandas extension type.
     # Return the corresponding NumPy/cuDF type.
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 533eeafedd4..d78ea83d578 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -476,16 +476,16 @@ def __repr__(self) -> str:
             # https://github.com/numpy/numpy/issues/17552
             return f"{self.__class__.__name__}({self.value!s}, dtype={self.dtype})"
 
-    def _binop_result_dtype_or_error(self, other, op):
+    def _binop_result_dtype_or_error(self, other, op) -> np.dtype:
         if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}:
-            return np.bool_
+            return np.dtype(np.bool_)
 
         out_dtype = get_allowed_combinations_for_operator(
             self.dtype, other.dtype, op
         )
 
         # datetime handling
-        if out_dtype in {"M", "m"}:
+        if out_dtype.kind in {"M", "m"}:
             if self.dtype.char in {"M", "m"} and other.dtype.char not in {
                 "M",
                 "m",
@@ -505,7 +505,7 @@ def _binop_result_dtype_or_error(self, other, op):
                     return np.dtype(f"m8[{res}]")
                 return np.result_type(self.dtype, other.dtype)
 
-        return cudf.dtype(out_dtype)
+        return out_dtype
 
     def _binaryop(self, other, op: str):
         if is_scalar(other):
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 271f72556f3..9a4d773d5d6 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -174,7 +174,7 @@ def to_numeric(
             type_set = list(np.typecodes["UnsignedInteger"])
 
         for t in type_set:
-            downcast_dtype = cudf.dtype(t)
+            downcast_dtype = np.dtype(t)
             if downcast_dtype.itemsize <= col.dtype.itemsize:
                 if col.can_cast_safely(downcast_dtype):
                     col = col.cast(downcast_dtype)
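
Editor's aside: the np.bool_ to np.dtype(np.bool_) change in the scalar.py hunks above is not cosmetic. A minimal plain-NumPy sketch of the distinction they rely on:

    import numpy as np

    # np.bool_ is a scalar *type*; np.dtype(...) wraps it into a dtype
    # instance, which is what the new "-> np.dtype" annotation promises.
    assert not isinstance(np.bool_, np.dtype)
    assert np.dtype(np.bool_) == np.dtype("bool")

    # A dtype instance carries .kind, which is why the datetime check can
    # become `out_dtype.kind in {"M", "m"}` now that the helper returns a
    # real np.dtype rather than a one-character type code.
    assert np.dtype("datetime64[ns]").kind == "M"
    assert np.dtype("timedelta64[s]").kind == "m"
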
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 56d83aa49a8..f83bbb5a8fa 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -7,7 +7,7 @@
 import warnings
 from collections import abc
 from io import BytesIO, StringIO
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -16,7 +16,7 @@
 
 import cudf
 from cudf._lib.column import Column
-from cudf.api.types import is_hashable, is_scalar
+from cudf.api.types import is_scalar
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.utils import ioutils
@@ -26,6 +26,10 @@
 )
 from cudf.utils.performance_tracking import _performance_tracking
 
+if TYPE_CHECKING:
+    from cudf._typing import DtypeObj
+
+
 _CSV_HEX_TYPE_MAP = {
     "hex": np.dtype("int64"),
     "hex64": np.dtype("int64"),
@@ -158,33 +162,49 @@ def read_csv(
         header = 0
 
     hex_cols: list[abc.Hashable] = []
-    new_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = []
+    cudf_dtypes: list[DtypeObj] | dict[abc.Hashable, DtypeObj] | DtypeObj = []
+    plc_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = []
     if dtype is not None:
         if isinstance(dtype, abc.Mapping):
-            new_dtypes = {}
+            plc_dtypes = {}
+            cudf_dtypes = {}
             for k, col_type in dtype.items():
-                if is_hashable(col_type) and col_type in _CSV_HEX_TYPE_MAP:
+                if isinstance(col_type, str) and col_type in _CSV_HEX_TYPE_MAP:
                     col_type = _CSV_HEX_TYPE_MAP[col_type]
                     hex_cols.append(str(k))
 
-                new_dtypes[k] = _get_plc_data_type_from_dtype(
-                    cudf.dtype(col_type)
-                )
+                cudf_dtype = cudf.dtype(col_type)
+                cudf_dtypes[k] = cudf_dtype
+                plc_dtypes[k] = _get_plc_data_type_from_dtype(cudf_dtype)
-        elif cudf.api.types.is_scalar(dtype) or isinstance(
-            dtype, (np.dtype, pd.api.extensions.ExtensionDtype, type)
+        elif isinstance(
+            dtype,
+            (
+                str,
+                np.dtype,
+                pd.api.extensions.ExtensionDtype,
+                cudf.core.dtypes._BaseDtype,
+                type,
+            ),
         ):
-            if is_hashable(dtype) and dtype in _CSV_HEX_TYPE_MAP:
+            if isinstance(dtype, str) and dtype in _CSV_HEX_TYPE_MAP:
                 dtype = _CSV_HEX_TYPE_MAP[dtype]
                 hex_cols.append(0)
-
-            cast(list, new_dtypes).append(_get_plc_data_type_from_dtype(dtype))
+            else:
+                dtype = cudf.dtype(dtype)
+            cudf_dtypes = dtype
+            cast(list, plc_dtypes).append(_get_plc_data_type_from_dtype(dtype))
         elif isinstance(dtype, abc.Collection):
             for index, col_dtype in enumerate(dtype):
-                if is_hashable(col_dtype) and col_dtype in _CSV_HEX_TYPE_MAP:
+                if (
+                    isinstance(col_dtype, str)
+                    and col_dtype in _CSV_HEX_TYPE_MAP
+                ):
                     col_dtype = _CSV_HEX_TYPE_MAP[col_dtype]
                     hex_cols.append(index)
-
-                new_dtypes.append(_get_plc_data_type_from_dtype(col_dtype))
+                else:
+                    col_dtype = cudf.dtype(col_dtype)
+                cudf_dtypes.append(col_dtype)
+                plc_dtypes.append(_get_plc_data_type_from_dtype(col_dtype))
         else:
             raise ValueError(
                 "dtype should be a scalar/str/list-like/dict-like"
@@ -243,7 +263,7 @@ def read_csv(
     if hex_cols is not None:
         options.set_parse_hex(list(hex_cols))
 
-    options.set_dtypes(new_dtypes)
+    options.set_dtypes(plc_dtypes)
 
     if true_values is not None:
         options.set_true_values([str(val) for val in true_values])
@@ -266,15 +286,21 @@ def read_csv(
     ca = ColumnAccessor(data, rangeindex=len(data) == 0)
     df = cudf.DataFrame._from_data(ca)
 
-    if isinstance(dtype, abc.Mapping):
-        for k, v in dtype.items():
-            if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
-                df._data[str(k)] = df._data[str(k)].astype(v)
-    elif dtype == "category" or isinstance(dtype, cudf.CategoricalDtype):
+    # Cast result to categorical if specified in dtype=
+    # since categorical is not handled in pylibcudf
+    if isinstance(cudf_dtypes, dict):
+        to_category = {
+            k: v
+            for k, v in cudf_dtypes.items()
+            if isinstance(v, cudf.CategoricalDtype)
+        }
+        if to_category:
+            df = df.astype(to_category)
+    elif isinstance(cudf_dtypes, cudf.CategoricalDtype):
         df = df.astype(dtype)
-    elif isinstance(dtype, abc.Collection) and not is_scalar(dtype):
-        for index, col_dtype in enumerate(dtype):
-            if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
+    elif isinstance(cudf_dtypes, list):
+        for index, col_dtype in enumerate(cudf_dtypes):
+            if isinstance(col_dtype, cudf.CategoricalDtype):
                 col_name = df._column_names[index]
                 df._data[col_name] = df._data[col_name].astype(col_dtype)
 
@@ -527,30 +553,11 @@ def _validate_args(
         )
 
 
-def _get_plc_data_type_from_dtype(dtype) -> plc.DataType:
+def _get_plc_data_type_from_dtype(dtype: DtypeObj) -> plc.DataType:
     # TODO: Remove this work-around once Dictionary types
     # in libcudf are fully mapped to categorical columns:
     # https://github.com/rapidsai/cudf/issues/3960
     if isinstance(dtype, cudf.CategoricalDtype):
+        # TODO: should we do this generally in dtype_to_pylibcudf_type?
         dtype = dtype.categories.dtype
-    elif dtype == "category":
-        dtype = "str"
-
-    if isinstance(dtype, str):
-        if dtype == "date32":
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_DAYS)
-        elif dtype in ("date", "date64"):
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
-        elif dtype == "timestamp":
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
-        elif dtype == "timestamp[us]":
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_MICROSECONDS)
-        elif dtype == "timestamp[s]":
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_SECONDS)
-        elif dtype == "timestamp[ms]":
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
-        elif dtype == "timestamp[ns]":
-            return plc.DataType(plc.types.TypeId.TIMESTAMP_NANOSECONDS)
-
-    dtype = cudf.dtype(dtype)
    return dtype_to_pylibcudf_type(dtype)
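
Editor's aside: a minimal sketch of the two read_csv cases the rewritten dtype plumbing above handles (hex columns and categoricals; the column names and data are made up for illustration):

    from io import StringIO

    import cudf

    buf = StringIO("a,b\n0x1f,x\n0xff,y\n")
    df = cudf.read_csv(buf, dtype={"a": "hex", "b": "category"})
    # "a" is routed through _CSV_HEX_TYPE_MAP and parsed as int64 (31, 255);
    # "b" takes the post-read astype path, since pylibcudf has no
    # categorical type and plc_dtypes may only hold plc.DataType values.
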
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index a7c7136ad4c..bcc9aacd2a7 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -527,7 +527,7 @@ def write_to_dataset(
     return metadata
 
 
-def _parse_metadata(meta) -> tuple[bool, Any, Any]:
+def _parse_metadata(meta) -> tuple[bool, Any, None | np.dtype]:
     file_is_range_index = False
     file_index_cols = None
     file_column_dtype = None
@@ -541,7 +541,7 @@ def _parse_metadata(meta) -> tuple[bool, Any, Any]:
     ):
         file_is_range_index = True
     if "column_indexes" in meta and len(meta["column_indexes"]) == 1:
-        file_column_dtype = meta["column_indexes"][0]["numpy_type"]
+        file_column_dtype = np.dtype(meta["column_indexes"][0]["numpy_type"])
     return file_is_range_index, file_index_cols, file_column_dtype
 
 
@@ -2368,6 +2368,6 @@ def _process_metadata(
             df.index.names = index_col
 
     if df._num_columns == 0 and column_index_type is not None:
-        df._data.label_dtype = cudf.dtype(column_index_type)
+        df._data.label_dtype = column_index_type
 
     return df
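
Editor's aside: the numpy_type entry in the pandas Parquet metadata is a plain string, so converting it once in _parse_metadata lets _process_metadata assign label_dtype directly. A small standalone illustration (the key layout follows the access pattern in the hunk above; "object" is just an example value):

    import numpy as np

    meta = {"column_indexes": [{"numpy_type": "object"}]}
    file_column_dtype = np.dtype(meta["column_indexes"][0]["numpy_type"])
    assert file_column_dtype == np.dtype("object")
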
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 20ec5dce12e..c545b840c0e 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -430,7 +430,9 @@ def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj:
         return np.float64("nan")
 
 
-def get_allowed_combinations_for_operator(dtype_l, dtype_r, op):
+def get_allowed_combinations_for_operator(
+    dtype_l: np.dtype, dtype_r: np.dtype, op: str
+) -> np.dtype:
     error = TypeError(
         f"{op} not supported between {dtype_l} and {dtype_r} scalars"
     )
@@ -456,18 +458,19 @@ def get_allowed_combinations_for_operator(
     # special rules for string
     if dtype_l == "object" or dtype_r == "object":
         if (dtype_l == dtype_r == "object") and op == "__add__":
-            return "str"
+            return CUDF_STRING_DTYPE
         else:
             raise error
 
     # Check if we can directly operate
     for valid_combo in allowed:
-        ltype, rtype, outtype = valid_combo
-        if np.can_cast(dtype_l.char, ltype) and np.can_cast(
-            dtype_r.char, rtype
+        ltype, rtype, outtype = valid_combo  # type: ignore[misc]
+        if np.can_cast(dtype_l.char, ltype) and np.can_cast(  # type: ignore[has-type]
+            dtype_r.char,
+            rtype,  # type: ignore[has-type]
         ):
-            return outtype
+            return np.dtype(outtype)  # type: ignore[has-type]
 
     raise error
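
Editor's aside: the loop above works on single-character NumPy type codes, and wrapping outtype in np.dtype hands callers a concrete dtype instance instead of a bare code. A standalone sketch of one iteration, using a hypothetical ("f", "f", "f") triple in place of a real entry from allowed:

    import numpy as np

    ltype, rtype, outtype = "f", "f", "f"  # hypothetical table entry
    dtype_l, dtype_r = np.dtype("float32"), np.dtype("int16")
    # np.can_cast accepts type-code strings, so checking .char against the
    # table asks whether each operand may participate in this combination.
    if np.can_cast(dtype_l.char, ltype) and np.can_cast(dtype_r.char, rtype):
        result = np.dtype(outtype)  # np.dtype("float32"), not the str "f"
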