diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index dcaa47e722b..4477ca388df 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,21 @@ auto constexpr null = int32_t{0}; // NULL representation for int32_t; auto no_nulls_list() { return nulls_at({}); } struct OffsetRowWindowTest : public cudf::test::BaseFixture { - static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; - static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - struct rolling_runner { cudf::window_bounds _preceding, _following; cudf::size_type _min_periods; bool _grouped = true; + ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; + ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; rolling_runner(cudf::window_bounds const& preceding, cudf::window_bounds const& following, cudf::size_type min_periods_ = 1) - : _preceding{preceding}, _following{following}, _min_periods{min_periods_} + : _preceding{preceding}, + _following{following}, + _min_periods{min_periods_}, + _keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + _values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} { } @@ -80,9 +83,6 @@ struct OffsetRowWindowTest : public cudf::test::BaseFixture { }; }; -ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; -ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - auto const AGG_COUNT_NON_NULL = cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); auto const AGG_COUNT_ALL = @@ -96,7 +96,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); @@ -136,7 +137,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); @@ -176,7 +178,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COUNT_NON_NULL), @@ -219,7 +222,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 67a0aa7a781..89ac39b2be5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -1946,8 +1945,7 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - dtype = kwargs.pop("dtype", None) - return preprocessed.reduce(op, dtype, **kwargs) + return preprocessed.reduce(op, **kwargs) return preprocessed def _can_return_nan(self, skipna: bool | None = None) -> bool: @@ -2110,16 +2108,8 @@ def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: ) ) - def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. .astype) after the operation instead.", - FutureWarning, - ) - col_dtype = dtype - else: - col_dtype = self._reduction_result_dtype(reduction_op) + def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: + col_dtype = self._reduction_result_dtype(reduction_op) # check empty case if len(self) <= self.null_count: @@ -2148,7 +2138,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: }: scale = -plc_scalar.type().scale() # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - p = col_dtype.precision + p = col_dtype.precision # type: ignore[union-attr] nrows = len(self) if reduction_op in {"min", "max"}: new_p = p @@ -2162,7 +2152,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: raise NotImplementedError( f"{reduction_op} not implemented for decimal types." ) - precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr] new_dtype = type(col_dtype)(precision, scale) result_col = result_col.astype(new_dtype) elif isinstance(col_dtype, IntervalDtype): @@ -2322,13 +2312,14 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.type in (np.object_, np.str_): + elif dtype == CUDF_STRING_DTYPE: return cudf.core.column.StringColumn( - data=data, - mask=mask, + data=data, # type: ignore[arg-type] size=size, + dtype=dtype, + mask=mask, offset=offset, - children=children, + children=children, # type: ignore[arg-type] null_count=null_count, ) elif isinstance(dtype, ListDtype): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c0ad33ec7d6..28e8b98edfe 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -21,7 +21,7 @@ import cudf.core.column.datetime as datetime from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.scalar import pa_scalar_to_plc_scalar @@ -46,7 +46,6 @@ ScalarLike, SeriesOrIndex, ) - from cudf.core.buffer import Buffer from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -5595,13 +5594,14 @@ class StringColumn(column.ColumnBase): Parameters ---------- + data : Buffer + Buffer of the string data mask : Buffer The validity mask offset : int Data offset children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively + Columns containing the offsets """ _start_offset: int | None @@ -5629,14 +5629,20 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Buffer | None = None, + data: Buffer, + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional offset: int = 0, null_count: int | None = None, - children: tuple["column.ColumnBase", ...] = (), + children: tuple[column.ColumnBase] = (), # type: ignore[assignment] ): - dtype = cudf.api.types.dtype("object") + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer") + if dtype != CUDF_STRING_DTYPE: + raise ValueError(f"dtypy must be {CUDF_STRING_DTYPE}") + if len(children) > 1: + raise ValueError("StringColumn must have at most 1 offset column.") if size is None: for child in children: @@ -5731,8 +5737,6 @@ def base_size(self) -> int: # override for string column @property def data(self): - if self.base_data is None: - return None if self._data is None: if ( self.offset == 0 diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 1cbbac0f8cc..8b0ef9f0cc8 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -452,14 +452,13 @@ def sum( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. self.astype(np.dtype(np.int64)).sum( # type: ignore - skipna=skipna, min_count=min_count, dtype=dtype + skipna=skipna, min_count=min_count ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 211d161696e..9d426ad6bf7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1328,7 +1328,6 @@ def sum( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1342,8 +1341,6 @@ def sum( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1373,7 +1370,6 @@ def sum( "sum", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -1384,7 +1380,6 @@ def product( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1398,8 +1393,6 @@ def product( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1432,7 +1425,6 @@ def product( "prod" if axis in {1, "columns"} else "product", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 80ffce9e8be..75e38b9246a 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -512,14 +512,6 @@ def test_reduction_column_multiindex(): assert_eq(result, expected) -@pytest.mark.parametrize("op", ["sum", "product"]) -def test_dtype_deprecated(op): - ser = cudf.Series(range(5)) - with pytest.warns(FutureWarning): - result = getattr(ser, op)(dtype=np.dtype(np.int8)) - assert isinstance(result, np.int8) - - @pytest.mark.parametrize( "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 164fcb06624..18aee0001c4 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -13,8 +13,11 @@ import pyarrow as pa import pytest +import rmm + import cudf from cudf import concat +from cudf.core.buffer import as_buffer from cudf.core.column.string import StringColumn from cudf.core.index import Index from cudf.testing import assert_eq @@ -1202,7 +1205,12 @@ def test_string_misc_name(ps_gs, name): def test_string_no_children_properties(): - empty_col = StringColumn(children=()) + empty_col = StringColumn( + as_buffer(rmm.DeviceBuffer(size=0)), + size=0, + dtype=np.dtype("object"), + children=(), + ) assert empty_col.base_children == () assert empty_col.base_size == 0