Merge branch 'branch-25.04' into as_proxy_object

rapidsai · Feb 27, 2025 · bba133c · bba133c
2 parents 6ebdc85 + 79d0b75
commit bba133c
Show file tree

Hide file tree

Showing 7 changed files with 50 additions and 60 deletions.
diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,18 +43,21 @@ auto constexpr null = int32_t{0};  // NULL representation for int32_t;
 auto no_nulls_list() { return nulls_at({}); }
 
 struct OffsetRowWindowTest : public cudf::test::BaseFixture {
-  static ints_column const _keys;    // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
-  static ints_column const _values;  // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
   struct rolling_runner {
     cudf::window_bounds _preceding, _following;
     cudf::size_type _min_periods;
     bool _grouped = true;
+    ints_column const _keys;    // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
+    ints_column const _values;  // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
 
     rolling_runner(cudf::window_bounds const& preceding,
                    cudf::window_bounds const& following,
                    cudf::size_type min_periods_ = 1)
-      : _preceding{preceding}, _following{following}, _min_periods{min_periods_}
+      : _preceding{preceding},
+        _following{following},
+        _min_periods{min_periods_},
+        _keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1},
+        _values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
     {
     }
 
@@ -80,9 +83,6 @@ struct OffsetRowWindowTest : public cudf::test::BaseFixture {
   };
 };
 
-ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
-ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
 auto const AGG_COUNT_NON_NULL =
   cudf::make_count_aggregation<cudf::rolling_aggregation>(cudf::null_policy::EXCLUDE);
 auto const AGG_COUNT_ALL =
@@ -96,7 +96,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1)
 {
   auto const preceding = cudf::window_bounds::get(3);
   auto const following = cudf::window_bounds::get(-1);
-  auto run_rolling     = rolling_runner{preceding, following}.min_periods(1).grouped(true);
+  auto run_rolling     = rolling_runner{preceding, following};
+  run_rolling.min_periods(1).grouped(true);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL),
                                  ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})});
@@ -136,7 +137,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1)
 {
   auto const preceding = cudf::window_bounds::get(3);
   auto const following = cudf::window_bounds::get(-1);
-  auto run_rolling     = rolling_runner{preceding, following}.min_periods(1).grouped(false);
+  auto run_rolling     = rolling_runner{preceding, following};
+  run_rolling.min_periods(1).grouped(false);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL),
                                  ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})});
@@ -176,7 +178,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2)
 {
   auto const preceding = cudf::window_bounds::get(0);
   auto const following = cudf::window_bounds::get(2);
-  auto run_rolling     = rolling_runner{preceding, following}.min_periods(1).grouped(true);
+  auto run_rolling     = rolling_runner{preceding, following};
+  run_rolling.min_periods(1).grouped(true);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(
     *run_rolling(*AGG_COUNT_NON_NULL),
@@ -219,7 +222,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2)
 {
   auto const preceding = cudf::window_bounds::get(0);
   auto const following = cudf::window_bounds::get(2);
-  auto run_rolling     = rolling_runner{preceding, following}.min_periods(1).grouped(false);
+  auto run_rolling     = rolling_runner{preceding, following};
+  run_rolling.min_periods(1).grouped(false);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL),
                                  ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})});

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import warnings
 from collections import abc
 from collections.abc import MutableSequence, Sequence
 from functools import cached_property
@@ -1946,8 +1945,7 @@ def _reduce(
             skipna=skipna, min_count=min_count
         )
         if isinstance(preprocessed, ColumnBase):
-            dtype = kwargs.pop("dtype", None)
-            return preprocessed.reduce(op, dtype, **kwargs)
+            return preprocessed.reduce(op, **kwargs)
         return preprocessed
 
     def _can_return_nan(self, skipna: bool | None = None) -> bool:
@@ -2110,16 +2108,8 @@ def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self:
             )
         )
 
-    def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike:
-        if dtype is not None:
-            warnings.warn(
-                "dtype is deprecated and will be remove in a future release. "
-                "Cast the result (e.g. .astype) after the operation instead.",
-                FutureWarning,
-            )
-            col_dtype = dtype
-        else:
-            col_dtype = self._reduction_result_dtype(reduction_op)
+    def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
+        col_dtype = self._reduction_result_dtype(reduction_op)
 
         # check empty case
         if len(self) <= self.null_count:
@@ -2148,7 +2138,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike:
             }:
                 scale = -plc_scalar.type().scale()
                 # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
-                p = col_dtype.precision
+                p = col_dtype.precision  # type: ignore[union-attr]
                 nrows = len(self)
                 if reduction_op in {"min", "max"}:
                     new_p = p
@@ -2162,7 +2152,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike:
                     raise NotImplementedError(
                         f"{reduction_op} not implemented for decimal types."
                     )
-                precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)
+                precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)  # type: ignore[union-attr]
                 new_dtype = type(col_dtype)(precision, scale)
                 result_col = result_col.astype(new_dtype)
             elif isinstance(col_dtype, IntervalDtype):
@@ -2322,13 +2312,14 @@ def build_column(
             offset=offset,
             null_count=null_count,
         )
-    elif dtype.type in (np.object_, np.str_):
+    elif dtype == CUDF_STRING_DTYPE:
         return cudf.core.column.StringColumn(
-            data=data,
-            mask=mask,
+            data=data,  # type: ignore[arg-type]
             size=size,
+            dtype=dtype,
+            mask=mask,
             offset=offset,
-            children=children,
+            children=children,  # type: ignore[arg-type]
             null_count=null_count,
         )
     elif isinstance(dtype, ListDtype):

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -21,7 +21,7 @@
 import cudf.core.column.datetime as datetime
 from cudf.api.types import is_integer, is_scalar, is_string_dtype
 from cudf.core._internals import binaryop
-from cudf.core.buffer import acquire_spill_lock
+from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column.column import ColumnBase
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.scalar import pa_scalar_to_plc_scalar
@@ -46,7 +46,6 @@
         ScalarLike,
         SeriesOrIndex,
     )
-    from cudf.core.buffer import Buffer
     from cudf.core.column.lists import ListColumn
     from cudf.core.column.numerical import NumericalColumn
 
@@ -5595,13 +5594,14 @@ class StringColumn(column.ColumnBase):
 
     Parameters
     ----------
+    data : Buffer
+        Buffer of the string data
     mask : Buffer
         The validity mask
     offset : int
         Data offset
     children : Tuple[Column]
-        Two non-null columns containing the string data and offsets
-        respectively
+        Columns containing the offsets
     """
 
     _start_offset: int | None
@@ -5629,14 +5629,20 @@ class StringColumn(column.ColumnBase):
 
     def __init__(
         self,
-        data: Buffer | None = None,
+        data: Buffer,
+        size: int | None,
+        dtype: np.dtype,
         mask: Buffer | None = None,
-        size: int | None = None,  # TODO: make non-optional
         offset: int = 0,
         null_count: int | None = None,
-        children: tuple["column.ColumnBase", ...] = (),
+        children: tuple[column.ColumnBase] = (),  # type: ignore[assignment]
     ):
-        dtype = cudf.api.types.dtype("object")
+        if not isinstance(data, Buffer):
+            raise ValueError("data must be a Buffer")
+        if dtype != CUDF_STRING_DTYPE:
+            raise ValueError(f"dtype must be {CUDF_STRING_DTYPE}")
+        if len(children) > 1:
+            raise ValueError("StringColumn must have at most 1 offset column.")
 
         if size is None:
             for child in children:
@@ -5731,8 +5737,6 @@ def base_size(self) -> int:
     # override for string column
     @property
     def data(self):
-        if self.base_data is None:
-            return None
         if self._data is None:
             if (
                 self.offset == 0

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
@@ -452,14 +452,13 @@ def sum(
         self,
         skipna: bool | None = None,
         min_count: int = 0,
-        dtype: Dtype | None = None,
     ) -> pd.Timedelta:
         return pd.Timedelta(
             # Since sum isn't overridden in Numerical[Base]Column, mypy only
             # sees the signature from Reducible (which doesn't have the extra
             # parameters from ColumnBase._reduce) so we have to ignore this.
             self.astype(np.dtype(np.int64)).sum(  # type: ignore
-                skipna=skipna, min_count=min_count, dtype=dtype
+                skipna=skipna, min_count=min_count
             ),
             unit=self.time_unit,
         ).as_unit(self.time_unit)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -1328,7 +1328,6 @@ def sum(
         self,
         axis=no_default,
         skipna=True,
-        dtype=None,
         numeric_only=False,
         min_count=0,
         **kwargs,
@@ -1342,8 +1341,6 @@ def sum(
             Axis for the function to be applied on.
         skipna: bool, default True
             Exclude NA/null values when computing the result.
-        dtype: data type
-            Data type to cast the result to.
         numeric_only : bool, default False
             If True, includes only float, int, boolean columns.
             If False, will raise error in-case there are
@@ -1373,7 +1370,6 @@ def sum(
             "sum",
             axis=axis,
             skipna=skipna,
-            dtype=dtype,
             numeric_only=numeric_only,
             min_count=min_count,
             **kwargs,
@@ -1384,7 +1380,6 @@ def product(
         self,
         axis=no_default,
         skipna=True,
-        dtype=None,
         numeric_only=False,
         min_count=0,
         **kwargs,
@@ -1398,8 +1393,6 @@ def product(
             Axis for the function to be applied on.
         skipna: bool, default True
             Exclude NA/null values when computing the result.
-        dtype: data type
-            Data type to cast the result to.
         numeric_only : bool, default False
             If True, includes only float, int, boolean columns.
             If False, will raise error in-case there are
@@ -1432,7 +1425,6 @@ def product(
             "prod" if axis in {1, "columns"} else "product",
             axis=axis,
             skipna=skipna,
-            dtype=dtype,
             numeric_only=numeric_only,
             min_count=min_count,
             **kwargs,

diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
@@ -512,14 +512,6 @@ def test_reduction_column_multiindex():
     assert_eq(result, expected)
 
 
-@pytest.mark.parametrize("op", ["sum", "product"])
-def test_dtype_deprecated(op):
-    ser = cudf.Series(range(5))
-    with pytest.warns(FutureWarning):
-        result = getattr(ser, op)(dtype=np.dtype(np.int8))
-    assert isinstance(result, np.int8)
-
-
 @pytest.mark.parametrize(
     "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")]
 )

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
@@ -13,8 +13,11 @@
 import pyarrow as pa
 import pytest
 
+import rmm
+
 import cudf
 from cudf import concat
+from cudf.core.buffer import as_buffer
 from cudf.core.column.string import StringColumn
 from cudf.core.index import Index
 from cudf.testing import assert_eq
@@ -1202,7 +1205,12 @@ def test_string_misc_name(ps_gs, name):
 
 
 def test_string_no_children_properties():
-    empty_col = StringColumn(children=())
+    empty_col = StringColumn(
+        as_buffer(rmm.DeviceBuffer(size=0)),
+        size=0,
+        dtype=np.dtype("object"),
+        children=(),
+    )
     assert empty_col.base_children == ()
     assert empty_col.base_size == 0