From 2174f56a39d3ccf83581186f85d652336ed429aa Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 31 May 2024 13:53:33 +0200 Subject: [PATCH 1/5] API: Change Scalar to raise for invalid Python values This aligns with NumPy, which deprecated this since a while and raises an error now, for example for `Scalar(-1, dtype=np.uint8)`. Necessary to ensure we give the right errors when promotion doesn't up-cast based on values. --- python/cudf/cudf/tests/test_unaops.py | 5 ++++- python/cudf/cudf/utils/dtypes.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index dbbf4fba3a6..5f5d79c1dce 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos(): @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) - slr_device = cudf.Scalar(slr, dtype=dtype) + # The scalar may be out of bounds, so go via array force-cast + # NOTE: This is a change in behavior + slr = np.array(slr).astype(dtype)[()] + slr_device = cudf.Scalar(slr) expect = op(slr_host) got = op(slr_device) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..0dec857ea96 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -253,16 +253,18 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) - if dtype is not None: - if isinstance(val, str) and np.dtype(dtype).kind == "M": + dtype = np.dtype(dtype) + if isinstance(val, str) and dtype.kind == "M": # pd.Timestamp can handle str, but not np.str_ val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) else: - val = val.astype(dtype) + # At least datetimes cannot be converted to scalar via dtype.type: + val = np.array(val, dtype)[()] + else: + val = _maybe_convert_to_default_type( + cudf.api.types.pandas_dtype(type(val)) + ).type(val) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) From 14b9a0999f43d7f0b687bc431e6fd61798253e93 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 1 Jul 2024 12:23:42 +0200 Subject: [PATCH 2/5] TST: Add test to check scalar conversion behavior for python ints --- python/cudf/cudf/tests/test_scalar.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 05a91a8fea3..598d9456040 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -253,6 +253,23 @@ def test_generic_null_scalar_construction_fails(value): cudf.Scalar(value) +@pytest.mark.parametrize( + "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")] +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_scalar_out_of_bounds_pyint_fails(value, dtype): + # Test that we align with NumPy on scalar creation behavior from + # Python integers. + np_ver = np.lib.NumpyVersion(np.__version__) + if np_ver >= "2.0": + with pytest.raises(OverflowError): + cudf.Scalar(value, dtype) + else: + # NumPy allowed this, but it gives a DeprecationWarning on newer + # versions (which cudf did not used to do). + assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value) + + @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] ) From 2e1eb4aa24566aa1f06327541978075ccfb47a91 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 31 May 2024 14:00:39 +0200 Subject: [PATCH 3/5] MAINT: Use `int8(-1)` as default categorical as `np.uint8(-1)` fails --- python/cudf/cudf/core/column/categorical.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index f763d3b4b0c..9aaccca349d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -47,7 +47,9 @@ ) -_DEFAULT_CATEGORICAL_VALUE = -1 +# Using np.int8(-1) to allow silent wrap-around when casting to uint +# it may make sense to make this dtype specific or a function. +_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) class CategoricalAccessor(ColumnMethods): From 174f2b34f6e0eefa0cb0d7aae5d083879fea4510 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 2 Jul 2024 21:33:38 +0200 Subject: [PATCH 4/5] TST: Fixup test mistake --- python/cudf/cudf/tests/test_scalar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 598d9456040..45c07a651d4 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -261,7 +261,7 @@ def test_scalar_out_of_bounds_pyint_fails(value, dtype): # Test that we align with NumPy on scalar creation behavior from # Python integers. np_ver = np.lib.NumpyVersion(np.__version__) - if np_ver >= "2.0": + if np_ver >= "2.0.0": with pytest.raises(OverflowError): cudf.Scalar(value, dtype) else: From 39d240f2a8bc08620ac73c29831be83273f3c837 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 12 Jul 2024 19:19:19 +0200 Subject: [PATCH 5/5] Use packaging.version for numpy version check as suggested --- python/cudf/cudf/tests/test_scalar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 45c07a651d4..195231e9960 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pytest +from packaging import version import rmm @@ -260,8 +261,7 @@ def test_generic_null_scalar_construction_fails(value): def test_scalar_out_of_bounds_pyint_fails(value, dtype): # Test that we align with NumPy on scalar creation behavior from # Python integers. - np_ver = np.lib.NumpyVersion(np.__version__) - if np_ver >= "2.0.0": + if version.parse(np.__version__) >= version.parse("2.0"): with pytest.raises(OverflowError): cudf.Scalar(value, dtype) else: