From bc41d1edecad9b73f158adbe87f3c785871835dc Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 6 Feb 2025 20:56:31 +0100 Subject: [PATCH 1/2] Add missing standard includes (#17928) With the upcoming CCCL release we moved some includes around and it seems that we relied on transitively including a standard library header in some files. Fix that and include all the headers used in at least those two files. Authors: - Michael Schellenberger Costa (https://github.com/miscco) - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17928 --- cpp/include/cudf/column/column_device_view.cuh | 3 ++- cpp/src/copying/pack.cpp | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index aacb5ccfede..990dfee2d17 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,6 +39,7 @@ #include #include +#include #include /** diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 0c6b7977752..869a83cf369 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -20,6 +20,12 @@ #include +#include +#include +#include +#include +#include + namespace cudf { namespace detail { From 6a032290eb8224802f2be8f9c8d6acf422b647f5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 6 Feb 2025 17:43:29 -0600 Subject: [PATCH 2/2] Patch `__init__` of `cudf` constructors to parse through `cudf.pandas` proxy objects (#17878) Fixes: https://github.com/rapidsai/cuml/issues/6232 This PR patches `Series`, `Index` and `DataFrame` constructors in such a way that true objects are extracted from `cudf.pandas` proxy objects if they are passed to any of these constructors. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17878 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 81 +++++++++++++++++++ python/cudf/cudf/pandas/module_accelerator.py | 4 +- .../cudf_pandas_tests/test_cudf_pandas.py | 62 ++++++++++++++ 3 files changed, 145 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 526778b4ecb..5ec2b4b4a03 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import abc import copyreg +import functools import importlib import os import pickle @@ -37,6 +38,7 @@ _FunctionProxy, _maybe_wrap_result, _Unusable, + is_proxy_object, make_final_proxy_type as _make_final_proxy_type, make_intermediate_proxy_type as _make_intermediate_proxy_type, register_proxy_func, @@ -1734,6 +1736,85 @@ def _unpickle_obj(pickled_args): return obj +# Save the original __init__ methods 
+_original_Series_init = cudf.Series.__init__ +_original_DataFrame_init = cudf.DataFrame.__init__ +_original_Index_init = cudf.Index.__init__ +_original_IndexMeta_call = cudf.core.index.IndexMeta.__call__ + + +def wrap_init(original_init): + @functools.wraps(original_init) + def wrapped_init(self, data=None, *args, **kwargs): + if is_proxy_object(data): + data = data.as_gpu_object() + if ( + isinstance(data, type(self)) + and len(args) == 0 + and len(kwargs) == 0 + ): + # This short-circuits the constructor to avoid + # unnecessary work when the data is already a + # proxy object of the same type. + # It is a common case in `cuml` and `xgboost`. + # For perf impact see: + # https://github.com/rapidsai/cudf/pull/17878/files#r1936469215 + self.__dict__.update(data.__dict__) + return + original_init(self, data, *args, **kwargs) + + return wrapped_init + + +def wrap_call(original_call): + @functools.wraps(original_call) + def wrapped_call(cls, data, *args, **kwargs): + if is_proxy_object(data): + data = data.as_gpu_object() + return original_call(cls, data, *args, **kwargs) + + return wrapped_call + + +@functools.wraps(_original_DataFrame_init) +def DataFrame_init_(self, data, index=None, columns=None, *args, **kwargs): + data_is_proxy = is_proxy_object(data) + + if data_is_proxy: + data = data.as_gpu_object() + if is_proxy_object(index): + index = index.as_gpu_object() + if is_proxy_object(columns): + columns = columns.as_cpu_object() + if ( + ( + (data_is_proxy and isinstance(data, type(self))) + and (index is None) + and (columns is None) + ) + and len(args) == 0 + and len(kwargs) == 0 + ): + self.__dict__.update(data.__dict__) + return + _original_DataFrame_init(self, data, index, columns, *args, **kwargs) + + +def initial_setup(): + """ + This is a one-time setup function that can contain + any initialization code that needs to be run once + when the module is imported. Currently, it is used + to wrap the __init__ methods and enable pandas compatibility mode. 
+ """ + cudf.Series.__init__ = wrap_init(_original_Series_init) + cudf.Index.__init__ = wrap_init(_original_Index_init) + cudf.DataFrame.__init__ = DataFrame_init_ + cudf.core.index.IndexMeta.__call__ = wrap_call(_original_IndexMeta_call) + + cudf.set_option("mode.pandas_compatible", True) + + copyreg.dispatch_table[pd.Timestamp] = _reduce_obj # same reducer/unpickler can be used for Timedelta: copyreg.dispatch_table[pd.Timedelta] = _reduce_obj diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 818971105cb..c4020887907 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -595,10 +595,10 @@ def install( ) mode = deduce_cudf_pandas_mode(slow_lib, fast_lib) if mode.use_fast_lib: - pandas_wrappers = importlib.import_module( + lib_wrappers = importlib.import_module( f".._wrappers.{mode.slow_lib}", __name__ ) - pandas_wrappers.cudf.set_option("mode.pandas_compatible", True) + lib_wrappers.initial_setup() try: (self,) = ( p diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 81e0f09f795..800702a6544 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -5,11 +5,13 @@ import collections import contextlib import copy +import cProfile import datetime import operator import os import pathlib import pickle +import pstats import subprocess import tempfile import time @@ -1910,6 +1912,66 @@ def test_series_dtype_property(): assert expected == actual +def assert_functions_called(profiler, functions): + # Process profiling data + stream = StringIO() + stats = pstats.Stats(profiler, stream=stream) + + # Get all called functions as (filename, lineno, func_name) + called_functions = {func[2] for func in stats.stats.keys()} + print(called_functions) + for func_str in functions: + assert func_str in called_functions + + +def 
test_cudf_series_from_cudf_pandas(): + s = xpd.Series([1, 2, 3]) + + with cProfile.Profile() as profiler: + gs = cudf.Series(s) + + assert_functions_called( + profiler, ["as_gpu_object", ""] + ) + + tm.assert_equal(s.as_gpu_object(), gs) + + +def test_cudf_dataframe_from_cudf_pandas(): + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + + with cProfile.Profile() as profiler: + gdf = cudf.DataFrame(df) + + assert_functions_called( + profiler, ["as_gpu_object", ""] + ) + tm.assert_frame_equal(df.as_gpu_object(), gdf) + + df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]}) + gdf = cudf.DataFrame( + {"a": xpd.Series([1, 2, 3]), "b": xpd.Series([1, 2, 3])} + ) + + tm.assert_frame_equal(df.as_gpu_object(), gdf) + + df = xpd.DataFrame({0: [1, 2, 3], 1: [1, 2, 3]}) + gdf = cudf.DataFrame( + [xpd.Series([1, 1]), xpd.Series([2, 2]), xpd.Series([3, 3])] + ) + + tm.assert_frame_equal(df.as_gpu_object(), gdf) + + +def test_cudf_index_from_cudf_pandas(): + idx = xpd.Index([1, 2, 3]) + with cProfile.Profile() as profiler: + gidx = cudf.Index(idx) + assert_functions_called(profiler, ["as_gpu_object"]) + + tm.assert_equal(idx.as_gpu_object(), gidx) + + def test_numpy_data_access(): s = pd.Series([1, 2, 3]) xs = xpd.Series([1, 2, 3])