From 3b3bba9a83b6bec35e86826800004987afe24d47 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 10 Aug 2023 13:38:27 +0200 Subject: [PATCH] FIX-#6367: Enable support for 'groupby.size()' in reshuffling groupby (#6370) Signed-off-by: Dmitry Chigarev --- .../storage_formats/pandas/query_compiler.py | 10 +++++++- modin/pandas/groupby.py | 12 ++-------- .../storage_formats/pandas/test_internals.py | 23 ++++++++++++++++++- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 9a5280844c4..4c069c23266 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -3480,7 +3480,15 @@ def _groupby_shuffle( original_agg_func = agg_func def agg_func(grp, *args, **kwargs): - return agg_method(grp, original_agg_func, *args, **kwargs) + result = agg_method(grp, original_agg_func, *args, **kwargs) + + # Convert Series to DataFrame + if result.ndim == 1: + result = result.to_frame( + MODIN_UNNAMED_SERIES_LABEL if result.name is None else result.name + ) + + return result result = obj._modin_frame.groupby( axis=axis, diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 7436a7f1637..0732d43db54 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -963,16 +963,8 @@ def size(self): idx_name=self._idx_name, **self._kwargs, ).size() - work_object = type(self)( - self._df, - self._by, - self._axis, - drop=False, - idx_name=None, - **self._kwargs, - ) - result = work_object._wrap_aggregation( - type(work_object._query_compiler).groupby_size, + result = self._wrap_aggregation( + type(self._query_compiler).groupby_size, numeric_only=False, ) if not isinstance(result, Series): diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index 53332c330b8..8c82ea2de75 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -17,13 +17,15 @@ test_data_values, df_equals, ) -from modin.config import NPartitions, Engine, MinPartitionSize +from modin.config import NPartitions, Engine, MinPartitionSize, ExperimentalGroupbyImpl from modin.distributed.dataframe.pandas import from_partitions from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas +from modin.utils import try_cast_to_pandas import numpy as np import pandas import pytest +import unittest.mock as mock NPartitions.put(4) @@ -1091,6 +1093,25 @@ def test_setitem_bool_preserve_dtypes(): assert df._query_compiler._modin_frame.has_materialized_dtypes +@pytest.mark.parametrize( + "modify_config", [{ExperimentalGroupbyImpl: True}], indirect=True +) +def test_groupby_size_shuffling(modify_config): + # verifies that 'groupby.size()' works with reshuffling implementation + # https://github.com/modin-project/modin/issues/6367 + df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]}) + modin_frame = df._query_compiler._modin_frame + + with mock.patch.object( + modin_frame, + "_apply_func_to_range_partitioning", + wraps=modin_frame._apply_func_to_range_partitioning, + ) as shuffling_method: + try_cast_to_pandas(df.groupby("a").size()) + + shuffling_method.assert_called() + + @pytest.mark.parametrize( "kwargs", [dict(axis=0, labels=[]), dict(axis=1, labels=["a"]), dict(axis=1, labels=[])],