Skip to content

Commit

Permalink
FIX-#6367: Enable support for 'groupby.size()' in reshuffling groupby (
Browse files Browse the repository at this point in the history
…#6370)

Signed-off-by: Dmitry Chigarev <[email protected]>
  • Loading branch information
dchigarev authored and RehanSD committed Aug 22, 2023
1 parent 958a900 commit be19563
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 12 deletions.
10 changes: 9 additions & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3460,7 +3460,15 @@ def _groupby_shuffle(
original_agg_func = agg_func

def agg_func(grp, *args, **kwargs):
return agg_method(grp, original_agg_func, *args, **kwargs)
result = agg_method(grp, original_agg_func, *args, **kwargs)

# Convert Series to DataFrame
if result.ndim == 1:
result = result.to_frame(
MODIN_UNNAMED_SERIES_LABEL if result.name is None else result.name
)

return result

result = obj._modin_frame.groupby(
axis=axis,
Expand Down
12 changes: 2 additions & 10 deletions modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,16 +963,8 @@ def size(self):
idx_name=self._idx_name,
**self._kwargs,
).size()
work_object = type(self)(
self._df,
self._by,
self._axis,
drop=False,
idx_name=None,
**self._kwargs,
)
result = work_object._wrap_aggregation(
type(work_object._query_compiler).groupby_size,
result = self._wrap_aggregation(
type(self._query_compiler).groupby_size,
numeric_only=False,
)
if not isinstance(result, Series):
Expand Down
34 changes: 33 additions & 1 deletion modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
test_data_values,
df_equals,
)
from modin.config import NPartitions, Engine, MinPartitionSize
from modin.config import NPartitions, Engine, MinPartitionSize, ExperimentalGroupbyImpl
from modin.distributed.dataframe.pandas import from_partitions
from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas
from modin.utils import try_cast_to_pandas

import numpy as np
import pandas
import pytest
import unittest.mock as mock

NPartitions.put(4)

Expand Down Expand Up @@ -1062,3 +1064,33 @@ def test_setitem_bool_preserve_dtypes():
# scalar as a col_loc
df.loc[indexer, "a"] = 2.0
assert df._query_compiler._modin_frame.has_materialized_dtypes


@pytest.mark.parametrize(
"modify_config", [{ExperimentalGroupbyImpl: True}], indirect=True
)
def test_groupby_size_shuffling(modify_config):
# verifies that 'groupby.size()' works with reshuffling implementation
# https://github.com/modin-project/modin/issues/6367
df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
modin_frame = df._query_compiler._modin_frame

with mock.patch.object(
modin_frame,
"_apply_func_to_range_partitioning",
wraps=modin_frame._apply_func_to_range_partitioning,
) as shuffling_method:
try_cast_to_pandas(df.groupby("a").size())

shuffling_method.assert_called()


@pytest.mark.parametrize(
"kwargs",
[dict(axis=0, labels=[]), dict(axis=1, labels=["a"]), dict(axis=1, labels=[])],
)
def test_reindex_preserve_dtypes(kwargs):
df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})

reindexed_df = df.reindex(**kwargs)
assert reindexed_df._query_compiler._modin_frame.has_materialized_dtypes

0 comments on commit be19563

Please sign in to comment.