Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX-#5091: Handle pd.Grouper objects correctly #6174

Merged
merged 6 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3114,7 +3114,12 @@ def _groupby_internal_columns(self, by, drop):
else:
if not isinstance(by, list):
by = [by] if by is not None else []
internal_by = [o for o in by if hashable(o) and o in self.columns]
internal_by = []
for o in by:
if isinstance(o, pandas.Grouper) and o.key in self.columns:
internal_by.append(o.key)
devin-petersohn marked this conversation as resolved.
Show resolved Hide resolved
elif hashable(o) and o in self.columns:
internal_by.append(o)
internal_qc = (
[self.getitem_column_array(internal_by)] if len(internal_by) else []
)
Expand Down
29 changes: 18 additions & 11 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,35 +476,42 @@ def groupby(
drop = by._parent is self
idx_name = by.name
by = by._query_compiler
elif isinstance(by, pandas.Grouper):
drop = by.key in self
elif is_list_like(by):
# fastpath for multi column groupby
if axis == 0 and all(
(
(hashable(o) and (o in self))
or isinstance(o, Series)
or (isinstance(o, pandas.Grouper) and o.key in self)
or (is_list_like(o) and len(o) == len(self.axes[axis]))
)
for o in by
):
# We want to split 'by's into those that belongs to the self (internal_by)
# and those that doesn't (external_by)
internal_by, external_by = [], []
has_external = False
processed_by = []

for current_by in by:
if hashable(current_by):
internal_by.append(current_by)
if isinstance(current_by, pandas.Grouper):
processed_by.append(current_by)
has_external = True
elif hashable(current_by):
processed_by.append(current_by)
elif isinstance(current_by, Series):
if current_by._parent is self:
internal_by.append(current_by.name)
processed_by.append(current_by.name)
else:
external_by.append(current_by._query_compiler)
processed_by.append(current_by._query_compiler)
has_external = True
else:
external_by.append(current_by)
has_external = True
processed_by.append(current_by)

by = internal_by + external_by
by = processed_by

if len(external_by) == 0:
by = self[internal_by]._query_compiler
if not has_external:
by = self[processed_by]._query_compiler

drop = True
else:
Expand Down
12 changes: 10 additions & 2 deletions modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,11 +573,19 @@ def _internal_by(self):
internal_by = tuple()
if self._drop:
if is_list_like(self._by):
internal_by = tuple(by for by in self._by if isinstance(by, str))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you're fixing the order here, can you port the test case from #6164 and address the comments I left there?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

resolved. there's a separate test case for order now, but this doesn't fix #6164

internal_by_list = []
for by in self._by:
if isinstance(by, str):
internal_by_list.append(by)
elif isinstance(by, pandas.Grouper):
internal_by_list.append(by.key)
internal_by = tuple(internal_by_list)
elif isinstance(self._by, pandas.Grouper):
internal_by = tuple([self._by.key])
else:
ErrorMessage.catch_bugs_and_request_email(
failure_condition=not isinstance(self._by, BaseQueryCompiler),
extra_log=f"When 'drop' is True, 'by' must be either list-like or a QueryCompiler, met: {type(self._by)}.",
extra_log=f"When 'drop' is True, 'by' must be either list-like, Grouper, or a QueryCompiler, met: {type(self._by)}.",
)
internal_by = tuple(self._by.columns)

Expand Down
38 changes: 38 additions & 0 deletions modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2535,3 +2535,41 @@ def test_skew_corner_cases():
# https://github.com/modin-project/modin/issues/5545
modin_df, pandas_df = create_test_dfs({"col0": [1, 1], "col1": [171, 137]})
eval_general(modin_df, pandas_df, lambda df: df.groupby("col0").skew())


def test_groupby_with_grouper():
# See https://github.com/modin-project/modin/issues/5091 for more details
data = {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pyrito can you add some more columns too? We should have multiple column partitions on ray/dask

"id": [1, 2],
"time_stamp": ["2022-03-24 23:53:09", "2022-03-24 21:53:09"],
"count": [5, 5],
}
modin_df, pandas_df = create_test_dfs(data)
pyrito marked this conversation as resolved.
Show resolved Hide resolved

# modin Grouper is the same as the pandas Grouper objects
# test just for one key
eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(pandas.Grouper(key="time_stamp", freq="D").mean()),
)

data = {
pyrito marked this conversation as resolved.
Show resolved Hide resolved
"Publish date": [
pd.Timestamp("2000-01-02"),
pd.Timestamp("2000-01-02"),
pd.Timestamp("2000-01-09"),
pd.Timestamp("2000-01-16"),
],
"ID": [0, 1, 1, 3],
"Price": [10, 20, 30, 40],
}
modin_df, pandas_df = create_test_dfs(data)

eval_general(
modin_df,
pandas_df,
lambda df: df.groupby(
[pandas.Grouper(key="Publish date", freq="1M"), "ID"]
).sum(),
)