Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF-#6398: Improved performance of list-like objects insertion into DataFrames #6476

Merged
merged 1 commit into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2757,6 +2757,8 @@ def getitem_row_array(self, key):
)

def setitem(self, axis, key, value):
if axis == 1:
value = self._wrap_column_data(value)
return self._setitem(axis=axis, key=key, value=value, how=None)

def _setitem(self, axis, key, value, how="inner"):
Expand Down Expand Up @@ -2922,6 +2924,7 @@ def _compute_duplicated(df): # pragma: no cover
# return a new one from here and let the front end handle the inplace
# update.
def insert(self, loc, column, value):
value = self._wrap_column_data(value)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks fine as a temporary hack.

However, there's a TODO message a few lines below mentioning that the insertion of a non-distributed data can be speeded-up significantly by changing apply_full_axis... call to the apply_select_indices(item_to_distribute=value, ...)

# TODO: rework by passing list-like values to `apply_select_indices`
# as an item to distribute

This way, we wouldn't need to wrap the newly created dataframe but propagate the value directly to the partitions, which, in theory, should be quite fast.

Could you please quickly check the approach from TODO message makes sense? If it'll be too inefficient or too complex to implement, we can stick to the changes introduced by this PR

if isinstance(value, type(self)):
value.columns = [column]
return self.insert_item(axis=1, loc=loc, value=value, how=None)
Expand Down Expand Up @@ -2954,6 +2957,25 @@ def insert(df, internal_indices=[]): # pragma: no cover
)
return self.__constructor__(new_modin_frame)

def _wrap_column_data(self, data):
"""
If the data is list-like, create a single column query compiler.

Parameters
----------
data : any

Returns
-------
data or PandasQueryCompiler
"""
if is_list_like(data):
return self.from_pandas(
pandas.DataFrame(pandas.Series(data, index=self.index)),
data_cls=type(self._modin_frame),
)
return data

# END Insert

def explode(self, column):
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2511,7 +2511,7 @@ def setitem_unhashable_key(df, value):
value = value.T.reshape(-1)
if len(self) > 0:
value = value[: len(self)]
if not isinstance(value, (Series, Categorical, np.ndarray)):
if not isinstance(value, (Series, Categorical, np.ndarray, list, range)):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

list(value) makes a copy if value is already a list?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it makes an unnecessary copy.

value = list(value)

if not self._query_compiler.lazy_execution and len(self.index) == 0:
Expand Down
37 changes: 37 additions & 0 deletions modin/pandas/test/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership. The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest

from modin.config import Engine, StorageFormat


def pytest_collection_modifyitems(items):
try:
if (
Engine.get() in ("Ray", "Unidist", "Dask", "Python")
and StorageFormat.get() != "Base"
):
for item in items:
if item.name in (
"test_dataframe_dt_index[3s-both-DateCol-0]",
"test_dataframe_dt_index[3s-right-DateCol-0]",
Comment on lines +27 to +28
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems that these test fail anytime the execution goes to this if-else branch

pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T")
modin_df[on] = pd.date_range("22/06/1941", periods=12, freq="T")

can we call .xfail() directly from there?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can, but we will definitely forget to remove this call when the problem is fixed.

):
item.add_marker(
pytest.mark.xfail(
reason="https://github.com/modin-project/modin/issues/6399"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't like to merge perf fixes that break current tests.

)
)
except ImportError:
# No engine
...

Check notice

Code scanning / CodeQL

Statement has no effect Note test

This statement has no effect.
17 changes: 15 additions & 2 deletions modin/pandas/test/dataframe/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
test_data_large_categorical_dataframe,
default_to_pandas_ignore_string,
)
from modin.config import NPartitions, StorageFormat
from modin.config import NPartitions, StorageFormat, Engine
from modin.test.test_utils import warns_that_defaulting_to_pandas

NPartitions.put(4)
Expand Down Expand Up @@ -850,7 +850,20 @@ def test_resampler_functions_with_arg(rule, axis, method_arg):
@pytest.mark.parametrize("rule", ["5T"])
@pytest.mark.parametrize("closed", ["left", "right"])
@pytest.mark.parametrize("label", ["right", "left"])
@pytest.mark.parametrize("on", [None, "DateColumn"])
@pytest.mark.parametrize(
"on",
[
None,
pytest.param(
"DateColumn",
marks=pytest.mark.xfail(
condition=Engine.get() in ("Ray", "Unidist", "Dask", "Python")
and StorageFormat.get() != "Base",
reason="https://github.com/modin-project/modin/issues/6399",
),
),
],
)
@pytest.mark.parametrize("level", [None, 1])
def test_resample_specific(rule, closed, label, on, level):
data, index = (
Expand Down
6 changes: 5 additions & 1 deletion modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2748,7 +2748,11 @@ def test_rolling_timedelta_window(center, closed, as_index, on):
pd_df = md_df._to_pandas()

if StorageFormat.get() == "Pandas":
assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2
assert (
md_df._query_compiler._modin_frame._partitions.shape[1] == 2
if on is None
else 3
)

md_window = md_df.groupby("by", as_index=as_index).rolling(
datetime.timedelta(days=3), center=center, closed=closed, on=on
Expand Down
Loading