Skip to content

Commit

Permalink
PERF-#6478: Do not propagate new columns if they're identical to the …
Browse files Browse the repository at this point in the history
…previous ones (#6481)

Signed-off-by: Dmitry Chigarev <[email protected]>
  • Loading branch information
dchigarev authored Aug 21, 2023
1 parent 8ff352c commit cf335c3
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 1 deletion.
9 changes: 9 additions & 0 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,15 @@ def _set_columns(self, new_columns):
The new column labels.
"""
if self.has_materialized_columns:
# do not set new columns if they're identical to the previous ones
if (
isinstance(new_columns, pandas.Index)
and self.columns.identical(new_columns)
) or (
not isinstance(new_columns, pandas.Index)
and np.array_equal(self.columns.values, new_columns)
):
return
new_columns = self._validate_set_axis(new_columns, self._columns_cache)
if self.has_materialized_dtypes:
self.dtypes.index = new_columns
Expand Down
3 changes: 2 additions & 1 deletion modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2401,7 +2401,8 @@ def __setattr__(self, key, value):
# before it appears in __dict__.
if key in ("_query_compiler", "_siblings", "_cache") or key in self.__dict__:
pass
elif key in self and key not in dir(self):
# we have to check for the key in `dir(self)` first in order not to trigger columns computation
elif key not in dir(self) and key in self:
self.__setitem__(key, value)
# Note: return immediately so we don't keep this `key` as dataframe state.
# `__getattr__` will return the columns not present in `dir(self)`, so we do not need
Expand Down
73 changes: 73 additions & 0 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,29 @@ def validate_partitions_cache(df):
assert df._partitions[i, j].width() == column_widths[j]


def remove_axis_cache(df, axis, remove_lengths=True):
"""
Remove index/columns cache for the passed dataframe.
Parameters
----------
df : modin.pandas.DataFrame
axis : int
0 - remove index cache, 1 - remove columns cache.
remove_lengths : bool, default: True
Whether to remove row lengths/column widths cache.
"""
mf = df._query_compiler._modin_frame
if axis == 0:
mf.set_index_cache(None)
if remove_lengths:
mf._row_lengths_cache = None
else:
mf.set_columns_cache(None)
if remove_lengths:
mf._column_widths_cache = None


def test_aligning_blocks():
# Test problem when modin frames have the same number of rows, but different
# blocks (partition.list_of_blocks). See #2322 for details
Expand Down Expand Up @@ -1121,3 +1144,53 @@ def test_reindex_preserve_dtypes(kwargs):

reindexed_df = df.reindex(**kwargs)
assert reindexed_df._query_compiler._modin_frame.has_materialized_dtypes


def test_skip_set_columns():
"""
Verifies that the mechanism of skipping the actual ``._set_columns()`` call in case
the new columns are identical to the previous ones works properly.
In this test, we rely on the ``modin_frame._deferred_column`` attribute.
The new indices propagation is done lazily, and the ``deferred_column`` attribute
indicates whether there's a new indices propagation pending.
"""
df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
df.columns = ["col1", "col10"]
# Verifies that the new columns were successfully set in case they're actually new
assert df._query_compiler._modin_frame._deferred_column
assert np.all(df.columns.values == ["col1", "col10"])

df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
df.columns = ["col1", "col2"]
# Verifies that the new columns weren't set if they're equal to the previous ones
assert not df._query_compiler._modin_frame._deferred_column

df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
df.columns = pandas.Index(["col1", "col2"], name="new name")
# Verifies that the new columns were successfully set in case they's new metadata
assert df.columns.name == "new name"

df = pd.DataFrame(
{("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
)
df.columns = df.columns.copy()
# Verifies that the new columns weren't set if they're equal to the previous ones
assert not df._query_compiler._modin_frame._deferred_column

df = pd.DataFrame(
{("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
)
new_cols = df.columns[::-1]
df.columns = new_cols
# Verifies that the new columns were successfully set in case they're actually new
assert df._query_compiler._modin_frame._deferred_column
assert df.columns.equals(new_cols)

df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
remove_axis_cache(df, axis=1)
df.columns = ["col1", "col2"]
# Verifies that the computation of the old columns wasn't triggered for the sake
# of equality comparison, in this case the new columns should be set unconditionally,
# meaning that the '_deferred_column' has to be True
assert df._query_compiler._modin_frame._deferred_column

0 comments on commit cf335c3

Please sign in to comment.