PERF-#6478: Do not propagate new columns if they're identical to the …

…previous ones (#6481) Signed-off-by: Dmitry Chigarev <[email protected]>
modin-project · Aug 21, 2023 · cf335c3 · cf335c3
1 parent 8ff352c
commit cf335c3
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 1 deletion.
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -579,6 +579,15 @@ def _set_columns(self, new_columns):
            The new column labels.
         """
         if self.has_materialized_columns:
+            # do not set new columns if they're identical to the previous ones
+            if (
+                isinstance(new_columns, pandas.Index)
+                and self.columns.identical(new_columns)
+            ) or (
+                not isinstance(new_columns, pandas.Index)
+                and np.array_equal(self.columns.values, new_columns)
+            ):
+                return
             new_columns = self._validate_set_axis(new_columns, self._columns_cache)
             if self.has_materialized_dtypes:
                 self.dtypes.index = new_columns

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -2401,7 +2401,8 @@ def __setattr__(self, key, value):
         #   before it appears in __dict__.
         if key in ("_query_compiler", "_siblings", "_cache") or key in self.__dict__:
             pass
-        elif key in self and key not in dir(self):
+        # we have to check for the key in `dir(self)` first in order not to trigger columns computation
+        elif key not in dir(self) and key in self:
             self.__setitem__(key, value)
             # Note: return immediately so we don't keep this `key` as dataframe state.
             # `__getattr__` will return the columns not present in `dir(self)`, so we do not need

diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
@@ -148,6 +148,29 @@ def validate_partitions_cache(df):
             assert df._partitions[i, j].width() == column_widths[j]
 
 
+def remove_axis_cache(df, axis, remove_lengths=True):
+    """
+    Remove index/columns cache for the passed dataframe.
+
+    Parameters
+    ----------
+    df : modin.pandas.DataFrame
+    axis : int
+        0 - remove index cache, 1 - remove columns cache.
+    remove_lengths : bool, default: True
+        Whether to remove row lengths/column widths cache.
+    """
+    mf = df._query_compiler._modin_frame
+    if axis == 0:
+        mf.set_index_cache(None)
+        if remove_lengths:
+            mf._row_lengths_cache = None
+    else:
+        mf.set_columns_cache(None)
+        if remove_lengths:
+            mf._column_widths_cache = None
+
+
 def test_aligning_blocks():
     # Test problem when modin frames have the same number of rows, but different
     # blocks (partition.list_of_blocks). See #2322 for details
@@ -1121,3 +1144,53 @@ def test_reindex_preserve_dtypes(kwargs):
 
     reindexed_df = df.reindex(**kwargs)
     assert reindexed_df._query_compiler._modin_frame.has_materialized_dtypes
+
+
+def test_skip_set_columns():
+    """
+    Verifies that the mechanism of skipping the actual ``._set_columns()`` call in case
+    the new columns are identical to the previous ones works properly.
+
+    In this test, we rely on the ``modin_frame._deferred_column`` attribute.
+    The new indices propagation is done lazily, and the ``deferred_column`` attribute
+    indicates whether there's a new indices propagation pending.
+    """
+    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
+    df.columns = ["col1", "col10"]
+    # Verifies that the new columns were successfully set in case they're actually new
+    assert df._query_compiler._modin_frame._deferred_column
+    assert np.all(df.columns.values == ["col1", "col10"])
+
+    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
+    df.columns = ["col1", "col2"]
+    # Verifies that the new columns weren't set if they're equal to the previous ones
+    assert not df._query_compiler._modin_frame._deferred_column
+
+    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
+    df.columns = pandas.Index(["col1", "col2"], name="new name")
+    # Verifies that the new columns were successfully set in case they's new metadata
+    assert df.columns.name == "new name"
+
+    df = pd.DataFrame(
+        {("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
+    )
+    df.columns = df.columns.copy()
+    # Verifies that the new columns weren't set if they're equal to the previous ones
+    assert not df._query_compiler._modin_frame._deferred_column
+
+    df = pd.DataFrame(
+        {("a", "col1"): [1, 2, 3], ("a", "col2"): [3, 4, 5], ("b", "col1"): [6, 7, 8]}
+    )
+    new_cols = df.columns[::-1]
+    df.columns = new_cols
+    # Verifies that the new columns were successfully set in case they're actually new
+    assert df._query_compiler._modin_frame._deferred_column
+    assert df.columns.equals(new_cols)
+
+    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [3, 4, 5]})
+    remove_axis_cache(df, axis=1)
+    df.columns = ["col1", "col2"]
+    # Verifies that the computation of the old columns wasn't triggered for the sake
+    # of equality comparison, in this case the new columns should be set unconditionally,
+    # meaning that the '_deferred_column' has to be True
+    assert df._query_compiler._modin_frame._deferred_column