feat: Adding dataframe estimated size (#1549)

DeaMariaLeon · web-flow · commit 36afa70945fa · 2024-12-10T13:10:37.000Z
diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md
@@ -11,6 +11,7 @@
         - columns
         - drop
         - drop_nulls
+        - estimated_size
         - filter
         - gather_every
         - get_column
diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py
@@ -21,6 +21,7 @@
 from narwhals.utils import generate_temporary_column_name
 from narwhals.utils import is_sequence_but_not_str
 from narwhals.utils import parse_columns_to_drop
+from narwhals.utils import scale_bytes
 
 if TYPE_CHECKING:
     from types import ModuleType
@@ -35,6 +36,7 @@
     from narwhals._arrow.series import ArrowSeries
     from narwhals._arrow.typing import IntoArrowExpr
     from narwhals.dtypes import DType
+    from narwhals.typing import SizeUnit
     from narwhals.utils import Version
 
 
@@ -285,6 +287,10 @@ def schema(self: Self) -> dict[str, DType]:
     def collect_schema(self: Self) -> dict[str, DType]:
         return self.schema
 
+    def estimated_size(self: Self, unit: SizeUnit) -> int | float:
+        """Return the native frame's `nbytes`, scaled to `unit`."""
+        return scale_bytes(self._native_frame.nbytes, unit=unit)
+
     @property
     def columns(self: Self) -> list[str]:
         return self._native_frame.schema.names  # type: ignore[no-any-return]
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
@@ -25,6 +25,7 @@
 from narwhals.utils import import_dtypes_module
 from narwhals.utils import is_sequence_but_not_str
 from narwhals.utils import parse_columns_to_drop
+from narwhals.utils import scale_bytes
 
 if TYPE_CHECKING:
     from types import ModuleType
@@ -38,6 +39,7 @@
     from narwhals._pandas_like.series import PandasLikeSeries
     from narwhals._pandas_like.typing import IntoPandasLikeExpr
     from narwhals.dtypes import DType
+    from narwhals.typing import SizeUnit
     from narwhals.utils import Version
 
 
@@ -371,6 +373,10 @@ def drop_nulls(self, subset: str | list[str] | None) -> Self:
         plx = self.__narwhals_namespace__()
         return self.filter(~plx.any_horizontal(plx.col(*subset).is_null()))
 
+    def estimated_size(self, unit: SizeUnit) -> int | float:
+        """Sum the native frame's `memory_usage(deep=True)` and scale it to `unit`."""
+        return scale_bytes(self._native_frame.memory_usage(deep=True).sum(), unit=unit)
+
     def with_row_index(self, name: str) -> Self:
         row_index = create_compliant_series(
             range(len(self._native_frame)),
diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
@@ -36,6 +36,7 @@
     from narwhals.typing import IntoDataFrame
     from narwhals.typing import IntoExpr
     from narwhals.typing import IntoFrame
+    from narwhals.typing import SizeUnit
     from narwhals.utils import Implementation
 
 FrameT = TypeVar("FrameT", bound="IntoFrame")
@@ -764,6 +765,50 @@ def get_column(self, name: str) -> Series[Any]:
             level=self._level,
         )
 
+    def estimated_size(self, unit: SizeUnit = "b") -> int | float:
+        """Return an estimate of the total (heap) allocated size of the `DataFrame`.
+
+        The estimate is expressed in `unit`, which defaults to bytes (`"b"`).
+
+        Arguments:
+            unit: Unit of the result: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes',
+                'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.
+
+        Returns:
+            The estimated size in `unit`, as computed by the underlying backend.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoDataFrameT
+            >>> data = {
+            ...     "foo": [1, 2, 3],
+            ...     "bar": [6.0, 7.0, 8.0],
+            ...     "ham": ["a", "b", "c"],
+            ... }
+            >>> df_pd = pd.DataFrame(data)
+            >>> df_pl = pl.DataFrame(data)
+            >>> df_pa = pa.table(data)
+
+            Let's define a dataframe-agnostic function:
+
+            >>> def agnostic_estimated_size(df_native: IntoDataFrameT) -> int | float:
+            ...     df = nw.from_native(df_native)
+            ...     return df.estimated_size()
+
+            We can then pass either pandas, Polars or PyArrow to `agnostic_estimated_size`:
+
+            >>> agnostic_estimated_size(df_pd)
+            np.int64(330)
+            >>> agnostic_estimated_size(df_pl)
+            51
+            >>> agnostic_estimated_size(df_pa)
+            63
+        """
+        return self._compliant_frame.estimated_size(unit=unit)  # type: ignore[no-any-return]
+
     @overload
     def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ...
     @overload
diff --git a/narwhals/typing.py b/narwhals/typing.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Literal
 from typing import Protocol
 from typing import TypeVar
 from typing import Union
@@ -173,6 +174,19 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ...
     ...     return s.abs().to_native()
 """
 
+SizeUnit: TypeAlias = Literal[  # unit names accepted by `estimated_size`
+    "b",  # short forms
+    "kb",
+    "mb",
+    "gb",
+    "tb",
+    "bytes",  # long forms, in the same order as the short forms above
+    "kilobytes",
+    "megabytes",
+    "gigabytes",
+    "terabytes",
+]
+
 
 class DTypes:
     Int64: type[dtypes.Int64]
diff --git a/narwhals/utils.py b/narwhals/utils.py
@@ -43,6 +43,7 @@
     from narwhals.series import Series
     from narwhals.typing import DTypes
     from narwhals.typing import IntoSeriesT
+    from narwhals.typing import SizeUnit
 
     FrameOrSeriesT = TypeVar(
         "FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]]
@@ -681,6 +682,31 @@ def maybe_convert_dtypes(
     return obj_any  # type: ignore[no-any-return]
 
 
+def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
+    """Convert a size in bytes to another size unit.
+
+    Arguments:
+        sz: Size in bytes.
+        unit: Unit to convert into, in short ("kb") or long ("kilobytes") form.
+
+    Returns:
+        `sz` unchanged for bytes, otherwise `sz` divided by a power of 1024.
+
+    Raises:
+        ValueError: If `unit` is not one of the accepted size units.
+    """
+    short_forms = ("b", "kb", "mb", "gb", "tb")
+    long_forms = ("bytes", "kilobytes", "megabytes", "gigabytes", "terabytes")
+    # Position within either tuple is the power of 1024 to divide by.
+    powers = {u: p for forms in (short_forms, long_forms) for p, u in enumerate(forms)}
+    if unit not in powers:
+        msg = f"`unit` must be one of {{'b', 'kb', 'mb', 'gb', 'tb'}}, got {unit!r}"
+        raise ValueError(msg)
+    power = powers[unit]
+    # Bytes are handed back untouched rather than divided, so no float conversion.
+    return sz / 1024**power if power else sz
+
+
 def is_ordered_categorical(series: Series[Any]) -> bool:
     """Return whether indices of categories are semantically meaningful.
 
diff --git a/tests/frame/estimated_size_test.py b/tests/frame/estimated_size_test.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+import narwhals.stable.v1 as nw
+
+if TYPE_CHECKING:
+    from tests.utils import ConstructorEager
+
+data = {"a": list(range(100))}  # one integer column, 100 rows
+
+
+def test_estimated_size(constructor_eager: ConstructorEager) -> None:
+    """Sizes are positive, shrink by 1024 per unit step, and bad units raise."""
+    frame = nw.from_native(constructor_eager(data), eager_only=True)
+    size_b, size_kb = frame.estimated_size("b"), frame.estimated_size("kb")
+    size_mb, size_gb = frame.estimated_size("mb"), frame.estimated_size("gb")
+    assert size_b > 0
+    assert size_kb == (size_b / 1024)
+    assert size_mb == (size_kb / 1024)
+    assert size_gb == (size_mb / 1024)
+    assert frame.estimated_size("tb") == (size_gb / 1024)
+
+    expected = "`unit` must be one of {'b', 'kb', 'mb', 'gb', 'tb'}, got 'pizza'"
+    with pytest.raises(ValueError, match=expected):
+        frame.estimated_size("pizza")  # type: ignore[arg-type]