diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index 883fb7897c..c93862f2f6 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -11,6 +11,7 @@ - columns - drop - drop_nulls + - estimated_size - filter - gather_every - get_column diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 68f7ab534d..fe0201a901 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -21,6 +21,7 @@ from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop +from narwhals.utils import scale_bytes if TYPE_CHECKING: from types import ModuleType @@ -35,6 +36,7 @@ from narwhals._arrow.series import ArrowSeries from narwhals._arrow.typing import IntoArrowExpr from narwhals.dtypes import DType + from narwhals.typing import SizeUnit from narwhals.utils import Version @@ -285,6 +287,10 @@ def schema(self: Self) -> dict[str, DType]: def collect_schema(self: Self) -> dict[str, DType]: return self.schema + def estimated_size(self: Self, unit: SizeUnit) -> int | float: + sz = self._native_frame.nbytes + return scale_bytes(sz, unit) + @property def columns(self: Self) -> list[str]: return self._native_frame.schema.names # type: ignore[no-any-return] diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 32ba74f29a..a2b739d34c 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -25,6 +25,7 @@ from narwhals.utils import import_dtypes_module from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop +from narwhals.utils import scale_bytes if TYPE_CHECKING: from types import ModuleType @@ -38,6 +39,7 @@ from narwhals._pandas_like.series import PandasLikeSeries from narwhals._pandas_like.typing import IntoPandasLikeExpr from narwhals.dtypes import DType + from narwhals.typing import SizeUnit from narwhals.utils import Version @@ -371,6 +373,10 @@ def drop_nulls(self, subset: str | list[str] | None) -> Self: plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) + def estimated_size(self, unit: SizeUnit) -> int | float: + sz = self._native_frame.memory_usage(deep=True).sum() + return scale_bytes(sz, unit=unit) + def with_row_index(self, name: str) -> Self: row_index = create_compliant_series( range(len(self._native_frame)), diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index c057b72276..04cd43daa8 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -36,6 +36,7 @@ from narwhals.typing import IntoDataFrame from narwhals.typing import IntoExpr from narwhals.typing import IntoFrame + from narwhals.typing import SizeUnit from narwhals.utils import Implementation FrameT = TypeVar("FrameT", bound="IntoFrame") @@ -764,6 +765,50 @@ def get_column(self, name: str) -> Series[Any]: level=self._level, ) + def estimated_size(self, unit: SizeUnit = "b") -> int | float: + """Return an estimation of the total (heap) allocated size of the `DataFrame`. + + Estimated size is given in the specified unit (bytes by default). + + Arguments: + unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes', + 'gigabytes', or 'terabytes'. + + Returns: + Integer or Float. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrameT + >>> data = { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + Let's define a dataframe-agnostic function: + + >>> def agnostic_estimated_size(df_native: IntoDataFrameT) -> int | float: + ... df = nw.from_native(df_native) + ... return df.estimated_size() + + We can then pass either pandas, Polars or PyArrow to `agnostic_estimated_size`: + + >>> agnostic_estimated_size(df_pd) + np.int64(330) + >>> agnostic_estimated_size(df_pl) + 51 + >>> agnostic_estimated_size(df_pa) + 63 + """ + return self._compliant_frame.estimated_size(unit=unit) # type: ignore[no-any-return] + @overload def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ... @overload diff --git a/narwhals/typing.py b/narwhals/typing.py index 8121c33d2e..329512de6e 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from typing import Any +from typing import Literal from typing import Protocol from typing import TypeVar from typing import Union @@ -173,6 +174,19 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ... ... return s.abs().to_native() """ +SizeUnit: TypeAlias = Literal[ + "b", + "kb", + "mb", + "gb", + "tb", + "bytes", + "kilobytes", + "megabytes", + "gigabytes", + "terabytes", +] + class DTypes: Int64: type[dtypes.Int64] diff --git a/narwhals/utils.py b/narwhals/utils.py index f69eb56615..58f56cdf33 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -43,6 +43,7 @@ from narwhals.series import Series from narwhals.typing import DTypes from narwhals.typing import IntoSeriesT + from narwhals.typing import SizeUnit FrameOrSeriesT = TypeVar( "FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]] @@ -681,6 +682,31 @@ def maybe_convert_dtypes( return obj_any # type: ignore[no-any-return] +def scale_bytes(sz: int, unit: SizeUnit) -> int | float: + """Scale size in bytes to other size units (eg: "kb", "mb", "gb", "tb"). + + Arguments: + sz: original size in bytes + unit: size unit to convert into + + Returns: + Integer or float. + """ + if unit in {"b", "bytes"}: + return sz + elif unit in {"kb", "kilobytes"}: + return sz / 1024 + elif unit in {"mb", "megabytes"}: + return sz / 1024**2 + elif unit in {"gb", "gigabytes"}: + return sz / 1024**3 + elif unit in {"tb", "terabytes"}: + return sz / 1024**4 + else: + msg = f"`unit` must be one of {{'b', 'kb', 'mb', 'gb', 'tb'}}, got {unit!r}" + raise ValueError(msg) + + def is_ordered_categorical(series: Series[Any]) -> bool: """Return whether indices of categories are semantically meaningful. diff --git a/tests/frame/estimated_size_test.py b/tests/frame/estimated_size_test.py new file mode 100644 index 0000000000..499a154fda --- /dev/null +++ b/tests/frame/estimated_size_test.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import narwhals.stable.v1 as nw + +if TYPE_CHECKING: + from tests.utils import ConstructorEager + +data = {"a": list(range(100))} + + +def test_estimated_size(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + + assert df.estimated_size("b") > 0 + assert df.estimated_size("kb") == (df.estimated_size("b") / 1024) + assert df.estimated_size("mb") == (df.estimated_size("kb") / 1024) + assert df.estimated_size("gb") == (df.estimated_size("mb") / 1024) + assert df.estimated_size("tb") == (df.estimated_size("gb") / 1024) + + with pytest.raises( + ValueError, + match="`unit` must be one of {'b', 'kb', 'mb', 'gb', 'tb'}, got 'pizza'", + ): + df.estimated_size("pizza") # type: ignore[arg-type]