Skip to content

Commit 36afa70

Browse files
authored
feat: Adding dataframe estimated size (#1549)
1 parent 77d11e8 commit 36afa70

File tree

7 files changed

+126
-0
lines changed

7 files changed

+126
-0
lines changed

docs/api-reference/dataframe.md

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
- columns
1212
- drop
1313
- drop_nulls
14+
- estimated_size
1415
- filter
1516
- gather_every
1617
- get_column

narwhals/_arrow/dataframe.py

+6
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from narwhals.utils import generate_temporary_column_name
2222
from narwhals.utils import is_sequence_but_not_str
2323
from narwhals.utils import parse_columns_to_drop
24+
from narwhals.utils import scale_bytes
2425

2526
if TYPE_CHECKING:
2627
from types import ModuleType
@@ -35,6 +36,7 @@
3536
from narwhals._arrow.series import ArrowSeries
3637
from narwhals._arrow.typing import IntoArrowExpr
3738
from narwhals.dtypes import DType
39+
from narwhals.typing import SizeUnit
3840
from narwhals.utils import Version
3941

4042

@@ -285,6 +287,10 @@ def schema(self: Self) -> dict[str, DType]:
285287
def collect_schema(self: Self) -> dict[str, DType]:
    """Return the frame's schema by delegating to `self.schema`.

    Returns:
        Whatever `self.schema` yields (annotated as a mapping from column
        name to dtype).
    """
    return self.schema
287289

290+
def estimated_size(self: Self, unit: SizeUnit) -> int | float:
    """Return the native frame's `nbytes`, expressed in `unit`.

    Arguments:
        unit: Size unit the byte count should be converted into.

    Returns:
        The result of `scale_bytes` applied to the native frame's byte count.
    """
    return scale_bytes(self._native_frame.nbytes, unit)
293+
288294
@property
def columns(self: Self) -> list[str]:
    """Column names, read from the native frame's schema (`schema.names`)."""
    return self._native_frame.schema.names  # type: ignore[no-any-return]

narwhals/_pandas_like/dataframe.py

+6
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from narwhals.utils import import_dtypes_module
2626
from narwhals.utils import is_sequence_but_not_str
2727
from narwhals.utils import parse_columns_to_drop
28+
from narwhals.utils import scale_bytes
2829

2930
if TYPE_CHECKING:
3031
from types import ModuleType
@@ -38,6 +39,7 @@
3839
from narwhals._pandas_like.series import PandasLikeSeries
3940
from narwhals._pandas_like.typing import IntoPandasLikeExpr
4041
from narwhals.dtypes import DType
42+
from narwhals.typing import SizeUnit
4143
from narwhals.utils import Version
4244

4345

@@ -371,6 +373,10 @@ def drop_nulls(self, subset: str | list[str] | None) -> Self:
371373
plx = self.__narwhals_namespace__()
372374
return self.filter(~plx.any_horizontal(plx.col(*subset).is_null()))
373375

376+
def estimated_size(self, unit: SizeUnit) -> int | float:
    """Return the native frame's summed deep memory usage, expressed in `unit`.

    Arguments:
        unit: Size unit the byte count should be converted into.

    Returns:
        The result of `scale_bytes` applied to the summed memory usage.
    """
    # `memory_usage(deep=True)` reports one figure per entry; add them up
    # to get a single byte count for the whole frame.
    usage_per_entry = self._native_frame.memory_usage(deep=True)
    return scale_bytes(usage_per_entry.sum(), unit=unit)
379+
374380
def with_row_index(self, name: str) -> Self:
375381
row_index = create_compliant_series(
376382
range(len(self._native_frame)),

narwhals/dataframe.py

+45
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from narwhals.typing import IntoDataFrame
3737
from narwhals.typing import IntoExpr
3838
from narwhals.typing import IntoFrame
39+
from narwhals.typing import SizeUnit
3940
from narwhals.utils import Implementation
4041

4142
FrameT = TypeVar("FrameT", bound="IntoFrame")
@@ -764,6 +765,50 @@ def get_column(self, name: str) -> Series[Any]:
764765
level=self._level,
765766
)
766767

768+
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
    """Estimate the total (heap) allocated size of the `DataFrame`.

    The estimate is reported in the requested unit, defaulting to bytes.

    Arguments:
        unit: One of 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes',
            'megabytes', 'gigabytes', or 'terabytes'.

    Returns:
        Integer or Float.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoDataFrameT
        >>> data = {
        ...     "foo": [1, 2, 3],
        ...     "bar": [6.0, 7.0, 8.0],
        ...     "ham": ["a", "b", "c"],
        ... }
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> def agnostic_estimated_size(df_native: IntoDataFrameT) -> int | float:
        ...     df = nw.from_native(df_native)
        ...     return df.estimated_size()

        We can then pass either pandas, Polars or PyArrow to `agnostic_estimated_size`:

        >>> agnostic_estimated_size(df_pd)
        np.int64(330)
        >>> agnostic_estimated_size(df_pl)
        51
        >>> agnostic_estimated_size(df_pa)
        63
    """
    # The backend-specific frame does the actual measurement and scaling.
    backend_frame = self._compliant_frame
    return backend_frame.estimated_size(unit=unit)  # type: ignore[no-any-return]
811+
767812
@overload
768813
def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ...
769814
@overload

narwhals/typing.py

+14
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from typing import TYPE_CHECKING
44
from typing import Any
5+
from typing import Literal
56
from typing import Protocol
67
from typing import TypeVar
78
from typing import Union
@@ -173,6 +174,19 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ...
173174
... return s.abs().to_native()
174175
"""
175176

177+
# Spellings accepted for a size unit: each short form on the first row has a
# long-form equivalent in the same position on the second row.
SizeUnit: TypeAlias = Literal[
    "b", "kb", "mb", "gb", "tb",
    "bytes", "kilobytes", "megabytes", "gigabytes", "terabytes",
]
189+
176190

177191
class DTypes:
178192
Int64: type[dtypes.Int64]

narwhals/utils.py

+26
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from narwhals.series import Series
4444
from narwhals.typing import DTypes
4545
from narwhals.typing import IntoSeriesT
46+
from narwhals.typing import SizeUnit
4647

4748
FrameOrSeriesT = TypeVar(
4849
"FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]]
@@ -681,6 +682,31 @@ def maybe_convert_dtypes(
681682
return obj_any # type: ignore[no-any-return]
682683

683684

685+
def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
    """Convert a size in bytes into another size unit.

    Every step up in unit divides by 1024.

    Arguments:
        sz: Size in bytes.
        unit: Unit to convert into, in short ("b", "kb", "mb", "gb", "tb") or
            long ("bytes", "kilobytes", "megabytes", "gigabytes", "terabytes")
            form.

    Returns:
        `sz` unchanged when the unit is bytes; otherwise `sz` divided by the
        matching power of 1024.

    Raises:
        ValueError: If `unit` is not one of the recognised spellings.
    """
    # Exponent of 1024 that each accepted spelling corresponds to.
    exponent_by_unit = {
        "b": 0,
        "bytes": 0,
        "kb": 1,
        "kilobytes": 1,
        "mb": 2,
        "megabytes": 2,
        "gb": 3,
        "gigabytes": 3,
        "tb": 4,
        "terabytes": 4,
    }
    exponent = exponent_by_unit.get(unit)
    if exponent is None:
        msg = f"`unit` must be one of {{'b', 'kb', 'mb', 'gb', 'tb'}}, got {unit!r}"
        raise ValueError(msg)
    if exponent == 0:
        # Bytes are handed back untouched (no division, so no float conversion).
        return sz
    return sz / 1024**exponent
708+
709+
684710
def is_ordered_categorical(series: Series[Any]) -> bool:
685711
"""Return whether indices of categories are semantically meaningful.
686712

tests/frame/estimated_size_test.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
import pytest
6+
7+
import narwhals.stable.v1 as nw
8+
9+
if TYPE_CHECKING:
10+
from tests.utils import ConstructorEager
11+
12+
data = {"a": list(range(100))}
13+
14+
15+
def test_estimated_size(constructor_eager: ConstructorEager) -> None:
    """Check unit scaling, long-form unit aliases, and rejection of unknown units."""
    df = nw.from_native(constructor_eager(data), eager_only=True)

    # Each unit is the previous one divided by 1024.
    assert df.estimated_size("b") > 0
    assert df.estimated_size("kb") == (df.estimated_size("b") / 1024)
    assert df.estimated_size("mb") == (df.estimated_size("kb") / 1024)
    assert df.estimated_size("gb") == (df.estimated_size("mb") / 1024)
    assert df.estimated_size("tb") == (df.estimated_size("gb") / 1024)

    # The long-form spellings must agree with their short counterparts.
    assert df.estimated_size("bytes") == df.estimated_size("b")
    assert df.estimated_size("kilobytes") == df.estimated_size("kb")
    assert df.estimated_size("megabytes") == df.estimated_size("mb")
    assert df.estimated_size("gigabytes") == df.estimated_size("gb")
    assert df.estimated_size("terabytes") == df.estimated_size("tb")

    with pytest.raises(
        ValueError,
        match="`unit` must be one of {'b', 'kb', 'mb', 'gb', 'tb'}, got 'pizza'",
    ):
        df.estimated_size("pizza")  # type: ignore[arg-type]

0 commit comments

Comments
 (0)