Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Adding dataframe estimated size #1549

Merged
merged 11 commits into from
Dec 10, 2024
1 change: 1 addition & 0 deletions docs/api-reference/dataframe.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- columns
- drop
- drop_nulls
- estimated_size
- filter
- gather_every
- get_column
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import scale_bytes

if TYPE_CHECKING:
from types import ModuleType
Expand All @@ -35,6 +36,7 @@
from narwhals._arrow.series import ArrowSeries
from narwhals._arrow.typing import IntoArrowExpr
from narwhals.dtypes import DType
from narwhals.typing import SizeUnit
from narwhals.utils import Version


Expand Down Expand Up @@ -285,6 +287,10 @@ def schema(self: Self) -> dict[str, DType]:
def collect_schema(self: Self) -> dict[str, DType]:
return self.schema

def estimated_size(self: Self, unit: SizeUnit) -> int | float:
    """Return the native frame's `nbytes`, converted to `unit` by `scale_bytes`."""
    return scale_bytes(self._native_frame.nbytes, unit)

@property
def columns(self: Self) -> list[str]:
return self._native_frame.schema.names # type: ignore[no-any-return]
Expand Down
6 changes: 6 additions & 0 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from narwhals.utils import import_dtypes_module
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import scale_bytes

if TYPE_CHECKING:
from types import ModuleType
Expand All @@ -37,6 +38,7 @@
from narwhals._pandas_like.series import PandasLikeSeries
from narwhals._pandas_like.typing import IntoPandasLikeExpr
from narwhals.dtypes import DType
from narwhals.typing import SizeUnit
from narwhals.utils import Version


Expand Down Expand Up @@ -370,6 +372,10 @@ def drop_nulls(self, subset: str | list[str] | None) -> Self:
plx = self.__narwhals_namespace__()
return self.filter(~plx.any_horizontal(plx.col(*subset).is_null()))

def estimated_size(self, unit: SizeUnit) -> int | float:
    """Sum the native frame's `memory_usage(deep=True)` and convert it to `unit`."""
    usage = self._native_frame.memory_usage(deep=True)
    # Cast to a plain Python int before handing the total to `scale_bytes`.
    total_bytes = int(usage.sum())
    return scale_bytes(total_bytes, unit=unit)

def with_row_index(self, name: str) -> Self:
row_index = create_compliant_series(
range(len(self._native_frame)),
Expand Down
45 changes: 45 additions & 0 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from narwhals.typing import IntoDataFrame
from narwhals.typing import IntoExpr
from narwhals.typing import IntoFrame
from narwhals.typing import SizeUnit
from narwhals.utils import Implementation

FrameT = TypeVar("FrameT", bound="IntoFrame")
Expand Down Expand Up @@ -764,6 +765,50 @@ def get_column(self, name: str) -> Series[Any]:
level=self._level,
)

def estimated_size(self, unit: SizeUnit = "b") -> int | float:
"""Return an estimation of the total (heap) allocated size of the `DataFrame`.

Estimated size is given in the specified unit (bytes by default).

Arguments:
unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes',
'gigabytes', or 'terabytes'.

Returns:
Integer or Float.

Examples:
>>> import pandas as pd
>>> import polars as pl
>>> import pyarrow as pa
>>> import narwhals as nw
>>> from narwhals.typing import IntoDataFrameT
>>> data = {
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ["a", "b", "c"],
... }
>>> df_pd = pd.DataFrame(data)
>>> df_pl = pl.DataFrame(data)
>>> df_pa = pa.table(data)

Let's define a dataframe-agnostic function:

>>> def agnostic_estimated_size(df_native: IntoDataFrameT) -> int | float:
... df = nw.from_native(df_native)
... return df.estimated_size()

We can then pass either pandas, Polars or PyArrow to `agnostic_estimated_size`:

>>> agnostic_estimated_size(df_pd)
330
>>> agnostic_estimated_size(df_pl)
51
>>> agnostic_estimated_size(df_pa)
63
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fascinating

"""
return self._compliant_frame.estimated_size(unit=unit)

@overload
def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ...
@overload
Expand Down
14 changes: 14 additions & 0 deletions narwhals/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING
from typing import Any
from typing import Literal
from typing import Protocol
from typing import TypeVar
from typing import Union
Expand Down Expand Up @@ -173,6 +174,19 @@ def __dataframe__(self, *args: Any, **kwargs: Any) -> Any: ...
... return s.abs().to_native()
"""

# Closed set of unit names a `SizeUnit`-annotated argument may take: every
# size has both an abbreviated ("kb") and a spelled-out ("kilobytes") form.
SizeUnit: TypeAlias = Literal[
"b",
"kb",
"mb",
"gb",
"tb",
"bytes",
"kilobytes",
"megabytes",
"gigabytes",
"terabytes",
]


class DTypes:
Int64: type[dtypes.Int64]
Expand Down
26 changes: 26 additions & 0 deletions narwhals/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from narwhals.series import Series
from narwhals.typing import DTypes
from narwhals.typing import IntoSeriesT
from narwhals.typing import SizeUnit

FrameOrSeriesT = TypeVar(
"FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]]
Expand Down Expand Up @@ -681,6 +682,31 @@ def maybe_convert_dtypes(
return obj_any # type: ignore[no-any-return]


def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
    """Convert a size given in bytes into another size unit.

    Arguments:
        sz: size expressed in bytes
        unit: target unit, either abbreviated ("b", "kb", "mb", "gb", "tb")
            or spelled out ("bytes", "kilobytes", "megabytes", "gigabytes",
            "terabytes")

    Returns:
        `sz` itself when the unit is bytes; otherwise `sz` divided by the
        matching power of 1024.

    Raises:
        ValueError: if `unit` is not one of the accepted names.
    """
    # Power of 1024 that each accepted spelling divides by.
    exponents = {
        "b": 0,
        "bytes": 0,
        "kb": 1,
        "kilobytes": 1,
        "mb": 2,
        "megabytes": 2,
        "gb": 3,
        "gigabytes": 3,
        "tb": 4,
        "terabytes": 4,
    }
    power = exponents.get(unit)
    if power is None:
        msg = f"`unit` must be one of {{'b', 'kb', 'mb', 'gb', 'tb'}}, got {unit!r}"
        raise ValueError(msg)
    if power == 0:
        # Bytes are handed back as-is so the value is not turned into a float.
        return sz
    return sz / 1024**power


def is_ordered_categorical(series: Series[Any]) -> bool:
"""Return whether indices of categories are semantically meaningful.

Expand Down
28 changes: 28 additions & 0 deletions tests/frame/estimated_size_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest

import narwhals.stable.v1 as nw

if TYPE_CHECKING:
from tests.utils import ConstructorEager

data = {"a": list(range(100))}


def test_estimated_size(constructor_eager: ConstructorEager) -> None:
df = nw.from_native(constructor_eager(data), eager_only=True)

assert df.estimated_size("b") > 0
assert df.estimated_size("kb") == (df.estimated_size("b") / 1024)
assert df.estimated_size("mb") == (df.estimated_size("kb") / 1024)
assert df.estimated_size("gb") == (df.estimated_size("mb") / 1024)
assert df.estimated_size("tb") == (df.estimated_size("gb") / 1024)

with pytest.raises(
ValueError,
match="`unit` must be one of {'b', 'kb', 'mb', 'gb', 'tb'}, got 'pizza'",
):
Comment on lines +24 to +27
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That got me a serious smile 😂

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we should allow unit='pizza'

I think I can eat a pizza in 40 bites. Haven't measured, just guessing. So, unit='pizza' should be 1/40th of unit='b'

We would allow unit='pizza', but leave it undocumented, as a little easter egg for anyone who chooses to read the source code

idk if this is too silly 😜

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I bow in front of such a piece of art 🍕

df.estimated_size("pizza") # type: ignore[arg-type]
Loading