-
Notifications
You must be signed in to change notification settings - Fork 132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Adding dataframe estimated size #1549
Changes from 7 commits
724044f
4700c11
3f1cda0
837027c
c4c0c70
39bdb1c
6a100f8
7daaa3a
73eccd4
0727056
d9cf8f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
- columns | ||
- drop | ||
- drop_nulls | ||
- estimated_size | ||
- filter | ||
- gather_every | ||
- get_column | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -24,6 +24,7 @@ | |||||
from narwhals.utils import import_dtypes_module | ||||||
from narwhals.utils import is_sequence_but_not_str | ||||||
from narwhals.utils import parse_columns_to_drop | ||||||
from narwhals.utils import scale_bytes | ||||||
|
||||||
if TYPE_CHECKING: | ||||||
from types import ModuleType | ||||||
|
@@ -37,6 +38,7 @@ | |||||
from narwhals._pandas_like.series import PandasLikeSeries | ||||||
from narwhals._pandas_like.typing import IntoPandasLikeExpr | ||||||
from narwhals.dtypes import DType | ||||||
from narwhals.typing import SizeUnit | ||||||
from narwhals.utils import Version | ||||||
|
||||||
|
||||||
|
@@ -370,6 +372,10 @@ def drop_nulls(self, subset: str | list[str] | None) -> Self: | |||||
plx = self.__narwhals_namespace__() | ||||||
return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) | ||||||
|
||||||
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
    """Estimate the size of the native frame in the requested unit.

    Arguments:
        unit: Size unit handed to `scale_bytes` (defaults to "b").

    Returns:
        The sum of the native frame's `memory_usage(deep=True)`, converted
        to an `int` and then scaled by `scale_bytes`.
    """
    usage = self._native_frame.memory_usage(deep=True)
    # The sum is cast to a plain Python int before scaling.
    total_bytes = int(usage.sum())
    return scale_bytes(total_bytes, unit)
|
||||||
def with_row_index(self, name: str) -> Self: | ||||||
row_index = create_compliant_series( | ||||||
range(len(self._native_frame)), | ||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -19,6 +19,7 @@ | |||||||
from narwhals.utils import flatten | ||||||||
from narwhals.utils import is_sequence_but_not_str | ||||||||
from narwhals.utils import parse_version | ||||||||
from narwhals.utils import scale_bytes | ||||||||
|
||||||||
if TYPE_CHECKING: | ||||||||
from io import BytesIO | ||||||||
|
@@ -36,6 +37,7 @@ | |||||||
from narwhals.typing import IntoDataFrame | ||||||||
from narwhals.typing import IntoExpr | ||||||||
from narwhals.typing import IntoFrame | ||||||||
from narwhals.typing import SizeUnit | ||||||||
from narwhals.utils import Implementation | ||||||||
|
||||||||
FrameT = TypeVar("FrameT", bound="IntoFrame") | ||||||||
|
@@ -764,6 +766,51 @@ def get_column(self, name: str) -> Series[Any]: | |||||||
level=self._level, | ||||||||
) | ||||||||
|
||||||||
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
    """Return an estimation of the total (heap) allocated size of the `DataFrame`.

    Estimated size is given in the specified unit (bytes by default).

    Arguments:
        unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes',
            'gigabytes', or 'terabytes'.

    Returns:
        Integer or Float.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoDataFrameT
        >>> data = {
        ...     "foo": [1, 2, 3],
        ...     "bar": [6.0, 7.0, 8.0],
        ...     "ham": ["a", "b", "c"],
        ... }
        >>> df_pd = pd.DataFrame(data)
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        Let's define a dataframe-agnostic function:

        >>> def agnostic_estimated_size(df_native: IntoDataFrameT) -> int | float:
        ...     df = nw.from_native(df_native)
        ...     return df.estimated_size()

        We can then pass either pandas, Polars or PyArrow to `agnostic_estimated_size`:

        >>> agnostic_estimated_size(df_pd)
        330
        >>> agnostic_estimated_size(df_pl)
        51
        >>> agnostic_estimated_size(df_pa)
        63
    """
    # The compliant frame's `estimated_size` accepts `unit` and does the
    # scaling itself, so forward it instead of scaling a second time here.
    return self._compliant_frame.estimated_size(unit)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wouldn't this directly be
Suggested change
? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where did the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Each compliant frame There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🫣 .... of course. I need new glasses. 🤓 |
||||||||
|
||||||||
@overload | ||||||||
def __getitem__(self, item: tuple[Sequence[int], slice]) -> Self: ... | ||||||||
@overload | ||||||||
|
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -43,6 +43,7 @@ | |||||||||||||
from narwhals.series import Series | ||||||||||||||
from narwhals.typing import DTypes | ||||||||||||||
from narwhals.typing import IntoSeriesT | ||||||||||||||
from narwhals.typing import SizeUnit | ||||||||||||||
|
||||||||||||||
FrameOrSeriesT = TypeVar( | ||||||||||||||
"FrameOrSeriesT", bound=Union[LazyFrame[Any], DataFrame[Any], Series[Any]] | ||||||||||||||
|
@@ -681,6 +682,31 @@ def maybe_convert_dtypes( | |||||||||||||
return obj_any # type: ignore[no-any-return] | ||||||||||||||
|
||||||||||||||
|
||||||||||||||
def scale_bytes(sz: int, unit: SizeUnit) -> int | float:
    """Convert a size given in bytes into the requested unit.

    Each step up (bytes -> kilobytes -> megabytes -> gigabytes -> terabytes)
    divides by 1024.

    Arguments:
        sz: Size in bytes.
        unit: Target unit; one of 'b', 'kb', 'mb', 'gb', 'tb' or the
            spelled-out 'bytes', 'kilobytes', 'megabytes', 'gigabytes',
            'terabytes'.

    Returns:
        `sz` unchanged when the unit is bytes, otherwise `sz` divided by the
        matching power of 1024.

    Raises:
        ValueError: If `unit` is not one of the accepted names.
    """
    power_of_1024 = {
        "b": 0,
        "bytes": 0,
        "kb": 1,
        "kilobytes": 1,
        "mb": 2,
        "megabytes": 2,
        "gb": 3,
        "gigabytes": 3,
        "tb": 4,
        "terabytes": 4,
    }
    exponent = power_of_1024.get(unit)
    if exponent is None:
        msg = f"`unit` must be one of {{'b', 'kb', 'mb', 'gb', 'tb'}}, got {unit!r}"
        raise ValueError(msg)
    if exponent == 0:
        # Bytes are passed through untouched so the caller's value keeps its type.
        return sz
    return sz / 1024**exponent
|
||||||||||||||
|
||||||||||||||
def is_ordered_categorical(series: Series[Any]) -> bool: | ||||||||||||||
"""Return whether indices of categories are semantically meaningful. | ||||||||||||||
|
||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
import pytest | ||
|
||
import narwhals.stable.v1 as nw | ||
|
||
if TYPE_CHECKING: | ||
from tests.utils import ConstructorEager | ||
|
||
data = {"a": list(range(100))} | ||
|
||
|
||
def test_estimated_size(constructor_eager: ConstructorEager) -> None:
    """Check unit scaling and the invalid-unit error of `estimated_size`."""
    frame = nw.from_native(constructor_eager(data), eager_only=True)

    # The default unit must report a strictly positive size.
    assert frame.estimated_size() > 0

    # Each unit is the previous one divided by 1024.
    size_b = frame.estimated_size("b")
    size_kb = frame.estimated_size("kb")
    size_mb = frame.estimated_size("mb")
    size_gb = frame.estimated_size("gb")
    size_tb = frame.estimated_size("tb")
    assert size_kb == (size_b / 1024)
    assert size_mb == (size_kb / 1024)
    assert size_gb == (size_mb / 1024)
    assert size_tb == (size_gb / 1024)

    expected_message = "`unit` must be one of {'b', 'kb', 'mb', 'gb', 'tb'}, got 'pizza'"
    with pytest.raises(ValueError, match=expected_message):
        frame.estimated_size("pizza")  # type: ignore[arg-type]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are trying to avoid having defaults for compliant object methods