From 7c8c6fa1d3e7a4c31b8bf7577ca80ab9322a1c95 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 30 Dec 2024 19:35:15 +0000 Subject: [PATCH 01/13] fix: casting to List for cudf (#1686) --- narwhals/_pandas_like/utils.py | 2 ++ tests/expr_and_series/max_horizontal_test.py | 2 ++ tests/expr_and_series/min_horizontal_test.py | 2 ++ tests/group_by_test.py | 3 --- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 5c523138f..0bb7edf8f 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -446,6 +446,8 @@ def native_to_narwhals_dtype( dtypes = import_dtypes_module(version) if dtype.startswith(("large_list", "list", "struct", "fixed_size_list")): + if implementation is Implementation.CUDF: + return arrow_native_to_narwhals_dtype(native_column.dtype.to_arrow(), version) return arrow_native_to_narwhals_dtype(native_column.dtype.pyarrow_dtype, version) if dtype != "object": return non_object_native_to_narwhals_dtype(dtype, version, implementation) diff --git a/tests/expr_and_series/max_horizontal_test.py b/tests/expr_and_series/max_horizontal_test.py index a489f9cb3..3becb36be 100644 --- a/tests/expr_and_series/max_horizontal_test.py +++ b/tests/expr_and_series/max_horizontal_test.py @@ -13,6 +13,7 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) +@pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") def test_maxh(constructor: Constructor, col_expr: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(horizontal_max=nw.max_horizontal(col_expr, nw.col("b"), "z")) @@ -20,6 +21,7 @@ def test_maxh(constructor: Constructor, col_expr: Any) -> None: assert_equal_data(result, expected) +@pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") def test_maxh_all(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.max_horizontal(nw.all()), c=nw.max_horizontal(nw.all())) diff --git a/tests/expr_and_series/min_horizontal_test.py b/tests/expr_and_series/min_horizontal_test.py index 263b76e45..5fb7fce97 100644 --- a/tests/expr_and_series/min_horizontal_test.py +++ b/tests/expr_and_series/min_horizontal_test.py @@ -13,6 +13,7 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) +@pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") def test_minh(constructor: Constructor, col_expr: Any) -> None: df = nw.from_native(constructor(data)) result = df.select(horizontal_min=nw.min_horizontal(col_expr, nw.col("b"), "z")) @@ -20,6 +21,7 @@ def test_minh(constructor: Constructor, col_expr: Any) -> None: assert_equal_data(result, expected) +@pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") def test_minh_all(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.min_horizontal(nw.all()), c=nw.min_horizontal(nw.all())) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 7ed99f07a..f98508ef3 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -324,9 +324,6 @@ def test_key_with_nulls_iter( if PANDAS_VERSION < (1, 3) and "pandas_constructor" in str(constructor_eager): # bug in old pandas request.applymarker(pytest.mark.xfail) - if "cudf" in str(constructor_eager): - # https://github.com/rapidsai/cudf/issues/17650 - request.applymarker(pytest.mark.xfail) data = {"b": ["4", "5", None, "7"], "a": [1, 2, 3, 4], "c": ["4", "3", None, None]} result = dict( 
         nw.from_native(constructor_eager(data), eager_only=True)
From f1baf90780776b2d246404b279367792147020d8 Mon Sep 17 00:00:00 2001
From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com>
Date: Tue, 31 Dec 2024 10:24:24 +0100
Subject: [PATCH 02/13] feat: add some `SparkLikeLazyFrame` methods (#1633)

---
 narwhals/_spark_like/dataframe.py |  77 +++++-
 tests/spark_like_test.py          | 378 ++++++++++++++++++++++++++++++
 2 files changed, 450 insertions(+), 5 deletions(-)

diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py
index 28cefd6fd..ba57da5bd 100644
--- a/narwhals/_spark_like/dataframe.py
+++ b/narwhals/_spark_like/dataframe.py
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING
 from typing import Any
 from typing import Iterable
+from typing import Literal
 from typing import Sequence
 
 from narwhals._spark_like.utils import native_to_narwhals_dtype
@@ -169,16 +170,82 @@ def sort(
         flat_by = flatten([*flatten([by]), *more_by])
         if isinstance(descending, bool):
-            descending = [descending]
+            descending = [descending] * len(flat_by)
 
         if nulls_last:
-            sort_funcs = [
+            sort_funcs = (
                 F.desc_nulls_last if d else F.asc_nulls_last for d in descending
-            ]
+            )
         else:
-            sort_funcs = [
+            sort_funcs = (
                 F.desc_nulls_first if d else F.asc_nulls_first for d in descending
-            ]
+            )
 
         sort_cols = [sort_f(col) for col, sort_f in zip(flat_by, sort_funcs)]
         return self._from_native_frame(self._native_frame.sort(*sort_cols))
+
+    def drop_nulls(self: Self, subset: str | list[str] | None) -> Self:
+        return self._from_native_frame(self._native_frame.dropna(subset=subset))
+
+    def rename(self: Self, mapping: dict[str, str]) -> Self:
+        import pyspark.sql.functions as F  # noqa: N812
+
+        rename_mapping = {
+            colname: mapping.get(colname, colname) for colname in self.columns
+        }
+        return self._from_native_frame(
+            self._native_frame.select(
+                [F.col(old).alias(new) for old, new in rename_mapping.items()]
+            )
+        )
+
+    def unique(
+        self: Self,
+        subset: str | list[str] | None = None,
+        *,
+        keep: Literal["any", "first", "last", "none"],
+        maintain_order: bool,
+    ) -> Self:
+        # The param `maintain_order` is only here for compatibility with the Polars API
+        # and has no effect on the output.
+        if keep != "any":
+            msg = "`LazyFrame.unique` with PySpark backend only supports `keep='any'`."
+            raise ValueError(msg)
+        subset = [subset] if isinstance(subset, str) else subset
+        return self._from_native_frame(self._native_frame.dropDuplicates(subset=subset))
+
+    def join(
+        self,
+        other: Self,
+        how: Literal["inner", "left", "cross", "semi", "anti"],
+        left_on: str | list[str] | None,
+        right_on: str | list[str] | None,
+        suffix: str,
+    ) -> Self:
+        import pyspark.sql.functions as F  # noqa: N812
+
+        self_native = self._native_frame
+        other_native = other._native_frame
+
+        left_columns = self.columns
+        right_columns = other.columns
+
+        if isinstance(left_on, str):
+            left_on = [left_on]
+        if isinstance(right_on, str):
+            right_on = [right_on]
+
+        # create a mapping for columns on other
+        # `right_on` columns will be renamed as `left_on`
+        # the remaining columns will either have the suffix appended or be left unchanged.
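+        # For example (hypothetical columns): with left columns ["a", "x"],
+        # right columns ["b", "a", "c"], left_on=["a"], right_on=["b"] and
+        # suffix="_right", the mapping is {"b": "a", "a": "a_right", "c": "c"}.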
+ rename_mapping = { + **dict(zip(right_on or [], left_on or [])), + **{ + colname: f"{colname}{suffix}" if colname in left_columns else colname + for colname in list(set(right_columns).difference(set(right_on or []))) + }, + } + other = other_native.select( + [F.col(old).alias(new) for old, new in rename_mapping.items()] + ) + return self._from_native_frame(self_native.join(other=other, on=left_on, how=how)) diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index 0d13edefd..3b9fa12f3 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -491,3 +491,381 @@ def test_group_by_depth_1_std_var( expr = getattr(nw.col("b"), attr)(ddof=ddof) result = nw.from_native(pyspark_constructor(data)).group_by("a").agg(expr).sort("a") assert_equal_data(result, expected) + + +# copied from tests/frame/drop_nulls_test.py +def test_drop_nulls(pyspark_constructor: Constructor) -> None: + data = { + "a": [1.0, 2.0, None, 4.0], + "b": [None, 3.0, None, 5.0], + } + + result = nw.from_native(pyspark_constructor(data)).drop_nulls() + expected = { + "a": [2.0, 4.0], + "b": [3.0, 5.0], + } + assert_equal_data(result, expected) + + +@pytest.mark.parametrize( + ("subset", "expected"), + [ + ("a", {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), + (["a"], {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), + (["a", "b"], {"a": [2.0, 4.0], "b": [3.0, 5.0]}), + ], +) +def test_drop_nulls_subset( + pyspark_constructor: Constructor, subset: str | list[str], expected: dict[str, float] +) -> None: + data = { + "a": [1.0, 2.0, None, 4.0], + "b": [None, 3.0, None, 5.0], + } + + result = nw.from_native(pyspark_constructor(data)).drop_nulls(subset=subset) + assert_equal_data(result, expected) + + +# copied from tests/frame/rename_test.py +def test_rename(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(pyspark_constructor(data)) + result = df.rename({"a": "x", "b": "y"}) + expected = {"x": [1, 3, 2], "y": [4, 4, 6], "z": [7.0, 8, 9]} + assert_equal_data(result, expected) + + +# adapted from tests/frame/unique_test.py +@pytest.mark.parametrize("subset", ["b", ["b"]]) +@pytest.mark.parametrize( + ("keep", "expected"), + [ + ("first", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}), + ("last", {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]}), + ("any", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}), + ("none", {"a": [2], "b": [6], "z": [9]}), + ], +) +@pytest.mark.filterwarnings("ignore:Argument `maintain_order=True` is unused") +def test_unique( + pyspark_constructor: Constructor, + subset: str | list[str] | None, + keep: str, + expected: dict[str, list[float]], +) -> None: + context = ( + does_not_raise() + if keep == "any" + else pytest.raises( + ValueError, + match=r"`LazyFrame.unique` with PySpark backend only supports `keep='any'`.", + ) + ) + + with context: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(pyspark_constructor(data)) + + result = df.unique(subset, keep=keep, maintain_order=True) # type: ignore[arg-type] + assert_equal_data(result, expected) + + +@pytest.mark.filterwarnings("ignore:Argument `maintain_order=True` is unused") +def test_unique_none(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(pyspark_constructor(data)) + result = df.unique(maintain_order=True) + assert_equal_data(result, data) + + +def test_inner_join_two_keys(pyspark_constructor: Constructor) -> None: + data = { + "antananarivo": 
[1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + "idx": [0, 1, 2], + } + df = nw.from_native(pyspark_constructor(data)) + df_right = nw.from_native(pyspark_constructor(data)) + result = df.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "bob"], + how="inner", + ) + result_on = df.join(df_right, on=["antananarivo", "bob"], how="inner") # type: ignore[arg-type] + result = result.sort("idx").drop("idx_right") + result_on = result_on.sort("idx").drop("idx_right") + expected = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + "zorro_right": [7.0, 8, 9], + "idx": [0, 1, 2], + } + assert_equal_data(result, expected) + assert_equal_data(result_on, expected) + + +def test_inner_join_single_key(pyspark_constructor: Constructor) -> None: + data = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + "idx": [0, 1, 2], + } + df = nw.from_native(pyspark_constructor(data)) + df_right = nw.from_native(pyspark_constructor(data)) + result = df.join( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="antananarivo", + how="inner", + ).sort("idx") + result_on = df.join(df_right, on="antananarivo", how="inner").sort("idx") # type: ignore[arg-type] + result = result.drop("idx_right") + result_on = result_on.drop("idx_right") + expected = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "bob_right": [4, 4, 6], + "zorro": [7.0, 8, 9], + "zorro_right": [7.0, 8, 9], + "idx": [0, 1, 2], + } + assert_equal_data(result, expected) + assert_equal_data(result_on, expected) + + +def test_cross_join(pyspark_constructor: Constructor) -> None: + data = {"antananarivo": [1, 3, 2]} + df = nw.from_native(pyspark_constructor(data)) + other = nw.from_native(pyspark_constructor(data)) + result = df.join(other, how="cross").sort("antananarivo", "antananarivo_right") # type: ignore[arg-type] + expected = { + "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "antananarivo_right": [1, 2, 3, 1, 2, 3, 1, 2, 3], + } + assert_equal_data(result, expected) + + with pytest.raises( + ValueError, match="Can not pass `left_on`, `right_on` or `on` keys for cross join" + ): + df.join(other, how="cross", left_on="antananarivo") # type: ignore[arg-type] + + +@pytest.mark.parametrize("how", ["inner", "left"]) +@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) +def test_suffix(pyspark_constructor: Constructor, how: str, suffix: str) -> None: + data = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + } + df = nw.from_native(pyspark_constructor(data)) + df_right = nw.from_native(pyspark_constructor(data)) + result = df.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "bob"], + how=how, # type: ignore[arg-type] + suffix=suffix, + ) + result_cols = result.collect_schema().names() + assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"] + + +@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) +def test_cross_join_suffix(pyspark_constructor: Constructor, suffix: str) -> None: + data = {"antananarivo": [1, 3, 2]} + df = nw.from_native(pyspark_constructor(data)) + other = nw.from_native(pyspark_constructor(data)) + result = df.join(other, how="cross", suffix=suffix).sort( # type: ignore[arg-type] + "antananarivo", f"antananarivo{suffix}" + ) + expected = { + "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], + f"antananarivo{suffix}": [1, 2, 3, 1, 2, 3, 1, 2, 3], + } + assert_equal_data(result, 
expected) + + +@pytest.mark.parametrize( + ("join_key", "filter_expr", "expected"), + [ + ( + ["antananarivo", "bob"], + (nw.col("bob") < 5), + {"antananarivo": [2], "bob": [6], "zorro": [9]}, + ), + (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zorro": [9]}), + ( + ["bob"], + (nw.col("bob") > 5), + {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7.0, 8.0]}, + ), + ], +) +def test_anti_join( + pyspark_constructor: Constructor, + join_key: list[str], + filter_expr: nw.Expr, + expected: dict[str, list[Any]], +) -> None: + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + df = nw.from_native(pyspark_constructor(data)) + other = df.filter(filter_expr) + result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type] + assert_equal_data(result, expected) + + +@pytest.mark.parametrize( + ("join_key", "filter_expr", "expected"), + [ + ( + "antananarivo", + (nw.col("bob") > 5), + {"antananarivo": [2], "bob": [6], "zorro": [9]}, + ), + ( + ["antananarivo"], + (nw.col("bob") > 5), + {"antananarivo": [2], "bob": [6], "zorro": [9]}, + ), + ( + ["bob"], + (nw.col("bob") < 5), + {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + ), + ( + ["antananarivo", "bob"], + (nw.col("bob") < 5), + {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + ), + ], +) +def test_semi_join( + pyspark_constructor: Constructor, + join_key: list[str], + filter_expr: nw.Expr, + expected: dict[str, list[Any]], +) -> None: + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + df = nw.from_native(pyspark_constructor(data)) + other = df.filter(filter_expr) + result = df.join(other, how="semi", left_on=join_key, right_on=join_key).sort( # type: ignore[arg-type] + "antananarivo" + ) + assert_equal_data(result, expected) + + +@pytest.mark.filterwarnings("ignore:the default coalesce behavior") +def test_left_join(pyspark_constructor: Constructor) -> None: + data_left = { + "antananarivo": [1.0, 2, 3], + "bob": [4.0, 5, 6], + "idx": [0.0, 1.0, 2.0], + } + data_right = { + "antananarivo": [1.0, 2, 3], + "co": [4.0, 5, 7], + "idx": [0.0, 1.0, 2.0], + } + df_left = nw.from_native(pyspark_constructor(data_left)) + df_right = nw.from_native(pyspark_constructor(data_right)) + result = ( + df_left.join(df_right, left_on="bob", right_on="co", how="left") # type: ignore[arg-type] + .sort("idx") + .drop("idx_right") + ) + expected = { + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], + "antananarivo_right": [1, 2, float("nan")], + "idx": [0, 1, 2], + } + result_on_list = df_left.join( + df_right, # type: ignore[arg-type] + on=["antananarivo", "idx"], + how="left", + ) + result_on_list = result_on_list.sort("idx") + expected_on_list = { + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], + "idx": [0, 1, 2], + "co": [4, 5, 7], + } + assert_equal_data(result, expected) + assert_equal_data(result_on_list, expected_on_list) + + +@pytest.mark.filterwarnings("ignore: the default coalesce behavior") +def test_left_join_multiple_column(pyspark_constructor: Constructor) -> None: + data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} + data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "idx": [0, 1, 2]} + df_left = nw.from_native(pyspark_constructor(data_left)) + df_right = nw.from_native(pyspark_constructor(data_right)) + result = ( + df_left.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "c"], + how="left", + ) + .sort("idx") + .drop("idx_right") + ) 
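+    # the right-hand key columns ("antananarivo", "c") are matched against the
+    # left keys and coalesced, so after dropping "idx_right" only the left
+    # frame's columns remain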
+ expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} + assert_equal_data(result, expected) + + +@pytest.mark.filterwarnings("ignore: the default coalesce behavior") +def test_left_join_overlapping_column(pyspark_constructor: Constructor) -> None: + data_left = { + "antananarivo": [1.0, 2, 3], + "bob": [4.0, 5, 6], + "d": [1.0, 4, 2], + "idx": [0.0, 1.0, 2.0], + } + data_right = { + "antananarivo": [1.0, 2, 3], + "c": [4.0, 5, 6], + "d": [1.0, 4, 2], + "idx": [0.0, 1.0, 2.0], + } + df_left = nw.from_native(pyspark_constructor(data_left)) + df_right = nw.from_native(pyspark_constructor(data_right)) + result = df_left.join(df_right, left_on="bob", right_on="c", how="left").sort("idx") # type: ignore[arg-type] + result = result.drop("idx_right") + expected: dict[str, list[Any]] = { + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], + "d": [1, 4, 2], + "antananarivo_right": [1, 2, 3], + "d_right": [1, 4, 2], + "idx": [0, 1, 2], + } + assert_equal_data(result, expected) + + result = ( + df_left.join( + df_right, # type: ignore[arg-type] + left_on="antananarivo", + right_on="d", + how="left", + ) + .sort("idx") + .drop("idx_right") + ) + expected = { + "antananarivo": [1, 2, 3], + "bob": [4, 5, 6], + "d": [1, 4, 2], + "antananarivo_right": [1.0, 3.0, float("nan")], + "c": [4.0, 6.0, float("nan")], + "idx": [0, 1, 2], + } + assert_equal_data(result, expected) From 6de971ebf4b1c401910d702d83d8d3567305ac86 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 31 Dec 2024 13:27:20 +0000 Subject: [PATCH 03/13] fix: Dask was raising for scalar vs Series binary operations (#1684) --- narwhals/_dask/expr.py | 170 +++++++++++++++++++------------------ narwhals/_dask/utils.py | 8 ++ tests/frame/select_test.py | 18 ++++ tests/utils.py | 1 + 4 files changed, 115 insertions(+), 82 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index a194c1068..8da2d4226 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -9,6 +9,7 @@ from typing import Sequence from narwhals._dask.utils import add_row_index +from narwhals._dask.utils import binary_operation_returns_scalar from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import narwhals_to_native_dtype from narwhals._pandas_like.utils import calculate_timestamp_date @@ -136,10 +137,13 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: inputs = self._call(df) _kwargs = {key: maybe_evaluate(df, value) for key, value in kwargs.items()} for _input in inputs: + name = _input.name + if self._returns_scalar: + _input = _input[0] result = call(_input, **_kwargs) if returns_scalar: result = result.to_series() - result = result.rename(_input.name) + result = result.rename(name) results.append(result) return results @@ -174,7 +178,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: function_name=f"{self._function_name}->{expr_name}", root_names=root_names, output_names=output_names, - returns_scalar=self._returns_scalar or returns_scalar, + returns_scalar=returns_scalar, backend_version=self._backend_version, version=self._version, kwargs={**self._kwargs, **kwargs}, @@ -202,7 +206,7 @@ def __add__(self, other: Any) -> Self: lambda _input, other: _input.__add__(other), "__add__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __radd__(self, other: Any) -> Self: @@ -210,7 +214,7 @@ def __radd__(self, other: Any) -> Self: lambda _input, other: _input.__radd__(other), "__radd__", other=other, - returns_scalar=False, + 
returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __sub__(self, other: Any) -> Self: @@ -218,7 +222,7 @@ def __sub__(self, other: Any) -> Self: lambda _input, other: _input.__sub__(other), "__sub__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rsub__(self, other: Any) -> Self: @@ -226,7 +230,7 @@ def __rsub__(self, other: Any) -> Self: lambda _input, other: _input.__rsub__(other), "__rsub__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __mul__(self, other: Any) -> Self: @@ -234,7 +238,7 @@ def __mul__(self, other: Any) -> Self: lambda _input, other: _input.__mul__(other), "__mul__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rmul__(self, other: Any) -> Self: @@ -242,7 +246,7 @@ def __rmul__(self, other: Any) -> Self: lambda _input, other: _input.__rmul__(other), "__rmul__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __truediv__(self, other: Any) -> Self: @@ -250,7 +254,7 @@ def __truediv__(self, other: Any) -> Self: lambda _input, other: _input.__truediv__(other), "__truediv__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rtruediv__(self, other: Any) -> Self: @@ -258,7 +262,7 @@ def __rtruediv__(self, other: Any) -> Self: lambda _input, other: _input.__rtruediv__(other), "__rtruediv__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __floordiv__(self, other: Any) -> Self: @@ -266,7 +270,7 @@ def __floordiv__(self, other: Any) -> Self: lambda _input, other: _input.__floordiv__(other), "__floordiv__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rfloordiv__(self, other: Any) -> Self: @@ -274,7 +278,7 @@ def __rfloordiv__(self, other: Any) -> Self: lambda _input, other: _input.__rfloordiv__(other), "__rfloordiv__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __pow__(self, other: Any) -> Self: @@ -282,7 +286,7 @@ def __pow__(self, other: Any) -> Self: lambda _input, other: _input.__pow__(other), "__pow__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rpow__(self, other: Any) -> Self: @@ -290,7 +294,7 @@ def __rpow__(self, other: Any) -> Self: lambda _input, other: _input.__rpow__(other), "__rpow__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __mod__(self, other: Any) -> Self: @@ -298,7 +302,7 @@ def __mod__(self, other: Any) -> Self: lambda _input, other: _input.__mod__(other), "__mod__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rmod__(self, other: Any) -> Self: @@ -306,7 +310,7 @@ def __rmod__(self, other: Any) -> Self: lambda _input, other: _input.__rmod__(other), "__rmod__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __eq__(self, other: DaskExpr) -> Self: # type: ignore[override] @@ -314,7 +318,7 @@ def __eq__(self, other: DaskExpr) -> Self: # type: ignore[override] lambda _input, other: _input.__eq__(other), 
"__eq__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __ne__(self, other: DaskExpr) -> Self: # type: ignore[override] @@ -322,7 +326,7 @@ def __ne__(self, other: DaskExpr) -> Self: # type: ignore[override] lambda _input, other: _input.__ne__(other), "__ne__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __ge__(self, other: DaskExpr) -> Self: @@ -330,7 +334,7 @@ def __ge__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__ge__(other), "__ge__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __gt__(self, other: DaskExpr) -> Self: @@ -338,7 +342,7 @@ def __gt__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__gt__(other), "__gt__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __le__(self, other: DaskExpr) -> Self: @@ -346,7 +350,7 @@ def __le__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__le__(other), "__le__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __lt__(self, other: DaskExpr) -> Self: @@ -354,7 +358,7 @@ def __lt__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__lt__(other), "__lt__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __and__(self, other: DaskExpr) -> Self: @@ -362,7 +366,7 @@ def __and__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__and__(other), "__and__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __rand__(self, other: DaskExpr) -> Self: @@ -370,7 +374,7 @@ def __rand__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__rand__(other), "__rand__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __or__(self, other: DaskExpr) -> Self: @@ -378,7 +382,7 @@ def __or__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__or__(other), "__or__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ) def __ror__(self, other: DaskExpr) -> Self: @@ -386,14 +390,14 @@ def __ror__(self, other: DaskExpr) -> Self: lambda _input, other: _input.__ror__(other), "__ror__", other=other, - returns_scalar=False, + returns_scalar=binary_operation_returns_scalar(self, other), ).alias("literal") def __invert__(self: Self) -> Self: return self._from_call( lambda _input: _input.__invert__(), "__invert__", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def map_batches( @@ -471,7 +475,7 @@ def shift(self, n: int) -> Self: lambda _input, n: _input.shift(n), "shift", n=n, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def cum_sum(self: Self, *, reverse: bool) -> Self: @@ -482,7 +486,7 @@ def cum_sum(self: Self, *, reverse: bool) -> Self: return self._from_call( lambda _input: _input.cumsum(), "cum_sum", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def cum_count(self: Self, *, reverse: bool) -> Self: @@ -493,7 +497,7 @@ def cum_count(self: Self, *, reverse: bool) -> Self: return self._from_call( lambda _input: (~_input.isna()).astype(int).cumsum(), "cum_count", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def cum_min(self: Self, *, reverse: bool) -> Self: @@ -504,7 +508,7 @@ def 
cum_min(self: Self, *, reverse: bool) -> Self: return self._from_call( lambda _input: _input.cummin(), "cum_min", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def cum_max(self: Self, *, reverse: bool) -> Self: @@ -515,7 +519,7 @@ def cum_max(self: Self, *, reverse: bool) -> Self: return self._from_call( lambda _input: _input.cummax(), "cum_max", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def cum_prod(self: Self, *, reverse: bool) -> Self: @@ -526,7 +530,7 @@ def cum_prod(self: Self, *, reverse: bool) -> Self: return self._from_call( lambda _input: _input.cumprod(), "cum_prod", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def is_between( @@ -547,7 +551,7 @@ def is_between( lower_bound=lower_bound, upper_bound=upper_bound, closed=closed, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def sum(self) -> Self: @@ -569,7 +573,7 @@ def round(self, decimals: int) -> Self: lambda _input, decimals: _input.round(decimals), "round", decimals=decimals, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def ewm_mean( @@ -616,7 +620,7 @@ def abs(self) -> Self: return self._from_call( lambda _input: _input.abs(), "abs", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def all(self) -> Self: @@ -663,7 +667,7 @@ def func( value=value, strategy=strategy, limit=limit, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def clip( @@ -678,14 +682,14 @@ def clip( "clip", lower_bound=lower_bound, upper_bound=upper_bound, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def diff(self: Self) -> Self: return self._from_call( lambda _input: _input.diff(), "diff", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def n_unique(self: Self) -> Self: @@ -699,7 +703,7 @@ def is_null(self: Self) -> Self: return self._from_call( lambda _input: _input.isna(), "is_null", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def len(self: Self) -> Self: @@ -746,7 +750,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: return self._from_call( func, "is_first_distinct", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def is_last_distinct(self: Self) -> Self: @@ -761,7 +765,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: return self._from_call( func, "is_last_distinct", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def is_duplicated(self: Self) -> Self: @@ -777,7 +781,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: return self._from_call( func, "is_duplicated", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def is_unique(self: Self) -> Self: @@ -793,7 +797,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: return self._from_call( func, "is_unique", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def is_in(self: Self, other: Any) -> Self: @@ -801,7 +805,7 @@ def is_in(self: Self, other: Any) -> Self: lambda _input, other: _input.isin(other), "is_in", other=other, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def null_count(self: Self) -> Self: @@ -884,7 +888,7 @@ def func(_input: Any, dtype: DType | type[DType]) -> Any: func, "cast", dtype=dtype, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def is_finite(self: Self) -> Self: @@ -893,7 +897,7 @@ def is_finite(self: Self) -> Self: return self._from_call( lambda _input: da.isfinite(_input), "is_finite", - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def 
rolling_sum( @@ -919,7 +923,7 @@ def func( window_size=window_size, min_periods=min_periods, center=center, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def rolling_mean( @@ -945,7 +949,7 @@ def func( window_size=window_size, min_periods=min_periods, center=center, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def rolling_var( @@ -974,7 +978,7 @@ def func( min_periods=min_periods, center=center, ddof=ddof, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) def rolling_std( @@ -1003,7 +1007,7 @@ def func( min_periods=min_periods, center=center, ddof=ddof, - returns_scalar=False, + returns_scalar=self._returns_scalar, ) @@ -1013,7 +1017,9 @@ def __init__(self, expr: DaskExpr) -> None: def len_chars(self) -> DaskExpr: return self._compliant_expr._from_call( - lambda _input: _input.str.len(), "len", returns_scalar=False + lambda _input: _input.str.len(), + "len", + returns_scalar=self._compliant_expr._returns_scalar, ) def replace( @@ -1033,7 +1039,7 @@ def replace( value=value, literal=literal, n=n, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def replace_all( @@ -1051,7 +1057,7 @@ def replace_all( pattern=pattern, value=value, literal=literal, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def strip_chars(self, characters: str | None = None) -> DaskExpr: @@ -1059,7 +1065,7 @@ def strip_chars(self, characters: str | None = None) -> DaskExpr: lambda _input, characters: _input.str.strip(characters), "strip", characters=characters, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def starts_with(self, prefix: str) -> DaskExpr: @@ -1067,7 +1073,7 @@ def starts_with(self, prefix: str) -> DaskExpr: lambda _input, prefix: _input.str.startswith(prefix), "starts_with", prefix=prefix, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def ends_with(self, suffix: str) -> DaskExpr: @@ -1075,7 +1081,7 @@ def ends_with(self, suffix: str) -> DaskExpr: lambda _input, suffix: _input.str.endswith(suffix), "ends_with", suffix=suffix, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def contains(self, pattern: str, *, literal: bool = False) -> DaskExpr: @@ -1086,7 +1092,7 @@ def contains(self, pattern: str, *, literal: bool = False) -> DaskExpr: "contains", pattern=pattern, literal=literal, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def slice(self, offset: int, length: int | None = None) -> DaskExpr: @@ -1097,7 +1103,7 @@ def slice(self, offset: int, length: int | None = None) -> DaskExpr: "slice", offset=offset, length=length, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002 @@ -1107,21 +1113,21 @@ def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002 lambda _input, format: dd.to_datetime(_input, format=format), "to_datetime", format=format, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def to_uppercase(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.str.upper(), "to_uppercase", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def to_lowercase(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.str.lower(), "to_lowercase", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) @@ 
-1133,77 +1139,77 @@ def date(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.date, "date", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def year(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.year, "year", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def month(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.month, "month", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def day(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.day, "day", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def hour(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.hour, "hour", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def minute(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.minute, "minute", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def second(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.second, "second", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def millisecond(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.microsecond // 1000, "millisecond", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def microsecond(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.microsecond, "microsecond", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def nanosecond(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.microsecond * 1000 + _input.dt.nanosecond, "nanosecond", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def ordinal_day(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.dayofyear, "ordinal_day", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def to_string(self, format: str) -> DaskExpr: # noqa: A002 @@ -1211,7 +1217,7 @@ def to_string(self, format: str) -> DaskExpr: # noqa: A002 lambda _input, format: _input.dt.strftime(format.replace("%.f", ".%f")), "strftime", format=format, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def replace_time_zone(self, time_zone: str | None) -> DaskExpr: @@ -1223,7 +1229,7 @@ def replace_time_zone(self, time_zone: str | None) -> DaskExpr: else _input.dt.tz_localize(None), "tz_localize", time_zone=time_zone, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def convert_time_zone(self, time_zone: str) -> DaskExpr: @@ -1240,7 +1246,7 @@ def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: func, "tz_convert", time_zone=time_zone, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: @@ -1274,42 +1280,42 @@ def func( func, "datetime", time_unit=time_unit, - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def total_minutes(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.total_seconds() // 60, "total_minutes", - returns_scalar=False, + 
returns_scalar=self._compliant_expr._returns_scalar, ) def total_seconds(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.total_seconds() // 1, "total_seconds", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def total_milliseconds(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.total_seconds() * 1000 // 1, "total_milliseconds", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def total_microseconds(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.total_seconds() * 1_000_000 // 1, "total_microseconds", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) def total_nanoseconds(self) -> DaskExpr: return self._compliant_expr._from_call( lambda _input: _input.dt.total_seconds() * 1_000_000_000 // 1, "total_nanoseconds", - returns_scalar=False, + returns_scalar=self._compliant_expr._returns_scalar, ) diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 86a0e5193..71e17b3f9 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -15,6 +15,7 @@ import dask_expr from narwhals._dask.dataframe import DaskLazyFrame + from narwhals._dask.expr import DaskExpr from narwhals.dtypes import DType from narwhals.utils import Version @@ -143,3 +144,10 @@ def name_preserving_sum(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr def name_preserving_div(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: return (s1 / s2).rename(s1.name) + + +def binary_operation_returns_scalar(lhs: DaskExpr, rhs: DaskExpr | Any) -> bool: + # If `rhs` is a DaskExpr, we look at `_returns_scalar`. If it isn't, + # it means that it was a scalar (e.g. nw.col('a') + 1), and so we default + # to `True`. + return lhs._returns_scalar and getattr(rhs, "_returns_scalar", True) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index b06efe003..2cb3df91d 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -9,6 +9,7 @@ import narwhals.stable.v1 as nw from narwhals.exceptions import ColumnNotFoundError from narwhals.exceptions import InvalidIntoExprError +from tests.utils import DASK_VERSION from tests.utils import PANDAS_VERSION from tests.utils import POLARS_VERSION from tests.utils import Constructor @@ -112,3 +113,20 @@ def test_missing_columns(constructor: Constructor) -> None: df.drop(selected_columns, strict=True) with pytest.raises(ColumnNotFoundError, match=msg): df.select(nw.col("fdfa")) + + +def test_left_to_right_broadcasting( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "dask" in str(constructor) and DASK_VERSION < (2024, 9): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 1, 2], "b": [4, 5, 6]})) + result = df.select(nw.col("a") + nw.col("b").sum()) + expected = {"a": [16, 16, 17]} + assert_equal_data(result, expected) + result = df.select(nw.col("b").sum() + nw.col("a")) + expected = {"b": [16, 16, 17]} + assert_equal_data(result, expected) + result = df.select(nw.col("b").sum() + nw.col("a").sum()) + expected = {"b": [19]} + assert_equal_data(result, expected) diff --git a/tests/utils.py b/tests/utils.py index 907eafa6a..60933046b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]: NUMPY_VERSION: tuple[int, ...] = get_module_version_as_tuple("numpy") PANDAS_VERSION: tuple[int, ...] 
= get_module_version_as_tuple("pandas") POLARS_VERSION: tuple[int, ...] = get_module_version_as_tuple("polars") +DASK_VERSION: tuple[int, ...] = get_module_version_as_tuple("dask") PYARROW_VERSION: tuple[int, ...] = get_module_version_as_tuple("pyarrow") PYSPARK_VERSION: tuple[int, ...] = get_module_version_as_tuple("pyspark") From 024d5d2294d1dcdd2e2f043d59eb03a3238c9e87 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 31 Dec 2024 14:36:13 +0100 Subject: [PATCH 04/13] docs: `DataFrame` method' docstrings (#1688) --- narwhals/dataframe.py | 1090 +++++++++++++++++++++----------- narwhals/stable/v1/__init__.py | 117 ++-- 2 files changed, 782 insertions(+), 425 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 4bc706e47..088f6198d 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -400,9 +400,8 @@ def __init__( def implementation(self) -> Implementation: """Return implementation of native frame. - This can be useful when you need to some special-casing for - some libraries for features outside of Narwhals' scope - for - example, when dealing with pandas' Period Dtype. + This can be useful when you need to use special-casing for features outside of + Narwhals' scope - for example, when dealing with pandas' Period Dtype. Returns: Implementation. @@ -423,8 +422,8 @@ def implementation(self) -> Implementation: """ return self._compliant_frame._implementation # type: ignore[no-any-return] - def __len__(self) -> Any: - return self._compliant_frame.__len__() + def __len__(self) -> int: + return self._compliant_frame.__len__() # type: ignore[no-any-return] def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray: return self._compliant_frame.__array__(dtype, copy=copy) @@ -482,11 +481,10 @@ def lazy(self) -> LazyFrame[Any]: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrame - >>> - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -494,7 +492,8 @@ def lazy(self) -> LazyFrame[Any]: ... df = nw.from_native(df_native) ... return df.lazy().to_native() - Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame becomes a Polars LazyFrame: + Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame + becomes a Polars LazyFrame: >>> agnostic_lazy(df_pd) foo bar ham @@ -575,11 +574,10 @@ def to_pandas(self) -> pd.DataFrame: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoDataFrame - >>> - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -587,7 +585,8 @@ def to_pandas(self) -> pd.DataFrame: ... df = nw.from_native(df_native) ... 
return df.to_pandas() - We can then pass any supported library such as pandas, Polars (eager), or PyArrow to `agnostic_to_pandas`: + We can then pass any supported library such as pandas, Polars (eager), or + PyArrow to `agnostic_to_pandas`: >>> agnostic_to_pandas(df_pd) foo bar ham @@ -607,7 +606,13 @@ def to_pandas(self) -> pd.DataFrame: """ return self._compliant_frame.to_pandas() - def write_csv(self, file: str | Path | BytesIO | None = None) -> Any: + @overload + def write_csv(self, file: None = None) -> str: ... + + @overload + def write_csv(self, file: str | Path | BytesIO) -> None: ... + + def write_csv(self, file: str | Path | BytesIO | None = None) -> str | None: r"""Write dataframe to comma-separated values (CSV) file. Arguments: @@ -624,30 +629,31 @@ def write_csv(self, file: str | Path | BytesIO | None = None) -> Any: >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> from narwhals.typing import IntoDataFrame + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def func(df): - ... df = nw.from_native(df) + >>> def agnostic_write_csv(df_native: IntoDataFrame) -> str: + ... df = nw.from_native(df_native) ... return df.write_csv() - We can pass any supported library such as pandas, Polars or PyArrow to `func`: + We can pass any supported library such as pandas, Polars or PyArrow to `agnostic_write_csv`: - >>> func(df_pd) + >>> agnostic_write_csv(df_pd) 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' - >>> func(df_pl) + >>> agnostic_write_csv(df_pl) 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n' - >>> func(df_pa) + >>> agnostic_write_csv(df_pa) '"foo","bar","ham"\n1,6,"a"\n2,7,"b"\n3,8,"c"\n' If we had passed a file name to `write_csv`, it would have been written to that file. """ - return self._compliant_frame.write_csv(file) + return self._compliant_frame.write_csv(file) # type: ignore[no-any-return] def write_parquet(self, file: str | Path | BytesIO) -> None: """Write dataframe to parquet file. @@ -666,22 +672,23 @@ def write_parquet(self, file: str | Path | BytesIO) -> None: >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> from narwhals.typing import IntoDataFrame + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def func(df): - ... df = nw.from_native(df) + >>> def agnostic_write_parquet(df_native: IntoDataFrame): + ... df = nw.from_native(df_native) ... 
df.write_parquet("foo.parquet") - We can then pass either pandas, Polars or PyArrow to `func`: + We can then pass either pandas, Polars or PyArrow to `agnostic_write_parquet`: - >>> func(df_pd) # doctest:+SKIP - >>> func(df_pl) # doctest:+SKIP - >>> func(df_pa) # doctest:+SKIP + >>> agnostic_write_parquet(df_pd) # doctest:+SKIP + >>> agnostic_write_parquet(df_pl) # doctest:+SKIP + >>> agnostic_write_parquet(df_pa) # doctest:+SKIP """ self._compliant_frame.write_parquet(file) @@ -700,11 +707,10 @@ def to_numpy(self) -> np.ndarray: >>> import narwhals as nw >>> import numpy as np >>> from narwhals.typing import IntoDataFrame - >>> - >>> df = {"foo": [1, 2, 3], "bar": [6.5, 7.0, 8.5], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> data = {"foo": [1, 2, 3], "bar": [6.5, 7.0, 8.5], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -744,11 +750,10 @@ def shape(self) -> tuple[int, int]: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoDataFrame - >>> - >>> df = {"foo": [1, 2, 3, 4, 5]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> data = {"foo": [1, 2, 3, 4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -786,13 +791,14 @@ def get_column(self, name: str) -> Series[Any]: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoDataFrame >>> from narwhals.typing import IntoSeries - >>> >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -801,7 +807,7 @@ def get_column(self, name: str) -> Series[Any]: ... name = df.columns[0] ... return df.get_column(name).to_native() - We can then pass either pandas or Polars to `agnostic_get_column`: + We can then pass either pandas, Polars or PyArrow to `agnostic_get_column`: >>> agnostic_get_column(df_pd) 0 1 @@ -814,6 +820,14 @@ def get_column(self, name: str) -> Series[Any]: 1 2 ] + >>> agnostic_get_column(df_pa) # doctest:+ELLIPSIS + + [ + [ + 1, + 2 + ] + ] """ return self._series( self._compliant_frame.get_column(name), @@ -827,7 +841,7 @@ def estimated_size(self, unit: SizeUnit = "b") -> int | float: Arguments: unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes', - 'gigabytes', or 'terabytes'. + 'gigabytes', or 'terabytes'. Returns: Integer or Float. @@ -957,7 +971,6 @@ def __getitem__( >>> import narwhals as nw >>> from narwhals.typing import IntoDataFrame >>> from narwhals.typing import IntoSeries - >>> >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -1048,7 +1061,7 @@ def to_dict( Arguments: as_series: If set to true ``True``, then the values are Narwhals Series, - otherwise the values are Any. + otherwise the values are Any. Returns: A mapping from column name to values / Series. @@ -1059,17 +1072,16 @@ def to_dict( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoDataFrame - >>> - >>> df = { + >>> data = { ... "A": [1, 2, 3, 4, 5], ... "fruits": ["banana", "banana", "apple", "apple", "banana"], ... "B": [5, 4, 3, 2, 1], ... 
"animals": ["beetle", "fly", "beetle", "beetle", "beetle"], ... "optional": [28, 300, None, 2, -30], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -1103,9 +1115,10 @@ def to_dict( def row(self, index: int) -> tuple[Any, ...]: """Get values at given row. - !!!note + !!! warning You should NEVER use this method to iterate over a DataFrame; - if you require row-iteration you should strongly prefer use of iter_rows() instead. + if you require row-iteration you should strongly prefer use of iter_rows() + instead. Arguments: index: Row number. @@ -1120,25 +1133,27 @@ def row(self, index: int) -> tuple[Any, ...]: >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> from narwhals.typing import IntoDataFrame >>> from typing import Any - >>> >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a library-agnostic function to get the second row. >>> def agnostic_row(df_native: IntoDataFrame) -> tuple[Any, ...]: - ... df = nw.from_native(df_native) - ... return df.row(1) + ... return nw.from_native(df_native).row(1) - We can then pass pandas / Polars / any other supported library: + We can then pass either pandas, Polars or PyArrow to `agnostic_row`: >>> agnostic_row(df_pd) (2, 5) >>> agnostic_row(df_pl) (2, 5) + >>> agnostic_row(df_pa) + (, ) """ return self._compliant_frame.row(index) # type: ignore[no-any-return] @@ -1155,14 +1170,15 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se The original object with the function applied. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> >>> data = {"a": [1, 2, 3], "ba": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1174,7 +1190,7 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se ... ).to_native() ... ) - We can then pass either pandas or Polars: + We can then pass either pandas, Polars or PyArrow to `agnostic_pipe`: >>> agnostic_pipe(df_pd) a @@ -1192,6 +1208,11 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se │ 2 │ │ 3 │ └─────┘ + >>> agnostic_pipe(df_pa) + pyarrow.Table + a: int64 + ---- + a: [[1,2,3]] """ return super().pipe(function, *args, **kwargs) @@ -1216,7 +1237,6 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> >>> data = {"a": [1.0, 2.0, None], "ba": [1.0, None, 2.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -1228,7 +1248,8 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: ... df = nw.from_native(df_native) ... 
return df.drop_nulls().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_drop_nulls`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_drop_nulls`: >>> agnostic_drop_nulls(df_pd) a ba @@ -1264,14 +1285,15 @@ def with_row_index(self, name: str = "index") -> Self: Examples: Construct pandas as polars DataFrames: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1279,7 +1301,8 @@ def with_row_index(self, name: str = "index") -> Self: ... df = nw.from_native(df_native) ... return df.with_row_index().to_native() - We can then pass either pandas or Polars: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_with_row_index`: >>> agnostic_with_row_index(df_pd) index a b @@ -1297,6 +1320,15 @@ def with_row_index(self, name: str = "index") -> Self: │ 1 ┆ 2 ┆ 5 │ │ 2 ┆ 3 ┆ 6 │ └───────┴─────┴─────┘ + >>> agnostic_with_row_index(df_pa) + pyarrow.Table + a: int64 + b: int64 + index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[0,1,2]] """ return super().with_row_index(name) @@ -1308,12 +1340,12 @@ def schema(self) -> Schema: A Narwhals Schema object that displays the mapping of column names. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.schema import Schema >>> from narwhals.typing import IntoFrame - >>> >>> data = { ... "foo": [1, 2, 3], ... "bar": [6.0, 7.0, 8.0], @@ -1321,6 +1353,7 @@ def schema(self) -> Schema: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -1328,14 +1361,14 @@ def schema(self) -> Schema: ... df = nw.from_native(df_native) ... return df.schema - You can pass either pandas or Polars to `agnostic_schema`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_schema`: - >>> df_pd_schema = agnostic_schema(df_pd) - >>> df_pd_schema + >>> agnostic_schema(df_pd) Schema({'foo': Int64, 'bar': Float64, 'ham': String}) - - >>> df_pl_schema = agnostic_schema(df_pl) - >>> df_pl_schema + >>> agnostic_schema(df_pl) + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) + >>> agnostic_schema(df_pa) Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().schema @@ -1347,12 +1380,12 @@ def collect_schema(self: Self) -> Schema: A Narwhals Schema object that displays the mapping of column names. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.schema import Schema >>> from narwhals.typing import IntoFrame - >>> >>> data = { ... "foo": [1, 2, 3], ... "bar": [6.0, 7.0, 8.0], @@ -1360,6 +1393,7 @@ def collect_schema(self: Self) -> Schema: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -1367,14 +1401,14 @@ def collect_schema(self: Self) -> Schema: ... df = nw.from_native(df_native) ... 
return df.collect_schema() - You can pass either pandas or Polars to `agnostic_collect_schema`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_collect_schema`: - >>> df_pd_schema = agnostic_collect_schema(df_pd) - >>> df_pd_schema + >>> agnostic_collect_schema(df_pd) Schema({'foo': Int64, 'bar': Float64, 'ham': String}) - - >>> df_pl_schema = agnostic_collect_schema(df_pl) - >>> df_pl_schema + >>> agnostic_collect_schema(df_pl) + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) + >>> agnostic_collect_schema(df_pa) Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().collect_schema() @@ -1392,11 +1426,10 @@ def columns(self) -> list[str]: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrame - >>> - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -1404,7 +1437,8 @@ def columns(self) -> list[str]: ... df = nw.from_native(df_native) ... return df.columns - We can pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_columns`: >>> agnostic_columns(df_pd) ['foo', 'bar', 'ham'] @@ -1416,28 +1450,16 @@ def columns(self) -> list[str]: return super().columns @overload - def rows( - self, - *, - named: Literal[False] = False, - ) -> list[tuple[Any, ...]]: ... + def rows(self, *, named: Literal[False] = False) -> list[tuple[Any, ...]]: ... + @overload - def rows( - self, - *, - named: Literal[True], - ) -> list[dict[str, Any]]: ... + def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ... + @overload - def rows( - self, - *, - named: bool, - ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... + def rows(self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... def rows( - self, - *, - named: bool = False, + self, *, named: bool = False ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: """Returns all data in the DataFrame as a list of rows of python-native values. @@ -1452,26 +1474,33 @@ def rows( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> from narwhals.typing import IntoDataFrame + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df, *, named): - ... return df.rows(named=named) + >>> def agnostic_rows(df_native: IntoDataFrame, *, named: bool): + ... 
return nw.from_native(df_native, eager_only=True).rows(named=named) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_rows`: - >>> func(df_pd, named=False) + >>> agnostic_rows(df_pd, named=False) + [(1, 6.0, 'a'), (2, 7.0, 'b'), (3, 8.0, 'c')] + >>> agnostic_rows(df_pd, named=True) + [{'foo': 1, 'bar': 6.0, 'ham': 'a'}, {'foo': 2, 'bar': 7.0, 'ham': 'b'}, {'foo': 3, 'bar': 8.0, 'ham': 'c'}] + >>> agnostic_rows(df_pl, named=False) [(1, 6.0, 'a'), (2, 7.0, 'b'), (3, 8.0, 'c')] - >>> func(df_pd, named=True) + >>> agnostic_rows(df_pl, named=True) [{'foo': 1, 'bar': 6.0, 'ham': 'a'}, {'foo': 2, 'bar': 7.0, 'ham': 'b'}, {'foo': 3, 'bar': 8.0, 'ham': 'c'}] - >>> func(df_pl, named=False) + >>> agnostic_rows(df_pa, named=False) [(1, 6.0, 'a'), (2, 7.0, 'b'), (3, 8.0, 'c')] - >>> func(df_pl, named=True) + >>> agnostic_rows(df_pa, named=True) [{'foo': 1, 'bar': 6.0, 'ham': 'a'}, {'foo': 2, 'bar': 7.0, 'ham': 'b'}, {'foo': 3, 'bar': 8.0, 'ham': 'c'}] """ return self._compliant_frame.rows(named=named) # type: ignore[no-any-return] @@ -1513,26 +1542,33 @@ def iter_rows( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> from narwhals.typing import IntoDataFrame + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df, *, named): - ... return df.iter_rows(named=named) + >>> def agnostic_iter_rows(df_native: IntoDataFrame, *, named: bool): + ... return nw.from_native(df_native, eager_only=True).iter_rows(named=named) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_iter_rows`: - >>> [row for row in func(df_pd, named=False)] + >>> [row for row in agnostic_iter_rows(df_pd, named=False)] + [(1, 6.0, 'a'), (2, 7.0, 'b'), (3, 8.0, 'c')] + >>> [row for row in agnostic_iter_rows(df_pd, named=True)] + [{'foo': 1, 'bar': 6.0, 'ham': 'a'}, {'foo': 2, 'bar': 7.0, 'ham': 'b'}, {'foo': 3, 'bar': 8.0, 'ham': 'c'}] + >>> [row for row in agnostic_iter_rows(df_pl, named=False)] [(1, 6.0, 'a'), (2, 7.0, 'b'), (3, 8.0, 'c')] - >>> [row for row in func(df_pd, named=True)] + >>> [row for row in agnostic_iter_rows(df_pl, named=True)] [{'foo': 1, 'bar': 6.0, 'ham': 'a'}, {'foo': 2, 'bar': 7.0, 'ham': 'b'}, {'foo': 3, 'bar': 8.0, 'ham': 'c'}] - >>> [row for row in func(df_pl, named=False)] + >>> [row for row in agnostic_iter_rows(df_pa, named=False)] [(1, 6.0, 'a'), (2, 7.0, 'b'), (3, 8.0, 'c')] - >>> [row for row in func(df_pl, named=True)] + >>> [row for row in agnostic_iter_rows(df_pa, named=True)] [{'foo': 1, 'bar': 6.0, 'ham': 'a'}, {'foo': 2, 'bar': 7.0, 'ham': 'b'}, {'foo': 3, 'bar': 8.0, 'ham': 'c'}] """ return self._compliant_frame.iter_rows(named=named, buffer_size=buffer_size) # type: ignore[no-any-return] @@ -1562,31 +1598,38 @@ def with_columns( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = { + >>> from narwhals.typing import IntoFrameT + >>> data = { ... "a": [1, 2, 3, 4], ... "b": [0.5, 4, 10, 13], ... "c": [True, True, False, True], ... 
} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function in which we pass an expression to add it as a new column: - >>> @nw.narwhalify - ... def func(df): - ... return df.with_columns((nw.col("a") * 2).alias("a*2")) + >>> def agnostic_with_columns(df_native: IntoFrameT) -> IntoFrameT: + ... return ( + ... nw.from_native(df_native) + ... .with_columns((nw.col("a") * 2).alias("a*2")) + ... .to_native() + ... ) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_with_columns`: - >>> func(df_pd) + >>> agnostic_with_columns(df_pd) a b c a*2 0 1 0.5 True 2 1 2 4.0 True 4 2 3 10.0 False 6 3 4 13.0 True 8 - >>> func(df_pl) + >>> agnostic_with_columns(df_pl) shape: (4, 4) ┌─────┬──────┬───────┬─────┐ │ a ┆ b ┆ c ┆ a*2 │ @@ -1598,6 +1641,17 @@ def with_columns( │ 3 ┆ 10.0 ┆ false ┆ 6 │ │ 4 ┆ 13.0 ┆ true ┆ 8 │ └─────┴──────┴───────┴─────┘ + >>> agnostic_with_columns(df_pa) + pyarrow.Table + a: int64 + b: double + c: bool + a*2: int64 + ---- + a: [[1,2,3,4]] + b: [[0.5,4,10,13]] + c: [[true,true,false,true]] + a*2: [[2,4,6,8]] """ return super().with_columns(*exprs, **named_exprs) @@ -1622,30 +1676,33 @@ def select( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = { + >>> from narwhals.typing import IntoFrameT + >>> data = { ... "foo": [1, 2, 3], ... "bar": [6, 7, 8], ... "ham": ["a", "b", "c"], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function in which we pass the name of a column to select that column. - >>> @nw.narwhalify - ... def func(df): - ... return df.select("foo") + >>> def agnostic_single_select(df_native: IntoFrameT) -> IntoFrameT: + ... return nw.from_native(df_native).select("foo").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_single_select`: - >>> func(df_pd) + >>> agnostic_single_select(df_pd) foo 0 1 1 2 2 3 - >>> func(df_pl) + >>> agnostic_single_select(df_pl) shape: (3, 1) ┌─────┐ │ foo │ @@ -1656,18 +1713,23 @@ def select( │ 2 │ │ 3 │ └─────┘ + >>> agnostic_single_select(df_pa) + pyarrow.Table + foo: int64 + ---- + foo: [[1,2,3]] Multiple columns can be selected by passing a list of column names. - >>> @nw.narwhalify - ... def func(df): - ... return df.select(["foo", "bar"]) - >>> func(df_pd) + >>> def agnostic_multi_select(df_native: IntoFrameT) -> IntoFrameT: + ... return nw.from_native(df_native).select(["foo", "bar"]).to_native() + + >>> agnostic_multi_select(df_pd) foo bar 0 1 6 1 2 7 2 3 8 - >>> func(df_pl) + >>> agnostic_multi_select(df_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -1678,19 +1740,30 @@ def select( │ 2 ┆ 7 │ │ 3 ┆ 8 │ └─────┴─────┘ + >>> agnostic_multi_select(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ---- + foo: [[1,2,3]] + bar: [[6,7,8]] Multiple columns can also be selected using positional arguments instead of a list. Expressions are also accepted. - >>> @nw.narwhalify - ... def func(df): - ... return df.select(nw.col("foo"), nw.col("bar") + 1) - >>> func(df_pd) + >>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT: + ... return ( + ... nw.from_native(df_native) + ... 
.select(nw.col("foo"), nw.col("bar") + 1) + ... .to_native() + ... ) + + >>> agnostic_select(df_pd) foo bar 0 1 7 1 2 8 2 3 9 - >>> func(df_pl) + >>> agnostic_select(df_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -1701,18 +1774,29 @@ def select( │ 2 ┆ 8 │ │ 3 ┆ 9 │ └─────┴─────┘ + >>> agnostic_select(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ---- + foo: [[1,2,3]] + bar: [[7,8,9]] Use keyword arguments to easily name your expression inputs. - >>> @nw.narwhalify - ... def func(df): - ... return df.select(threshold=nw.col("foo") * 2) - >>> func(df_pd) + >>> def agnostic_select_w_kwargs(df_native: IntoFrameT) -> IntoFrameT: + ... return ( + ... nw.from_native(df_native) + ... .select(threshold=nw.col("foo") * 2) + ... .to_native() + ... ) + + >>> agnostic_select_w_kwargs(df_pd) threshold 0 2 1 4 2 6 - >>> func(df_pl) + >>> agnostic_select_w_kwargs(df_pl) shape: (3, 1) ┌───────────┐ │ threshold │ @@ -1723,6 +1807,11 @@ def select( │ 4 │ │ 6 │ └───────────┘ + >>> agnostic_select_w_kwargs(df_pa) + pyarrow.Table + threshold: int64 + ---- + threshold: [[2,4,6]] """ return super().select(*exprs, **named_exprs) @@ -1738,25 +1827,28 @@ def rename(self, mapping: dict[str, str]) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> from narwhals.typing import IntoFrameT + >>> data = {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.rename({"foo": "apple"}) + >>> def agnostic_rename(df_native: IntoFrameT) -> IntoFrameT: + ... return nw.from_native(df_native).rename({"foo": "apple"}).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_rename`: - >>> func(df_pd) + >>> agnostic_rename(df_pd) apple bar ham 0 1 6 a 1 2 7 b 2 3 8 c - >>> func(df_pl) + >>> agnostic_rename(df_pl) shape: (3, 3) ┌───────┬─────┬─────┐ │ apple ┆ bar ┆ ham │ @@ -1767,6 +1859,15 @@ def rename(self, mapping: dict[str, str]) -> Self: │ 2 ┆ 7 ┆ b │ │ 3 ┆ 8 ┆ c │ └───────┴─────┴─────┘ + >>> agnostic_rename(df_pa) + pyarrow.Table + apple: int64 + bar: int64 + ham: string + ---- + apple: [[1,2,3]] + bar: [[6,7,8]] + ham: [["a","b","c"]] """ return super().rename(mapping) @@ -1783,29 +1884,32 @@ def head(self, n: int = 5) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = { + >>> from narwhals.typing import IntoFrameT + >>> data = { ... "foo": [1, 2, 3, 4, 5], ... "bar": [6, 7, 8, 9, 10], ... "ham": ["a", "b", "c", "d", "e"], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function that gets the first 3 rows. - >>> @nw.narwhalify - ... def func(df): - ... return df.head(3) + >>> def agnostic_head(df_native: IntoFrameT) -> IntoFrameT: + ... 
return nw.from_native(df_native).head(3).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_head`: - >>> func(df_pd) + >>> agnostic_head(df_pd) foo bar ham 0 1 6 a 1 2 7 b 2 3 8 c - >>> func(df_pl) + >>> agnostic_head(df_pl) shape: (3, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -1816,6 +1920,15 @@ def head(self, n: int = 5) -> Self: │ 2 ┆ 7 ┆ b │ │ 3 ┆ 8 ┆ c │ └─────┴─────┴─────┘ + >>> agnostic_head(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[1,2,3]] + bar: [[6,7,8]] + ham: [["a","b","c"]] """ return super().head(n) @@ -1832,29 +1945,32 @@ def tail(self, n: int = 5) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = { + >>> from narwhals.typing import IntoFrameT + >>> data = { ... "foo": [1, 2, 3, 4, 5], ... "bar": [6, 7, 8, 9, 10], ... "ham": ["a", "b", "c", "d", "e"], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function that gets the last 3 rows. - >>> @nw.narwhalify - ... def func(df): - ... return df.tail(3) + >>> def agnostic_tail(df_native: IntoFrameT) -> IntoFrameT: + ... return nw.from_native(df_native).tail(3).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_tail`: - >>> func(df_pd) + >>> agnostic_tail(df_pd) foo bar ham 2 3 8 c 3 4 9 d 4 5 10 e - >>> func(df_pl) + >>> agnostic_tail(df_pl) shape: (3, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -1865,6 +1981,15 @@ def tail(self, n: int = 5) -> Self: │ 4 ┆ 9 ┆ d │ │ 5 ┆ 10 ┆ e │ └─────┴─────┴─────┘ + >>> agnostic_tail(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[3,4,5]] + bar: [[8,9,10]] + ham: [["c","d","e"]] """ return super().tail(n) @@ -1882,25 +2007,28 @@ def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.drop("ham") + >>> def agnostic_drop(df_native: IntoFrameT) -> IntoFrameT: + ... return nw.from_native(df_native).drop("ham").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_drop`: - >>> func(df_pd) + >>> agnostic_drop(df_pd) foo bar 0 1 6.0 1 2 7.0 2 3 8.0 - >>> func(df_pl) + >>> agnostic_drop(df_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -1911,19 +2039,25 @@ def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: │ 2 ┆ 7.0 │ │ 3 ┆ 8.0 │ └─────┴─────┘ + >>> agnostic_drop(df_pa) + pyarrow.Table + foo: int64 + bar: double + ---- + foo: [[1,2,3]] + bar: [[6,7,8]] Use positional arguments to drop multiple columns. - >>> @nw.narwhalify - ... def func(df): - ... return df.drop("foo", "ham") + >>> def agnostic_drop_multi(df_native: IntoFrameT) -> IntoFrameT: + ... 
return nw.from_native(df_native).drop("foo", "ham").to_native() - >>> func(df_pd) + >>> agnostic_drop_multi(df_pd) bar 0 6.0 1 7.0 2 8.0 - >>> func(df_pl) + >>> agnostic_drop_multi(df_pl) shape: (3, 1) ┌─────┐ │ bar │ @@ -1934,6 +2068,12 @@ def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: │ 7.0 │ │ 8.0 │ └─────┘ + >>> agnostic_drop_multi(df_pa) + pyarrow.Table + bar: double + ---- + bar: [[6,7,8]] + """ return super().drop(*flatten(columns), strict=strict) @@ -1966,7 +2106,9 @@ def unique( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = { ... "foo": [1, 2, 3, 1], ... "bar": ["a", "a", "a", "a"], @@ -1974,19 +2116,20 @@ def unique( ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.unique(["bar", "ham"]) + >>> def agnostic_unique(df_native: IntoFrameT) -> IntoFrameT: + ... return nw.from_native(df_native).unique(["bar", "ham"]).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_unique`: - >>> func(df_pd) + >>> agnostic_unique(df_pd) foo bar ham 0 1 a b - >>> func(df_pl) + >>> agnostic_unique(df_pl) shape: (1, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -1995,6 +2138,15 @@ def unique( ╞═════╪═════╪═════╡ │ 1 ┆ a ┆ b │ └─────┴─────┴─────┘ + >>> agnostic_unique(df_pa) + pyarrow.Table + foo: int64 + bar: string + ham: string + ---- + foo: [[1]] + bar: [["a"]] + ham: [["b"]] """ return super().unique(subset, keep=keep, maintain_order=maintain_order) @@ -2018,25 +2170,27 @@ def filter( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> - >>> df = { + >>> from narwhals.typing import IntoFrameT + >>> data = { ... "foo": [1, 2, 3], ... "bar": [6, 7, 8], ... "ham": ["a", "b", "c"], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function in which we filter on one condition. - >>> def agnostic_filter(df_native: IntoFrame) -> IntoFrame: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.filter(nw.col("foo") > 1).to_native() - We can then pass either pandas or Polars to `agnostic_filter`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_filter`: >>> agnostic_filter(df_pd) foo bar ham @@ -2052,10 +2206,19 @@ def filter( │ 2 ┆ 7 ┆ b │ │ 3 ┆ 8 ┆ c │ └─────┴─────┴─────┘ + >>> agnostic_filter(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[2,3]] + bar: [[7,8]] + ham: [["b","c"]] Filter on multiple conditions, combined with and/or operators: - >>> def agnostic_filter(df_native: IntoFrame) -> IntoFrame: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.filter((nw.col("foo") < 3) & (nw.col("ham") == "a")).to_native() >>> agnostic_filter(df_pd) @@ -2070,8 +2233,17 @@ def filter( ╞═════╪═════╪═════╡ │ 1 ┆ 6 ┆ a │ └─────┴─────┴─────┘ + >>> agnostic_filter(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[1]] + bar: [[6]] + ham: [["a"]] - >>> def agnostic_filter(df_native: IntoFrame) -> IntoFrame: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... dframe = df.filter( ... (nw.col("foo") == 1) | (nw.col("ham") == "c") @@ -2091,10 +2263,19 @@ def filter( │ 1 ┆ 6 ┆ a │ │ 3 ┆ 8 ┆ c │ └─────┴─────┴─────┘ + >>> agnostic_filter(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[1,3]] + bar: [[6,8]] + ham: [["a","c"]] Provide multiple filters using `*args` syntax: - >>> def agnostic_filter(df_native: IntoFrame) -> IntoFrame: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... dframe = df.filter( ... nw.col("foo") <= 2, @@ -2113,10 +2294,19 @@ def filter( ╞═════╪═════╪═════╡ │ 1 ┆ 6 ┆ a │ └─────┴─────┴─────┘ + >>> agnostic_filter(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[1]] + bar: [[6]] + ham: [["a"]] Provide multiple filters using `**kwargs` syntax: - >>> def agnostic_filter(df_native: IntoFrame) -> IntoFrame: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.filter(foo=2, ham="b").to_native() >>> agnostic_filter(df_pd) @@ -2131,6 +2321,15 @@ def filter( ╞═════╪═════╪═════╡ │ 2 ┆ 7 ┆ b │ └─────┴─────┴─────┘ + >>> agnostic_filter(df_pa) + pyarrow.Table + foo: int64 + bar: int64 + ham: string + ---- + foo: [[2]] + bar: [[7]] + ham: [["b"]] """ return super().filter(*predicates, **constraints) @@ -2150,30 +2349,34 @@ def group_by( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> df = { + >>> from narwhals.typing import IntoDataFrameT + >>> data = { ... "a": ["a", "b", "a", "b", "c"], ... "b": [1, 2, 1, 3, 3], ... "c": [5, 4, 3, 2, 1], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function in which we group by one column and call `agg` to compute the grouped sum of another column. - >>> @nw.narwhalify - ... def func(df): - ... return df.group_by("a").agg(nw.col("b").sum()).sort("a") + >>> def agnostic_group_by_agg(df_native: IntoDataFrameT) -> IntoDataFrameT: + ... df = nw.from_native(df_native, eager_only=True) + ... return df.group_by("a").agg(nw.col("b").sum()).sort("a").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_group_by_agg`: - >>> func(df_pd) + >>> agnostic_group_by_agg(df_pd) a b 0 a 2 1 b 5 2 c 3 - >>> func(df_pl) + >>> agnostic_group_by_agg(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2184,19 +2387,27 @@ def group_by( │ b ┆ 5 │ │ c ┆ 3 │ └─────┴─────┘ + >>> agnostic_group_by_agg(df_pa) + pyarrow.Table + a: string + b: int64 + ---- + a: [["a","b","c"]] + b: [[2,5,3]] Group by multiple columns by passing a list of column names. - >>> @nw.narwhalify - ... def func(df): - ... return df.group_by(["a", "b"]).agg(nw.max("c")).sort("a", "b") - >>> func(df_pd) + >>> def agnostic_group_by_agg(df_native: IntoDataFrameT) -> IntoDataFrameT: + ... 
df = nw.from_native(df_native, eager_only=True) + ... return df.group_by(["a", "b"]).agg(nw.max("c")).sort("a", "b").to_native() + + >>> agnostic_group_by_agg(df_pd) a b c 0 a 1 5 1 b 2 4 2 b 3 2 3 c 3 1 - >>> func(df_pl) + >>> agnostic_group_by_agg(df_pl) shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2208,6 +2419,15 @@ def group_by( │ b ┆ 3 ┆ 2 │ │ c ┆ 3 ┆ 1 │ └─────┴─────┴─────┘ + >>> agnostic_group_by_agg(df_pa) + pyarrow.Table + a: string + b: int64 + c: int64 + ---- + a: [["a","b","b","c"]] + b: [[1,2,3,3]] + c: [[5,4,2,1]] """ from narwhals.expr import Expr from narwhals.group_by import GroupBy @@ -2247,9 +2467,11 @@ def sort( boolean is applied for all `by` columns. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = { ... "a": [1, 2, None], ... "b": [6.0, 5.0, 4.0], @@ -2257,22 +2479,24 @@ def sort( ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function in which we sort by multiple columns in different orders - >>> @nw.narwhalify - ... def func(df): - ... return df.sort("c", "a", descending=[False, True]) + >>> def agnostic_sort(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.sort("c", "a", descending=[False, True]).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_sort`: - >>> func(df_pd) + >>> agnostic_sort(df_pd) a b c 0 1.0 6.0 a 2 NaN 4.0 b 1 2.0 5.0 c - >>> func(df_pl) + >>> agnostic_sort(df_pl) shape: (3, 3) ┌──────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -2283,6 +2507,15 @@ def sort( │ null ┆ 4.0 ┆ b │ │ 2 ┆ 5.0 ┆ c │ └──────┴─────┴─────┘ + >>> agnostic_sort(df_pa) + pyarrow.Table + a: int64 + b: double + c: string + ---- + a: [[1,null,2]] + b: [[6,4,5]] + c: [["a","b","c"]] """ return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) @@ -2317,9 +2550,11 @@ def join( A new joined DataFrame Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = { ... "foo": [1, 2, 3], ... "bar": [6.0, 7.0, 8.0], @@ -2336,20 +2571,27 @@ def join( >>> df_pl = pl.DataFrame(data) >>> other_pl = pl.DataFrame(data_other) + >>> df_pa = pa.table(data) + >>> other_pa = pa.table(data_other) + Let's define a dataframe-agnostic function in which we join over "ham" column: - >>> @nw.narwhalify - ... def join_on_ham(df, other_any): - ... return df.join(other_any, left_on="ham", right_on="ham") + >>> def agnostic_join_on_ham( + ... df_native: IntoFrameT, other_native: IntoFrameT + ... ) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... other = nw.from_native(other_native) + ... 
return df.join(other, left_on="ham", right_on="ham").to_native() - We can now pass either pandas or Polars to the function: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_join_on_ham`: - >>> join_on_ham(df_pd, other_pd) + >>> agnostic_join_on_ham(df_pd, other_pd) foo bar ham apple 0 1 6.0 a x 1 2 7.0 b y - >>> join_on_ham(df_pl, other_pl) + >>> agnostic_join_on_ham(df_pl, other_pl) shape: (2, 4) ┌─────┬─────┬─────┬───────┐ │ foo ┆ bar ┆ ham ┆ apple │ @@ -2359,6 +2601,17 @@ def join( │ 1 ┆ 6.0 ┆ a ┆ x │ │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ + >>> agnostic_join_on_ham(df_pa, other_pa) + pyarrow.Table + foo: int64 + bar: double + ham: string + apple: string + ---- + foo: [[1,2]] + bar: [[6,7]] + ham: [["a","b"]] + apple: [["x","y"]] """ return super().join( other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix @@ -2384,19 +2637,12 @@ def join_asof( Arguments: other: DataFrame to join with. - left_on: Name(s) of the left join column(s). - right_on: Name(s) of the right join column(s). - on: Join column of both DataFrames. If set, left_on and right_on should be None. - - by_left: join on these columns before doing asof join - - by_right: join on these columns before doing asof join - - by: join on these columns before doing asof join - + by_left: join on these columns before doing asof join. + by_right: join on these columns before doing asof join. + by: join on these columns before doing asof join. strategy: Join strategy. The default is "backward". * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key. @@ -2408,9 +2654,11 @@ def join_asof( Examples: >>> from datetime import datetime - >>> import narwhals as nw + >>> from typing import Literal >>> import pandas as pd >>> import polars as pl + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data_gdp = { ... "datetime": [ ... datetime(2016, 1, 1), @@ -2437,19 +2685,25 @@ def join_asof( Let's define a dataframe-agnostic function in which we join over "datetime" column: - >>> @nw.narwhalify - ... def join_asof_datetime(df, other_any, strategy): - ... return df.join_asof(other_any, on="datetime", strategy=strategy) + >>> def agnostic_join_asof_datetime( + ... df_native: IntoFrameT, + ... other_native: IntoFrameT, + ... strategy: Literal["backward", "forward", "nearest"], + ... ) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... other = nw.from_native(other_native) + ... return df.join_asof(other, on="datetime", strategy=strategy).to_native() - We can now pass either pandas or Polars to the function: + We can then pass any supported library such as Pandas or Polars + to `agnostic_join_asof_datetime`: - >>> join_asof_datetime(population_pd, gdp_pd, strategy="backward") + >>> agnostic_join_asof_datetime(population_pd, gdp_pd, strategy="backward") datetime population gdp 0 2016-03-01 82.19 4164 1 2018-08-01 82.66 4566 2 2019-01-01 83.12 4696 - >>> join_asof_datetime(population_pl, gdp_pl, strategy="backward") + >>> agnostic_join_asof_datetime(population_pl, gdp_pl, strategy="backward") shape: (3, 3) ┌─────────────────────┬────────────┬──────┐ │ datetime ┆ population ┆ gdp │ @@ -2510,13 +2764,16 @@ def join_asof( Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns: - >>> @nw.narwhalify - ... def join_asof_datetime_by_ticker(df, other_any): - ... 
return df.join_asof(other_any, on="datetime", by="ticker") + >>> def agnostic_join_asof_datetime_by_ticker( + ... df_native: IntoFrameT, other_native: IntoFrameT + ... ) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... other = nw.from_native(other_native) + ... return df.join_asof(other, on="datetime", by="ticker").to_native() We can now pass either pandas or Polars to the function: - >>> join_asof_datetime_by_ticker(trades_pd, quotes_pd) + >>> agnostic_join_asof_datetime_by_ticker(trades_pd, quotes_pd) datetime ticker price quantity bid ask 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 @@ -2524,7 +2781,7 @@ def join_asof( 3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN - >>> join_asof_datetime_by_ticker(trades_pl, quotes_pl) + >>> agnostic_join_asof_datetime_by_ticker(trades_pl, quotes_pl) shape: (5, 6) ┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐ │ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │ @@ -2557,38 +2814,37 @@ def is_duplicated(self: Self) -> Series[Any]: A new Series. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl - >>> df_pd = pd.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> from narwhals.typing import IntoSeries + >>> data = { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.is_duplicated() + >>> def agnostic_is_duplicated(df_native: IntoDataFrame) -> IntoSeries: + ... df = nw.from_native(df_native, eager_only=True) + ... return df.is_duplicated().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_is_duplicated`: - >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_duplicated(df_pd) 0 True 1 False 2 False 3 True dtype: bool - >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_duplicated(df_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -2597,6 +2853,16 @@ def is_duplicated(self: Self) -> Series[Any]: false true ] + >>> agnostic_is_duplicated(df_pa) # doctest: +ELLIPSIS + + [ + [ + true, + false, + false, + true + ] + ] """ return self._series( self._compliant_frame.is_duplicated(), @@ -2610,28 +2876,35 @@ def is_empty(self: Self) -> bool: A boolean indicating whether the dataframe is empty (True) or not (False). Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame Let's define a dataframe-agnostic function that filters rows in which "foo" values are greater than 10, and then checks if the result is empty or not: - >>> @nw.narwhalify - ... def func(df): + >>> def agnostic_is_empty(df_native: IntoDataFrame) -> bool: + ... df = nw.from_native(df_native, eager_only=True) ... 
return df.filter(nw.col("foo") > 10).is_empty() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_is_empty`: - >>> df_pd = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> df_pl = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) - >>> func(df_pd), func(df_pl) - (True, True) + >>> data = {"foo": [1, 2, 3], "bar": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + >>> agnostic_is_empty(df_pd), agnostic_is_empty(df_pl), agnostic_is_empty(df_pa) + (True, True, True) - >>> df_pd = pd.DataFrame({"foo": [100, 2, 3], "bar": [4, 5, 6]}) - >>> df_pl = pl.DataFrame({"foo": [100, 2, 3], "bar": [4, 5, 6]}) - >>> func(df_pd), func(df_pl) - (False, False) + >>> data = {"foo": [100, 2, 3], "bar": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + >>> agnostic_is_empty(df_pd), agnostic_is_empty(df_pl), agnostic_is_empty(df_pa) + (False, False, False) """ return self._compliant_frame.is_empty() # type: ignore[no-any-return] @@ -2642,38 +2915,37 @@ def is_unique(self: Self) -> Series[Any]: A new Series. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl - >>> df_pd = pd.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> from narwhals.typing import IntoSeries + >>> data = { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.is_unique() + >>> def agnostic_is_unique(df_native: IntoDataFrame) -> IntoSeries: + ... df = nw.from_native(df_native, eager_only=True) + ... return df.is_unique().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_is_unique`: - >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_unique(df_pd) 0 False 1 True 2 True 3 False dtype: bool - >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_unique(df_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -2682,6 +2954,16 @@ def is_unique(self: Self) -> Series[Any]: true false ] + >>> agnostic_is_unique(df_pa) # doctest: +ELLIPSIS + + [ + [ + false, + true, + true, + false + ] + ] """ return self._series( self._compliant_frame.is_unique(), @@ -2700,11 +2982,11 @@ def null_count(self: Self) -> Self: for reference. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = { ... "foo": [1, None, 3], ... "bar": [6, 7, None], @@ -2721,7 +3003,8 @@ def null_count(self: Self) -> Self: ... df = nw.from_native(df_native) ... 
return df.null_count().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_null_count`: + We can then pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_null_count`: >>> agnostic_null_count(df_pd) foo bar ham @@ -2764,25 +3047,32 @@ def item(self: Self, row: int | None = None, column: int | str | None = None) -> With row/col, this is equivalent to df[row,col]. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function that returns item at given row/column - >>> @nw.narwhalify - ... def func(df, row, column): + >>> def agnostic_item( + ... df_native: IntoDataFrame, row: int | None, column: int | str | None + ... ): + ... df = nw.from_native(df_native, eager_only=True) ... return df.item(row, column) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_item`: - >>> func(df_pd, 1, 1), func(df_pd, 2, "b") + >>> agnostic_item(df_pd, 1, 1), agnostic_item(df_pd, 2, "b") (np.int64(5), np.int64(6)) - - >>> func(df_pl, 1, 1), func(df_pl, 2, "b") + >>> agnostic_item(df_pl, 1, 1), agnostic_item(df_pl, 2, "b") + (5, 6) + >>> agnostic_item(df_pa, 1, 1), agnostic_item(df_pa, 2, "b") (5, 6) """ return self._compliant_frame.item(row=row, column=column) @@ -2794,25 +3084,30 @@ def clone(self) -> Self: An identical copy of the original dataframe. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) Let's define a dataframe-agnostic function in which we clone the DataFrame: - >>> @nw.narwhalify - ... def func(df): - ... return df.clone() + >>> def agnostic_clone(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.clone().to_native() + + We can then pass any supported library such as Pandas or Polars + to `agnostic_clone`: - >>> func(df_pd) + >>> agnostic_clone(df_pd) a b 0 1 3 1 2 4 - >>> func(df_pl) + >>> agnostic_clone(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2836,26 +3131,32 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: The dataframe containing only the selected rows. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function in which gather every 2 rows, starting from a offset of 1: - >>> @nw.narwhalify - ... def func(df): - ... return df.gather_every(n=2, offset=1) + >>> def agnostic_gather_every(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... 
return df.gather_every(n=2, offset=1).to_native() - >>> func(df_pd) + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_gather_every`: + + >>> agnostic_gather_every(df_pd) a b 1 2 6 3 4 8 - >>> func(df_pl) + >>> agnostic_gather_every(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2865,6 +3166,13 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: │ 2 ┆ 6 │ │ 4 ┆ 8 │ └─────┴─────┘ + >>> agnostic_gather_every(df_pa) + pyarrow.Table + a: int64 + b: int64 + ---- + a: [[2,4]] + b: [[6,8]] """ return super().gather_every(n=n, offset=offset) @@ -2909,9 +3217,11 @@ def pivot( A new dataframe. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrameT >>> data = { ... "ix": [1, 1, 2, 2, 1, 2], ... "col": ["a", "a", "a", "a", "b", "b"], @@ -2920,20 +3230,22 @@ def pivot( ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.pivot("col", index="ix", aggregate_function="sum") + >>> def agnostic_pivot(df_native: IntoDataFrameT) -> IntoDataFrameT: + ... df = nw.from_native(df_native, eager_only=True) + ... return df.pivot("col", index="ix", aggregate_function="sum").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas or Polars + to `agnostic_pivot`: - >>> func(df_pd) + >>> agnostic_pivot(df_pd) ix foo_a foo_b bar_a bar_b 0 1 1 7 2 9 1 2 4 1 0 4 - >>> func(df_pl) + >>> agnostic_pivot(df_pl) shape: (2, 5) ┌─────┬───────┬───────┬───────┬───────┐ │ ix ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │ @@ -2972,20 +3284,26 @@ def to_arrow(self: Self) -> pa.Table: A new PyArrow table. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame >>> data = {"foo": [1, 2, 3], "bar": ["a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function that converts to arrow table: - >>> @nw.narwhalify - ... def func(df): + >>> def agnostic_to_arrow(df_native: IntoDataFrame) -> pa.Table: + ... df = nw.from_native(df_native, eager_only=True) ... return df.to_arrow() - >>> func(df_pd) + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_to_arrow`: + + >>> agnostic_to_arrow(df_pd) pyarrow.Table foo: int64 bar: string @@ -2993,13 +3311,21 @@ def to_arrow(self: Self) -> pa.Table: foo: [[1,2,3]] bar: [["a","b","c"]] - >>> func(df_pl) # doctest:+NORMALIZE_WHITESPACE + >>> agnostic_to_arrow(df_pl) pyarrow.Table foo: int64 bar: large_string ---- foo: [[1,2,3]] bar: [["a","b","c"]] + + >>> agnostic_to_arrow(df_pa) + pyarrow.Table + foo: int64 + bar: string + ---- + foo: [[1,2,3]] + bar: [["a","b","c"]] """ return self._compliant_frame.to_arrow() @@ -3027,25 +3353,30 @@ def sample( The results may not be consistent across libraries. 
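            A fixed `seed` makes the draw reproducible within a single backend,
            but the backends use different sampling algorithms, so the same
            seed may select different rows in, say, pandas than in Polars.
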
Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrameT >>> data = {"a": [1, 2, 3, 4], "b": ["x", "y", "x", "y"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.sample(n=2, seed=123) + >>> def agnostic_sample(df_native: IntoDataFrameT) -> IntoDataFrameT: + ... df = nw.from_native(df_native, eager_only=True) + ... return df.sample(n=2, seed=123).to_native() - We can then pass either pandas or Polars to `func`: - >>> func(df_pd) + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_sample`: + + >>> agnostic_sample(df_pd) a b 3 4 y 0 1 x - >>> func(df_pl) + >>> agnostic_sample(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -3055,6 +3386,13 @@ def sample( │ 2 ┆ y │ │ 3 ┆ x │ └─────┴─────┘ + >>> agnostic_sample(df_pa) + pyarrow.Table + a: int64 + b: string + ---- + a: [[1,3]] + b: [["x","x"]] As you can see, by using the same seed, the result will be consistent within the same backend, but not necessarely across different backends. @@ -3098,10 +3436,11 @@ def unpivot( In other frameworks, you might know this operation as `pivot_longer`. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = { ... "a": ["x", "y", "z"], ... "b": [1, 3, 5], @@ -3110,13 +3449,14 @@ def unpivot( We define a library agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.unpivot(on=["b", "c"], index="a") + >>> def agnostic_unpivot(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.unpivot(on=["b", "c"], index="a").to_native() - We can pass any supported library such as pandas, Polars or PyArrow to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_unpivot`: - >>> func(pl.DataFrame(data)) + >>> agnostic_unpivot(pl.DataFrame(data)) shape: (6, 3) ┌─────┬──────────┬───────┐ │ a ┆ variable ┆ value │ @@ -3131,7 +3471,7 @@ def unpivot( │ z ┆ c ┆ 6 │ └─────┴──────────┴───────┘ - >>> func(pd.DataFrame(data)) + >>> agnostic_unpivot(pd.DataFrame(data)) a variable value 0 x b 1 1 y b 3 @@ -3140,7 +3480,7 @@ def unpivot( 4 y c 4 5 z c 6 - >>> func(pa.table(data)) + >>> agnostic_unpivot(pa.table(data)) pyarrow.Table a: string variable: string @@ -3169,11 +3509,11 @@ def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Sel New DataFrame Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoDataFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> data = { ... "a": ["x", "y", "z", "w"], ... "lst1": [[1, 2], None, [None], []], @@ -3182,7 +3522,7 @@ def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Sel We define a library agnostic function: - >>> def agnostic_explode(df_native: IntoDataFrameT) -> IntoDataFrameT: + >>> def agnostic_explode(df_native: IntoFrameT) -> IntoFrameT: ... return ( ... nw.from_native(df_native) ... 
.with_columns(nw.col("lst1", "lst2").cast(nw.List(nw.Int32()))) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 2b5be2eee..f7705713f 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -180,11 +180,10 @@ def lazy(self) -> LazyFrame[Any]: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrame - >>> - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -192,7 +191,8 @@ def lazy(self) -> LazyFrame[Any]: ... df = nw.from_native(df_native) ... return df.lazy().to_native() - Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame becomes a Polars LazyFrame: + Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame + becomes a Polars LazyFrame: >>> agnostic_lazy(df_pd) foo bar ham @@ -228,7 +228,7 @@ def to_dict( Arguments: as_series: If set to true ``True``, then the values are Narwhals Series, - otherwise the values are Any. + otherwise the values are Any. Returns: A mapping from column name to values / Series. @@ -239,17 +239,16 @@ def to_dict( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoDataFrame - >>> - >>> df = { + >>> data = { ... "A": [1, 2, 3, 4, 5], ... "fruits": ["banana", "banana", "apple", "apple", "banana"], ... "B": [5, 4, 3, 2, 1], ... "animals": ["beetle", "fly", "beetle", "beetle", "beetle"], ... "optional": [28, 300, None, 2, -30], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> df_pa = pa.table(df) + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: @@ -277,38 +276,37 @@ def is_duplicated(self: Self) -> Series: A new Series. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl - >>> df_pd = pd.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> from narwhals.typing import IntoSeries + >>> data = { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.is_duplicated() + >>> def agnostic_is_duplicated(df_native: IntoDataFrame) -> IntoSeries: + ... df = nw.from_native(df_native, eager_only=True) + ... 
return df.is_duplicated().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_is_duplicated`: - >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_duplicated(df_pd) 0 True 1 False 2 False 3 True dtype: bool - >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_duplicated(df_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -317,6 +315,16 @@ def is_duplicated(self: Self) -> Series: false true ] + >>> agnostic_is_duplicated(df_pa) # doctest: +ELLIPSIS + + [ + [ + true, + false, + false, + true + ] + ] """ return super().is_duplicated() # type: ignore[return-value] @@ -327,38 +335,37 @@ def is_unique(self: Self) -> Series: A new Series. Examples: - >>> import narwhals as nw >>> import pandas as pd >>> import polars as pl - >>> df_pd = pd.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "a": [1, 2, 3, 1], - ... "b": ["x", "y", "z", "x"], - ... } - ... ) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> from narwhals.typing import IntoSeries + >>> data = { + ... "a": [1, 2, 3, 1], + ... "b": ["x", "y", "z", "x"], + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.is_unique() + >>> def agnostic_is_unique(df_native: IntoDataFrame) -> IntoSeries: + ... df = nw.from_native(df_native, eager_only=True) + ... return df.is_unique().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_is_unique`: - >>> func(df_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_unique(df_pd) 0 False 1 True 2 True 3 False dtype: bool - >>> func(df_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_unique(df_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -367,6 +374,16 @@ def is_unique(self: Self) -> Series: true false ] + >>> agnostic_is_unique(df_pa) # doctest: +ELLIPSIS + + [ + [ + false, + true, + true, + false + ] + ] """ return super().is_unique() # type: ignore[return-value] From 3124bf33f2c33c295e31f129aa286c02100d210d Mon Sep 17 00:00:00 2001 From: Cameron <96146940+camriddell@users.noreply.github.com> Date: Wed, 1 Jan 2025 00:00:02 -0800 Subject: [PATCH 05/13] enh modin dtype interoperability (#1692) --- narwhals/_pandas_like/group_by.py | 17 ++--------------- narwhals/_pandas_like/utils.py | 2 +- tests/conftest.py | 7 ++++++- tests/expr_and_series/arithmetic_test.py | 12 ++++++------ tests/expr_and_series/cast_test.py | 4 ++-- tests/expr_and_series/convert_time_zone_test.py | 12 +++++++++--- .../dt/datetime_attributes_test.py | 9 ++++++++- tests/expr_and_series/dt/timestamp_test.py | 7 ++++++- tests/expr_and_series/is_finite_test.py | 16 +++++++++++----- tests/expr_and_series/replace_time_zone_test.py | 12 ++++++++---- .../str/to_uppercase_to_lowercase_test.py | 3 ++- tests/frame/to_numpy_test.py | 5 +---- tests/frame/to_pandas_test.py | 5 ++--- tests/group_by_test.py | 7 ++++--- tests/series_only/to_numpy_test.py | 6 ++---- tests/series_only/to_pandas_test.py | 2 +- 16 files changed, 71 insertions(+), 55 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index ae897ee98..e8cf77a87 
100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -18,7 +18,6 @@ from narwhals.utils import Implementation from narwhals.utils import find_stacklevel from narwhals.utils import remove_prefix -from narwhals.utils import tupleify if TYPE_CHECKING: from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -124,20 +123,8 @@ def _from_native_frame(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame: ) def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]: - indices = self._grouped.indices - if ( - self._df._implementation is Implementation.PANDAS - and self._df._backend_version < (2, 2) - ) or ( - self._df._implementation is Implementation.CUDF - and self._df._backend_version < (2024, 12) - ): # pragma: no cover - for key in indices: - yield (key, self._from_native_frame(self._grouped.get_group(key))) - else: - for key in indices: - key = tupleify(key) # noqa: PLW2901 - yield (key, self._from_native_frame(self._grouped.get_group(key))) + for key, group in self._grouped: + yield (key, self._from_native_frame(group)) def agg_pandas( # noqa: PLR0915 diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 0bb7edf8f..8d67aa65e 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -481,7 +481,7 @@ def native_to_narwhals_dtype( def get_dtype_backend(dtype: Any, implementation: Implementation) -> str: - if implementation is Implementation.PANDAS: + if implementation in [Implementation.PANDAS, Implementation.MODIN]: import pandas as pd if hasattr(pd, "ArrowDtype") and isinstance(dtype, pd.ArrowDtype): diff --git a/tests/conftest.py b/tests/conftest.py index cb8a982a3..1e87aa7e8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -71,6 +71,11 @@ def pandas_pyarrow_constructor(obj: Any) -> IntoDataFrame: def modin_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover + mpd = get_modin() + return mpd.DataFrame(pd.DataFrame(obj)) # type: ignore[no-any-return] + + +def modin_pyarrow_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover mpd = get_modin() return mpd.DataFrame(pd.DataFrame(obj)).convert_dtypes(dtype_backend="pyarrow") # type: ignore[no-any-return] @@ -146,7 +151,7 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover lazy_constructors = [polars_lazy_constructor] if get_modin() is not None: # pragma: no cover - eager_constructors.append(modin_constructor) + eager_constructors.extend([modin_constructor, modin_pyarrow_constructor]) if get_cudf() is not None: eager_constructors.append(cudf_constructor) # pragma: no cover if get_dask_dataframe() is not None: # pragma: no cover diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index 12f931baa..eb38c6a14 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -38,7 +38,7 @@ def test_arithmetic_expr( request: pytest.FixtureRequest, ) -> None: if attr == "__mod__" and any( - x in str(constructor) for x in ["pandas_pyarrow", "modin"] + x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) @@ -68,7 +68,7 @@ def test_right_arithmetic_expr( request: pytest.FixtureRequest, ) -> None: if attr == "__rmod__" and any( - x in str(constructor) for x in ["pandas_pyarrow", "modin"] + x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) @@ -99,7 +99,7 @@ def test_arithmetic_series( request: 
pytest.FixtureRequest, ) -> None: if attr == "__mod__" and any( - x in str(constructor_eager) for x in ["pandas_pyarrow", "modin"] + x in str(constructor_eager) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) @@ -129,7 +129,7 @@ def test_right_arithmetic_series( request: pytest.FixtureRequest, ) -> None: if attr == "__rmod__" and any( - x in str(constructor_eager) for x in ["pandas_pyarrow", "modin"] + x in str(constructor_eager) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) @@ -242,7 +242,7 @@ def test_arithmetic_expr_left_literal( request: pytest.FixtureRequest, ) -> None: if attr == "__mod__" and any( - x in str(constructor) for x in ["pandas_pyarrow", "modin"] + x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) @@ -273,7 +273,7 @@ def test_arithmetic_series_left_literal( request: pytest.FixtureRequest, ) -> None: if attr == "__mod__" and any( - x in str(constructor_eager) for x in ["pandas_pyarrow", "modin"] + x in str(constructor_eager) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index 992ea5f54..8cdceb9cb 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -63,7 +63,7 @@ def test_cast( 15, ): # pragma: no cover request.applymarker(pytest.mark.xfail) - if "modin" in str(constructor): + if "modin_constructor" in str(constructor): # TODO(unassigned): in modin, we end up with `' None: if ( - (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + ("pyarrow" in str(constructor) and is_windows()) + or ("pyarrow_table" in str(constructor) and is_windows()) or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) + or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("cudf" in str(constructor)) ): request.applymarker(pytest.mark.xfail) @@ -49,8 +51,10 @@ def test_convert_time_zone_series( request: pytest.FixtureRequest, ) -> None: if ( - (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) + ("pyarrow" in str(constructor_eager) and is_windows()) + or ("pyarrow_table" in str(constructor_eager) and is_windows()) or ("pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1)) + or ("modin_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1)) or ("cudf" in str(constructor_eager)) ): request.applymarker(pytest.mark.xfail) @@ -74,8 +78,10 @@ def test_convert_time_zone_from_none( constructor: Constructor, request: pytest.FixtureRequest ) -> None: if ( - (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + ("pyarrow" in str(constructor) and is_windows()) + or ("pyarrow_table" in str(constructor) and is_windows()) or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) + or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) ): diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index 0e4c7c992..0235920e7 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -95,6 +95,8 @@ def test_datetime_chained_attributes( ) -> None: if "pandas" in str(constructor_eager) and "pyarrow" not in str(constructor_eager): 
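        # chained .dt attributes are not supported for NumPy-backed pandas, hence xfail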
request.applymarker(pytest.mark.xfail)
+    if "modin" in str(constructor_eager) and "pyarrow" not in str(constructor_eager):
+        request.applymarker(pytest.mark.xfail)
     if "cudf" in str(constructor_eager):
         request.applymarker(pytest.mark.xfail)
 
@@ -109,7 +111,12 @@ def test_datetime_chained_attributes(
 def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> None:
     if any(
         x in str(constructor)
-        for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf")
+        for x in (
+            "pandas_constructor",
+            "pandas_nullable_constructor",
+            "cudf",
+            "modin_constructor",
+        )
     ):
         request.applymarker(pytest.mark.xfail)
     dates = {"a": [datetime(2001, 1, 1), None, datetime(2001, 1, 3)]}
diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py
index 517827941..e205d8179 100644
--- a/tests/expr_and_series/dt/timestamp_test.py
+++ b/tests/expr_and_series/dt/timestamp_test.py
@@ -138,7 +138,12 @@ def test_timestamp_dates(
 ) -> None:
     if any(
         x in str(constructor)
-        for x in ("pandas_constructor", "pandas_nullable_constructor", "cudf")
+        for x in (
+            "pandas_constructor",
+            "pandas_nullable_constructor",
+            "cudf",
+            "modin_constructor",
+        )
     ):
         request.applymarker(pytest.mark.xfail)
 
diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py
index 90f4c3b6b..270ba7d52 100644
--- a/tests/expr_and_series/is_finite_test.py
+++ b/tests/expr_and_series/is_finite_test.py
@@ -14,9 +14,13 @@
 def test_is_finite_expr(constructor: Constructor) -> None:
     if "polars" in str(constructor) or "pyarrow_table" in str(constructor):
         expected = {"a": [False, False, True, None]}
-    elif "pandas_constructor" in str(constructor) or "dask" in str(constructor):
+    elif (
+        "pandas_constructor" in str(constructor)
+        or "dask" in str(constructor)
+        or "modin_constructor" in str(constructor)
+    ):
         expected = {"a": [False, False, True, False]}
-    else:  # pandas_nullable_constructor, pandas_pyarrow_constructor, modin
+    else:  # pandas_nullable_constructor, pandas_pyarrow_constructor, modin_pyarrow_constructor
         expected = {"a": [None, False, True, None]}
 
     df = nw.from_native(constructor(data))
@@ -28,11 +32,13 @@ def test_is_finite_expr(constructor: Constructor) -> None:
 def test_is_finite_series(constructor_eager: ConstructorEager) -> None:
     if "polars" in str(constructor_eager) or "pyarrow_table" in str(constructor_eager):
         expected = {"a": [False, False, True, None]}
-    elif "pandas_constructor" in str(constructor_eager) or "dask" in str(
-        constructor_eager
+    elif (
+        "pandas_constructor" in str(constructor_eager)
+        or "dask" in str(constructor_eager)
+        or "modin_constructor" in str(constructor_eager)
     ):
         expected = {"a": [False, False, True, False]}
-    else:  # pandas_nullable_constructor, pandas_pyarrow_constructor, modin
+    else:  # pandas_nullable_constructor, pandas_pyarrow_constructor, modin_pyarrow_constructor
         expected = {"a": [None, False, True, None]}
 
     df = nw.from_native(constructor_eager(data), eager_only=True)
diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py
index 76e786042..94367d1e1 100644
--- a/tests/expr_and_series/replace_time_zone_test.py
+++ b/tests/expr_and_series/replace_time_zone_test.py
@@ -21,8 +21,9 @@ def test_replace_time_zone(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
     if (
-        (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows())
+        ("pyarrow" in str(constructor) and is_windows())
         or ("pandas_pyarrow" in str(constructor)
and PANDAS_VERSION < (2,)) + or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) ): @@ -47,8 +48,9 @@ def test_replace_time_zone_none( constructor: Constructor, request: pytest.FixtureRequest ) -> None: if ( - (any(x in str(constructor) for x in ("pyarrow", "modin")) and is_windows()) + ("pyarrow" in str(constructor) and is_windows()) or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) + or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) ): request.applymarker(pytest.mark.xfail) @@ -72,8 +74,9 @@ def test_replace_time_zone_series( constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if ( - (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) + ("pyarrow" in str(constructor_eager) and is_windows()) or ("pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2,)) + or ("modin_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor_eager) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor_eager)) ): @@ -98,8 +101,9 @@ def test_replace_time_zone_none_series( constructor_eager: ConstructorEager, request: pytest.FixtureRequest ) -> None: if ( - (any(x in str(constructor_eager) for x in ("pyarrow", "modin")) and is_windows()) + ("pyarrow" in str(constructor_eager) and is_windows()) or ("pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2,)) + or ("modin_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor_eager) and PYARROW_VERSION < (12,)) ): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index 6ab26ac41..1d0eb8834 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -38,7 +38,7 @@ def test_str_to_uppercase( in ( "pandas_pyarrow_constructor", "pyarrow_table_constructor", - "modin_constructor", + "modin_pyarrow_constructor", ) or ("dask" in str(constructor) and PYARROW_VERSION >= (12,)) ): @@ -80,6 +80,7 @@ def test_str_to_uppercase_series( "pandas_nullable_constructor", "polars_eager_constructor", "cudf_constructor", + "modin_constructor", ) ): # We are marking it xfail for these conditions above diff --git a/tests/frame/to_numpy_test.py b/tests/frame/to_numpy_test.py index 0b631a3db..e1179f3a5 100644 --- a/tests/frame/to_numpy_test.py +++ b/tests/frame/to_numpy_test.py @@ -31,10 +31,7 @@ def test_to_numpy_tz_aware( if ( ("pyarrow_table" in str(constructor_eager) and PYARROW_VERSION < (12,)) or ("pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 2)) - or ( - any(x in str(constructor_eager) for x in ("pyarrow", "modin")) - and is_windows() - ) + or ("pyarrow" in str(constructor_eager) and is_windows()) ): request.applymarker(pytest.mark.xfail) df = nw.from_native( diff --git a/tests/frame/to_pandas_test.py b/tests/frame/to_pandas_test.py index d9bce7a69..1bc588f35 100644 --- a/tests/frame/to_pandas_test.py +++ b/tests/frame/to_pandas_test.py @@ -19,16 +19,15 @@ ) def test_convert_pandas( constructor_eager: ConstructorEager, - request: pytest.FixtureRequest, ) -> None: - if "modin" in str(constructor_eager): - request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 
4, 6], "z": [7.0, 8, 9]} df_raw = constructor_eager(data) result = nw.from_native(df_raw).to_pandas() # type: ignore[union-attr] if constructor_eager.__name__.startswith("pandas"): expected = constructor_eager(data) + elif "modin_pyarrow" in str(constructor_eager): + expected = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") else: expected = pd.DataFrame(data) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index f98508ef3..f60df3690 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -321,8 +321,8 @@ def test_key_with_nulls_iter( constructor_eager: ConstructorEager, request: pytest.FixtureRequest, ) -> None: - if PANDAS_VERSION < (1, 3) and "pandas_constructor" in str(constructor_eager): - # bug in old pandas + if PANDAS_VERSION < (1, 0) and "pandas_constructor" in str(constructor_eager): + # Grouping by null values is not supported in pandas < 1.0.0 request.applymarker(pytest.mark.xfail) data = {"b": ["4", "5", None, "7"], "a": [1, 2, 3, 4], "c": ["4", "3", None, None]} result = dict( @@ -330,6 +330,7 @@ def test_key_with_nulls_iter( .group_by("b", "c", drop_null_keys=True) .__iter__() ) + assert len(result) == 2 assert_equal_data(result[("4", "4")], {"b": ["4"], "a": [1], "c": ["4"]}) assert_equal_data(result[("5", "3")], {"b": ["5"], "a": [2], "c": ["3"]}) @@ -415,7 +416,7 @@ def test_double_same_aggregation( def test_all_kind_of_aggs( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if any(x in str(constructor) for x in ("dask", "cudf", "modin_constructor")): + if any(x in str(constructor) for x in ("dask", "cudf", "modin")): # bugged in dask https://github.com/dask/dask/issues/11612 # and modin lol https://github.com/modin-project/modin/issues/7414 # and cudf https://github.com/rapidsai/cudf/issues/17649 diff --git a/tests/series_only/to_numpy_test.py b/tests/series_only/to_numpy_test.py index 8e36ac128..166d18777 100644 --- a/tests/series_only/to_numpy_test.py +++ b/tests/series_only/to_numpy_test.py @@ -42,10 +42,8 @@ def test_to_numpy_tz_aware( if ( ("pyarrow_table" in str(constructor_eager) and PYARROW_VERSION < (12,)) or ("pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 2)) - or ( - any(x in str(constructor_eager) for x in ("pyarrow", "modin")) - and is_windows() - ) + or ("modin_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 2)) + or ("pyarrow" in str(constructor_eager) and is_windows()) ): request.applymarker(pytest.mark.xfail) request.applymarker(pytest.mark.xfail) diff --git a/tests/series_only/to_pandas_test.py b/tests/series_only/to_pandas_test.py index 387af2709..a1c3985cd 100644 --- a/tests/series_only/to_pandas_test.py +++ b/tests/series_only/to_pandas_test.py @@ -22,7 +22,7 @@ def test_convert( ) -> None: if any( cname in str(constructor_eager) - for cname in ("pandas_nullable", "pandas_pyarrow", "modin") + for cname in ("pandas_nullable", "pandas_pyarrow", "modin_pyarrow") ): request.applymarker(pytest.mark.xfail) From f5a33dde80135dc1a27d56732504fdc2ff2486b5 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 1 Jan 2025 13:42:43 +0000 Subject: [PATCH 06/13] test: Make constructors configurable when running tests (#1694) --- .github/workflows/check_tpch_queries.yml | 2 +- .github/workflows/extremes.yml | 12 +- .github/workflows/pytest.yml | 19 ++- .github/workflows/random_ci_pytest.yml | 12 +- CONTRIBUTING.md | 9 +- narwhals/_pandas_like/group_by.py | 10 +- noxfile.py | 16 +- pyproject.toml | 23 ++- tests/conftest.py | 154 +++++++++++------- tests/dtypes_test.py | 3 
+- tests/expr_and_series/over_test.py | 2 +- tests/frame/drop_test.py | 2 +- .../interchange_native_namespace_test.py | 2 +- tests/frame/interchange_schema_test.py | 2 +- tests/frame/interchange_select_test.py | 2 +- tests/frame/interchange_to_arrow_test.py | 2 +- tests/frame/interchange_to_pandas_test.py | 2 +- tests/frame/schema_test.py | 2 +- tests/group_by_test.py | 7 +- tests/hypothesis/getitem_test.py | 20 ++- tpch/generate_data.py | 3 +- 21 files changed, 177 insertions(+), 129 deletions(-) diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 888f40048..8924421f6 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -25,7 +25,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: local-install - run: uv pip install -e ".[dev, dask]" --system + run: uv pip install -e ".[dev, core, dask]" --system - name: generate-data run: cd tpch && python generate_data.py - name: tpch-tests diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index d167d1429..71b07d5f3 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -41,7 +41,7 @@ jobs: echo "$DEPS" | grep 'scipy==1.5.0' echo "$DEPS" | grep 'scikit-learn==1.1.0' - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy] pretty_old_versions: strategy: @@ -60,7 +60,7 @@ jobs: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - - name: install-minimum-versions + - name: install-pretty-old-versions run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.3.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system - name: install-reqs run: uv pip install -e ".[dev]" --system @@ -79,7 +79,7 @@ jobs: echo "$DEPS" | grep 'scipy==1.5.0' echo "$DEPS" | grep 'scikit-learn==1.1.0' - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy] not_so_old_versions: strategy: @@ -116,7 +116,7 @@ jobs: echo "$DEPS" | grep 'scikit-learn==1.3.0' echo "$DEPS" | grep 'dask==2024.7' - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask nightlies: strategy: @@ -179,4 +179,6 @@ jobs: echo "$DEPS" | grep 'numpy' echo "$DEPS" | grep 'dask' - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow + run: | + pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow \ + --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,polars[eager],polars[lazy],dask diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 5c099bfad..b02b74c07 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -25,11 +25,11 @@ jobs: cache-dependency-glob: "pyproject.toml" - name: install-reqs # Python3.8 is technically at end-of-life, so we don't test everything - run: uv pip install -e ".[dev]" --system + run: uv pip install -e ".[dev, core]" 
--system - name: show-deps run: uv pip freeze - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=85 + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=85 --constructors=pandas,pyarrow,polars[eager],polars[lazy] pytest-windows: strategy: @@ -49,11 +49,12 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-reqs - run: uv pip install -e ".[dev, extra]" --system + run: uv pip install -e ".[dev, core, extra, dask, modin]" --system - name: show-deps run: uv pip freeze - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95 + run: | + pytest tests --cov=narwhals --cov=tests --runslow --cov-fail-under=95 --all-cpu-constructors pytest-full-coverage: strategy: @@ -73,16 +74,20 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-reqs - run: uv pip install -e ".[dev, extra]" --system + run: uv pip install -e ".[dev, core, extra, modin, dask]" --system + - name: install pyspark + run: uv pip install -e ".[pyspark]" --system + # PySpark is not yet available on Python3.12+ + if: matrix.python-version == '3.9' || matrix.python-version == '3.11' - name: install ibis - run: uv pip install ibis-framework>=6.0.0 rich packaging pyarrow_hotfix --system + run: uv pip install -e ".[ibis]" --system # Ibis puts upper bounds on dependencies, and requires Python3.10+, # which messes with other dependencies on lower Python versions if: matrix.python-version == '3.11' - name: show-deps run: uv pip freeze - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow + run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=100 --runslow --all-cpu-constructors - name: Run doctests # reprs differ between versions, so we only run doctests on the latest Python if: matrix.python-version == '3.13' diff --git a/.github/workflows/random_ci_pytest.yml b/.github/workflows/random_ci_pytest.yml index 599dab3c0..67f416c21 100644 --- a/.github/workflows/random_ci_pytest.yml +++ b/.github/workflows/random_ci_pytest.yml @@ -22,17 +22,15 @@ jobs: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - - name: install package - run: uv pip install -e . --system - name: generate-random-versions run: python utils/generate_random_versions.py - - name: install-reqs - run: uv pip install -e ".[dev]" --system - - name: uninstall scipy/sklearn - run: uv pip uninstall scipy scikit-learn --system - name: install-random-verions run: uv pip install -r random-requirements.txt --system + - name: install-narwhals + run: uv pip install -e ".[dev]" --system - name: show versions run: uv pip freeze - name: Run pytest - run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=80 + run: | + pytest tests --cov=narwhals --cov=tests --cov-fail-under=80 \ + --constructors=pandas,pyarrow,polars[eager],polars[lazy] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7083129a7..b61ed663b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -96,8 +96,9 @@ Here's how you can set up your local development environment to contribute. uv venv -p 3.12 --seed ``` 4. Activate it. On Linux, this is `. .venv/bin/activate`, on Windows `.\.venv\Scripts\activate`. -2. Install Narwhals: `uv pip install -e ".[dev, docs]"`. If you also want to test Dask , PySpark, and - Modin, you can install them too with `uv pip install -e ".[dev, docs, extra]"`. +2. 
Install Narwhals: `uv pip install -e ".[dev, core, docs]"`. This will include fast-ish core libraries.
+   If you also want to test other libraries like Dask, PySpark, and Modin, you can install them too with
+   `uv pip install -e ".[dev, core, docs, dask, pyspark, modin]"`.
 3. Install a fork of griffe:
    ```
    uv pip install git+https://github.com/MarcoGorelli/griffe.git@no-overloads
    ```
@@ -131,6 +132,10 @@ If you add code that should be tested, please add tests.
 - To run tests on the doctests, use `pytest narwhals --doctest-modules`
 - To run unit tests and doctests at the same time, run `pytest tests narwhals --cov=narwhals --doctest-modules`
 - To run tests multiprocessed, you may also want to use [pytest-xdist](https://github.com/pytest-dev/pytest-xdist) (optional)
+- To choose which backends to run tests with, you can use the `--constructors` flag:
+  - to only run tests for pandas, Polars, and PyArrow, use `pytest --constructors=pandas,pyarrow,polars`
+  - to run tests for all CPU constructors, use `pytest --all-cpu-constructors`
+  - by default, tests run for pandas, pandas (PyArrow dtypes), PyArrow, and Polars.
 
 If you want to have less surprises when opening a PR, you can take advantage of [nox](https://nox.thea.codes/en/stable/index.html) to run the entire CI/CD test suite locally in your operating system.
diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py
index e8cf77a87..3741c7130 100644
--- a/narwhals/_pandas_like/group_by.py
+++ b/narwhals/_pandas_like/group_by.py
@@ -123,8 +123,14 @@ def _from_native_frame(self, df: PandasLikeDataFrame) -> PandasLikeDataFrame:
         )
 
     def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]:
-        for key, group in self._grouped:
-            yield (key, self._from_native_frame(group))
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message=".*a length 1 tuple will be returned",
+                category=FutureWarning,
+            )
+            for key, group in self._grouped:
+                yield (key, self._from_native_frame(group))
 
     def agg_pandas(  # noqa: PLR0915
diff --git a/noxfile.py b/noxfile.py
index c17e41ae4..49bb60535 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -14,10 +14,12 @@
 
 
 def run_common(session: Session, coverage_threshold: float) -> None:
-    session.install("-e .[dev]")
-
-    if session.python != "3.8":
-        session.install("ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix")
+    if session.python == "3.12":
+        session.install("-e .[dev,extra,dask,modin]")
+    elif session.python != "3.8":
+        session.install("-e .[dev,extra,dask,modin,pyspark,ibis]")
+    else:
+        session.install("-e .[dev]")
 
     session.run(
         "pytest",
@@ -34,11 +36,7 @@ def run_common(session: Session, coverage_threshold: float) -> None:
 
 @nox.session(python=PYTHON_VERSIONS)  # type: ignore[misc]
 def pytest_coverage(session: Session) -> None:
-    if session.python == "3.8":
-        coverage_threshold = 85
-    else:
-        coverage_threshold = 100
-    session.install("modin[dask]")
+    coverage_threshold = 85 if session.python == "3.8" else 100
     run_common(session, coverage_threshold)
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 8193e0284..45aa54ad8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,30 +25,30 @@ cudf = ["cudf>=24.10.0"]
 modin = ["modin"]
 pandas = ["pandas>=0.25.3"]
 polars = ["polars>=0.20.3"]
+ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"]
 pyarrow = ["pyarrow>=11.0.0"]
 dask = ["dask[dataframe]>=2024.7"]
+duckdb = ["duckdb>=1.0"]
 pyspark = ["pyspark>=3.3.0"]
 dev = [
-    "tqdm",
     "covdefaults",
-    "duckdb",
-    "pandas",
-    "polars",
     "pre-commit",
-    
"pyarrow", - "pyarrow-stubs", "pytest", "pytest-cov", "pytest-randomly", "pytest-env", - "hypothesis[numpy]", + "hypothesis", "typing_extensions", ] +core = [ + "duckdb", + "pandas", + "polars", + "pyarrow", + "pyarrow-stubs", +] extra = [ # heavier dependencies we don't necessarily need in every testing job "scikit-learn", - "pyspark; python_version >= '3.9' and python_version < '3.12'", - "dask[dataframe]; python_version >= '3.9'", - "modin", ] docs = [ "black", # required by mkdocstrings_handlers @@ -110,9 +110,8 @@ lint.ignore = [ "E501", "FIX", "ISC001", - "NPY002", - "PD901", # This is a auxiliary library so dataframe variables have no concrete business meaning "PD010", + "PD901", # This is a auxiliary library so dataframe variables have no concrete business meaning "PLR0911", "PLR0912", "PLR0913", diff --git a/tests/conftest.py b/tests/conftest.py index 1e87aa7e8..28fbc7610 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,54 +1,67 @@ from __future__ import annotations -import contextlib +import os from typing import TYPE_CHECKING from typing import Any +from typing import Callable from typing import Generator +from typing import Sequence import pandas as pd import polars as pl import pyarrow as pa import pytest -from narwhals.dependencies import get_dask_dataframe -from narwhals.stable.v1.dependencies import get_cudf -from narwhals.stable.v1.dependencies import get_modin -from tests.utils import PANDAS_VERSION - if TYPE_CHECKING: from narwhals.typing import IntoDataFrame from narwhals.typing import IntoFrame - from tests.utils import Constructor - from tests.utils import ConstructorEager - -with contextlib.suppress(ImportError): - import modin.pandas # noqa: F401 -with contextlib.suppress(ImportError): - import dask.dataframe # noqa: F401 -with contextlib.suppress(ImportError): - import cudf # noqa: F401 -with contextlib.suppress(ImportError): - from pyspark.sql import SparkSession if TYPE_CHECKING: from pyspark.sql import SparkSession from narwhals.typing import IntoDataFrame from narwhals.typing import IntoFrame - from tests.utils import Constructor + +# When testing cudf.pandas in Kaggle, we get an error if we try to run +# python -m cudf.pandas -m pytest --constructors=pandas. This gives us +# a way to run `python -m cudf.pandas -m pytest` and control which constructors +# get tested. 
+if default_constructors := os.environ.get( + "NARWHALS_DEFAULT_CONSTRUCTORS", None +): # pragma: no cover + DEFAULT_CONSTRUCTORS = default_constructors +else: + DEFAULT_CONSTRUCTORS = ( + "pandas,pandas[nullable],pandas[pyarrow],polars[eager],polars[lazy],pyarrow" + ) def pytest_addoption(parser: Any) -> None: parser.addoption( "--runslow", action="store_true", default=False, help="run slow tests" ) + parser.addoption( + "--all-cpu-constructors", + action="store_true", + default=False, + help="run tests with all cpu constructors", + ) + parser.addoption( + "--constructors", + action="store", + default=DEFAULT_CONSTRUCTORS, + type=str, + help="libraries to test", + ) -def pytest_configure(config: Any) -> None: +def pytest_configure(config: pytest.Config) -> None: config.addinivalue_line("markers", "slow: mark test as slow to run") -def pytest_collection_modifyitems(config: Any, items: Any) -> Any: # pragma: no cover +def pytest_collection_modifyitems( + config: pytest.Config, items: Sequence[pytest.Function] +) -> None: # pragma: no cover if config.getoption("--runslow"): # --runslow given in cli: do not skip slow tests return @@ -71,17 +84,20 @@ def pandas_pyarrow_constructor(obj: Any) -> IntoDataFrame: def modin_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover - mpd = get_modin() + import modin.pandas as mpd + return mpd.DataFrame(pd.DataFrame(obj)) # type: ignore[no-any-return] def modin_pyarrow_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover - mpd = get_modin() + import modin.pandas as mpd + return mpd.DataFrame(pd.DataFrame(obj)).convert_dtypes(dtype_backend="pyarrow") # type: ignore[no-any-return] def cudf_constructor(obj: Any) -> IntoDataFrame: # pragma: no cover - cudf = get_cudf() + import cudf + return cudf.DataFrame(obj) # type: ignore[no-any-return] @@ -94,12 +110,14 @@ def polars_lazy_constructor(obj: Any) -> pl.LazyFrame: def dask_lazy_p1_constructor(obj: Any) -> IntoFrame: # pragma: no cover - dd = get_dask_dataframe() + import dask.dataframe as dd + return dd.from_dict(obj, npartitions=1) # type: ignore[no-any-return] def dask_lazy_p2_constructor(obj: Any) -> IntoFrame: # pragma: no cover - dd = get_dask_dataframe() + import dask.dataframe as dd + return dd.from_dict(obj, npartitions=2) # type: ignore[no-any-return] @@ -115,7 +133,6 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover pytest.skip("pyspark is not installed") return - import os import warnings os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" @@ -138,35 +155,56 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover session.stop() -if PANDAS_VERSION >= (2, 0, 0): - eager_constructors = [ - pandas_constructor, - pandas_nullable_constructor, - pandas_pyarrow_constructor, - ] -else: # pragma: no cover - eager_constructors = [pandas_constructor] - -eager_constructors.extend([polars_eager_constructor, pyarrow_table_constructor]) -lazy_constructors = [polars_lazy_constructor] - -if get_modin() is not None: # pragma: no cover - eager_constructors.extend([modin_constructor, modin_pyarrow_constructor]) -if get_cudf() is not None: - eager_constructors.append(cudf_constructor) # pragma: no cover -if get_dask_dataframe() is not None: # pragma: no cover - # TODO(unassigned): reinstate both dask constructors once if/when we have a dask use-case - # lazy_constructors.extend([dask_lazy_p1_constructor, dask_lazy_p2_constructor]) # noqa: ERA001 - lazy_constructors.append(dask_lazy_p2_constructor) # type: ignore # noqa: PGH003 - - 
-@pytest.fixture(params=eager_constructors) -def constructor_eager( - request: pytest.FixtureRequest, -) -> ConstructorEager: - return request.param # type: ignore[no-any-return] - - -@pytest.fixture(params=[*eager_constructors, *lazy_constructors]) -def constructor(request: pytest.FixtureRequest) -> Constructor: - return request.param # type: ignore[no-any-return] +EAGER_CONSTRUCTORS: dict[str, Callable[[Any], IntoDataFrame]] = { + "pandas": pandas_constructor, + "pandas[nullable]": pandas_nullable_constructor, + "pandas[pyarrow]": pandas_pyarrow_constructor, + "pyarrow": pyarrow_table_constructor, + "modin": modin_constructor, + "modin[pyarrow]": modin_pyarrow_constructor, + "cudf": cudf_constructor, + "polars[eager]": polars_eager_constructor, +} +LAZY_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = { + "dask": dask_lazy_p2_constructor, + "polars[lazy]": polars_lazy_constructor, +} +GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor} + + +def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: + if metafunc.config.getoption("all_cpu_constructors"): + selected_constructors: list[str] = [ + *iter(EAGER_CONSTRUCTORS.keys()), + *iter(LAZY_CONSTRUCTORS.keys()), + ] + selected_constructors = [ + x for x in selected_constructors if x not in GPU_CONSTRUCTORS + ] + else: # pragma: no cover + selected_constructors = metafunc.config.getoption("constructors").split(",") + + eager_constructors: list[Callable[[Any], IntoDataFrame]] = [] + eager_constructors_ids: list[str] = [] + constructors: list[Callable[[Any], IntoFrame]] = [] + constructors_ids: list[str] = [] + + for constructor in selected_constructors: + if constructor in EAGER_CONSTRUCTORS: + eager_constructors.append(EAGER_CONSTRUCTORS[constructor]) + eager_constructors_ids.append(constructor) + constructors.append(EAGER_CONSTRUCTORS[constructor]) + constructors_ids.append(constructor) + elif constructor in LAZY_CONSTRUCTORS: + constructors.append(LAZY_CONSTRUCTORS[constructor]) + constructors_ids.append(constructor) + else: # pragma: no cover + msg = f"Expected one of {EAGER_CONSTRUCTORS.keys()} or {LAZY_CONSTRUCTORS.keys()}, got {constructor}" + raise ValueError(msg) + + if "constructor_eager" in metafunc.fixturenames: + metafunc.parametrize( + "constructor_eager", eager_constructors, ids=eager_constructors_ids + ) + elif "constructor" in metafunc.fixturenames: + metafunc.parametrize("constructor", constructors, ids=constructors_ids) diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py index aa497785a..97ca384c8 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -5,7 +5,6 @@ from datetime import timezone from typing import Literal -import duckdb import numpy as np import pandas as pd import polars as pl @@ -201,6 +200,7 @@ def test_pandas_fixed_offset_1302() -> None: def test_huge_int() -> None: + duckdb = pytest.importorskip("duckdb") df = pl.DataFrame({"a": [1, 2, 3]}) if POLARS_VERSION >= (1, 18): # pragma: no cover result = nw.from_native(df.select(pl.col("a").cast(pl.Int128))).schema @@ -227,6 +227,7 @@ def test_huge_int() -> None: @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow") def test_decimal() -> None: + duckdb = pytest.importorskip("duckdb") df = pl.DataFrame({"a": [1]}, schema={"a": pl.Decimal}) result = nw.from_native(df).schema assert result["a"] == nw.Decimal diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index e824c6b7f..a67c7973b 100644 --- a/tests/expr_and_series/over_test.py +++ 
b/tests/expr_and_series/over_test.py @@ -138,7 +138,7 @@ def test_over_cummin(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cumprod(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2", "cudf")): + if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2")): request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): diff --git a/tests/frame/drop_test.py b/tests/frame/drop_test.py index 2d6115c15..eb9bb2660 100644 --- a/tests/frame/drop_test.py +++ b/tests/frame/drop_test.py @@ -48,7 +48,7 @@ def test_drop_strict( *, strict: bool, ) -> None: - if "polars_lazy" in str(request) and POLARS_VERSION < (1, 0, 0) and strict: + if "polars_lazy" in str(constructor) and POLARS_VERSION < (1, 0, 0) and strict: request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6]} diff --git a/tests/frame/interchange_native_namespace_test.py b/tests/frame/interchange_native_namespace_test.py index 22d036460..865a675ad 100644 --- a/tests/frame/interchange_native_namespace_test.py +++ b/tests/frame/interchange_native_namespace_test.py @@ -1,6 +1,5 @@ from __future__ import annotations -import duckdb import polars as pl import pytest @@ -45,6 +44,7 @@ def test_ibis( def test_duckdb() -> None: + duckdb = pytest.importorskip("duckdb") df_pl = pl.DataFrame(data) # noqa: F841 rel = duckdb.sql("select * from df_pl") diff --git a/tests/frame/interchange_schema_test.py b/tests/frame/interchange_schema_test.py index e06a482db..a9482811f 100644 --- a/tests/frame/interchange_schema_test.py +++ b/tests/frame/interchange_schema_test.py @@ -4,7 +4,6 @@ from datetime import datetime from datetime import timedelta -import duckdb import polars as pl import pytest @@ -160,6 +159,7 @@ def test_interchange_schema_ibis( def test_interchange_schema_duckdb() -> None: + duckdb = pytest.importorskip("duckdb") df_pl = pl.DataFrame( # noqa: F841 { "a": [1, 1, 2], diff --git a/tests/frame/interchange_select_test.py b/tests/frame/interchange_select_test.py index b553af751..30d878808 100644 --- a/tests/frame/interchange_select_test.py +++ b/tests/frame/interchange_select_test.py @@ -2,7 +2,6 @@ from typing import Any -import duckdb import polars as pl import pytest @@ -61,6 +60,7 @@ def test_interchange_ibis( def test_interchange_duckdb() -> None: + duckdb = pytest.importorskip("duckdb") df_pl = pl.DataFrame(data) # noqa: F841 rel = duckdb.sql("select * from df_pl") df = nw.from_native(rel, eager_or_interchange_only=True) diff --git a/tests/frame/interchange_to_arrow_test.py b/tests/frame/interchange_to_arrow_test.py index d1ddd2a53..1f01b67c6 100644 --- a/tests/frame/interchange_to_arrow_test.py +++ b/tests/frame/interchange_to_arrow_test.py @@ -1,6 +1,5 @@ from __future__ import annotations -import duckdb import polars as pl import pyarrow as pa import pytest @@ -35,6 +34,7 @@ def test_interchange_ibis_to_arrow( def test_interchange_duckdb_to_arrow() -> None: + duckdb = pytest.importorskip("duckdb") df_pl = pl.DataFrame(data) # noqa: F841 rel = duckdb.sql("select * from df_pl") df = nw.from_native(rel, eager_or_interchange_only=True) diff --git a/tests/frame/interchange_to_pandas_test.py b/tests/frame/interchange_to_pandas_test.py index 938c23eaf..7761a6499 100644 --- a/tests/frame/interchange_to_pandas_test.py +++ b/tests/frame/interchange_to_pandas_test.py @@ -1,6 +1,5 @@ from __future__ import annotations -import duckdb import pandas as pd import 
pytest @@ -39,6 +38,7 @@ def test_interchange_ibis_to_pandas( def test_interchange_duckdb_to_pandas(request: pytest.FixtureRequest) -> None: + duckdb = pytest.importorskip("duckdb") if PANDAS_VERSION < (1, 0, 0): request.applymarker(pytest.mark.xfail) df_raw = pd.DataFrame(data) diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index c2fdec31a..9de397748 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -7,7 +7,6 @@ from typing import TYPE_CHECKING from typing import Any -import duckdb import pandas as pd import polars as pl import pytest @@ -215,6 +214,7 @@ def test_from_non_hashable_column_name() -> None: reason="too old for pyarrow types", ) def test_nested_dtypes() -> None: + duckdb = pytest.importorskip("duckdb") df = pl.DataFrame( {"a": [[1, 2]], "b": [[1, 2]], "c": [{"a": 1}]}, schema_overrides={"b": pl.Array(pl.Int64, 2)}, diff --git a/tests/group_by_test.py b/tests/group_by_test.py index f60df3690..188c17c76 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -75,12 +75,7 @@ def test_invalid_group_by() -> None: ) -def test_group_by_iter( - constructor_eager: ConstructorEager, request: pytest.FixtureRequest -) -> None: - if "cudf" in str(constructor_eager): - # https://github.com/rapidsai/cudf/issues/17650 - request.applymarker(pytest.mark.xfail) +def test_group_by_iter(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) expected_keys = [(1,), (3,)] keys = [] diff --git a/tests/hypothesis/getitem_test.py b/tests/hypothesis/getitem_test.py index f227bcd64..33c31f761 100644 --- a/tests/hypothesis/getitem_test.py +++ b/tests/hypothesis/getitem_test.py @@ -30,7 +30,9 @@ ], scope="module", ) -def constructor(request: pytest.FixtureRequest) -> Callable[[Any], IntoDataFrame]: +def pandas_or_pyarrow_constructor( + request: pytest.FixtureRequest, +) -> Callable[[Any], IntoDataFrame]: return request.param # type: ignore[no-any-return] @@ -140,7 +142,7 @@ def tuple_selector(draw: st.DrawFn) -> tuple[Any, Any]: ) # type: ignore[misc] @pytest.mark.slow def test_getitem( - constructor: Any, + pandas_or_pyarrow_constructor: Any, selector: Any, ) -> None: """Compare __getitem__ against polars.""" @@ -150,7 +152,7 @@ def test_getitem( # NotImplementedError: Slicing with step is not supported on PyArrow tables assume( not ( - constructor is pyarrow_table_constructor + pandas_or_pyarrow_constructor is pyarrow_table_constructor and isinstance(selector, slice) and selector.step is not None ) @@ -159,7 +161,7 @@ def test_getitem( # IndexError: Offset must be non-negative (pyarrow does not support negative indexing) assume( not ( - constructor is pyarrow_table_constructor + pandas_or_pyarrow_constructor is pyarrow_table_constructor and isinstance(selector, slice) and isinstance(selector.start, int) and selector.start < 0 @@ -167,7 +169,7 @@ def test_getitem( ) assume( not ( - constructor is pyarrow_table_constructor + pandas_or_pyarrow_constructor is pyarrow_table_constructor and isinstance(selector, slice) and isinstance(selector.stop, int) and selector.stop < 0 @@ -179,7 +181,7 @@ def test_getitem( # TypeError: Got unexpected argument type for compute function assume( not ( - constructor is pyarrow_table_constructor + pandas_or_pyarrow_constructor is pyarrow_table_constructor and isinstance(selector, tuple) and isinstance(selector[0], slice) and isinstance(selector[1], slice) @@ -194,7 +196,7 @@ def test_getitem( # ArrowNotImplementedError: Function 'array_take' has no kernel matching 
input types (int64, null) assume( not ( - constructor is pyarrow_table_constructor + pandas_or_pyarrow_constructor is pyarrow_table_constructor and isinstance(selector, tuple) and isinstance(selector[0], list) and len(selector[0]) == 0 @@ -205,7 +207,7 @@ def test_getitem( # df[[], "a":], df[[], :] etc return different results between pandas/polars: assume( not ( - constructor is pandas_constructor + pandas_or_pyarrow_constructor is pandas_constructor and isinstance(selector, tuple) and isinstance(selector[0], list) and len(selector[0]) == 0 @@ -236,7 +238,7 @@ def test_getitem( # rows/columns sides. return - df_other = nw.from_native(constructor(TEST_DATA)) + df_other = nw.from_native(pandas_or_pyarrow_constructor(TEST_DATA)) result_other = df_other[selector] if isinstance(result_polars, nw.Series): diff --git a/tpch/generate_data.py b/tpch/generate_data.py index d0a370a2a..31872a624 100644 --- a/tpch/generate_data.py +++ b/tpch/generate_data.py @@ -5,7 +5,6 @@ import duckdb import pyarrow as pa import pyarrow.parquet as pq -import tqdm if not Path("data").exists(): Path("data").mkdir() @@ -23,7 +22,7 @@ "region", "supplier", ] -for t in tqdm.tqdm(tables): +for t in tables: res = con.query("SELECT * FROM " + t) # noqa: S608 res_arrow = res.to_arrow_table() new_schema = [] From 1f9b59e272cb9b21b0dd6ede3de7793e665ea90f Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 1 Jan 2025 13:46:24 +0000 Subject: [PATCH 07/13] fix: remove `maintain_order` from LazyFrame.unique (#1687) --- narwhals/_arrow/dataframe.py | 2 +- narwhals/_dask/dataframe.py | 5 +-- narwhals/_spark_like/dataframe.py | 5 +-- narwhals/dataframe.py | 62 +++++++++++++++++-------------- tests/frame/unique_test.py | 25 +++++++++++-- tests/spark_like_test.py | 17 ++++----- 6 files changed, 68 insertions(+), 48 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index b51cba7c6..c2898f20f 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -655,7 +655,7 @@ def unique( subset: list[str] | None, *, keep: Literal["any", "first", "last", "none"], - maintain_order: bool, + maintain_order: bool = False, ) -> Self: # The param `maintain_order` is only here for compatibility with the Polars API # and has no effect on the output. diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 0edf41216..36d85bdba 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -200,11 +200,8 @@ def unique( self: Self, subset: list[str] | None, *, - keep: Literal["any", "first", "last", "none"] = "any", - maintain_order: bool = False, + keep: Literal["any", "none"] = "any", ) -> Self: - # The param `maintain_order` is only here for compatibility with the Polars API - # and has no effect on the output. native_frame = self._native_frame if keep == "none": subset = subset or self.columns diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index ba57da5bd..e57f9e1b3 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -203,11 +203,8 @@ def unique( self: Self, subset: str | list[str] | None = None, *, - keep: Literal["any", "first", "last", "none"], - maintain_order: bool, + keep: Literal["any", "none"], ) -> Self: - # The param `maintain_order` is only here for compatibility with the Polars API - # and has no effect on the output. if keep != "any": msg = "`LazyFrame.unique` with PySpark backend only supports `keep='any'`." 
raise ValueError(msg)
diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
index 088f6198d..b508ded18 100644
--- a/narwhals/dataframe.py
+++ b/narwhals/dataframe.py
@@ -146,21 +146,6 @@ def drop(self, *columns: Iterable[str], strict: bool) -> Self:
             self._compliant_frame.drop(columns, strict=strict)
         )
 
-    def unique(
-        self,
-        subset: str | list[str] | None = None,
-        *,
-        keep: Literal["any", "first", "last", "none"] = "any",
-        maintain_order: bool = False,
-    ) -> Self:
-        if isinstance(subset, str):
-            subset = [subset]
-        return self._from_compliant_dataframe(
-            self._compliant_frame.unique(
-                subset=subset, keep=keep, maintain_order=maintain_order
-            )
-        )
-
     def filter(
         self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
     ) -> Self:
@@ -2097,8 +2082,7 @@ def unique(
                 * 'first': Keep first unique row.
                 * 'last': Keep last unique row.
             maintain_order: Keep the same order as the original DataFrame. This may be more
-                expensive to compute. Settings this to `True` blocks the possibility
-                to run on the streaming engine for Polars.
+                expensive to compute.
 
         Returns:
             The dataframe with the duplicate rows removed.
@@ -2148,7 +2132,16 @@ def unique(
             bar: [["a"]]
             ham: [["b"]]
         """
-        return super().unique(subset, keep=keep, maintain_order=maintain_order)
+        if keep not in {"any", "none", "first", "last"}:
+            msg = f"Expected {'any', 'none', 'first', 'last'}, got: {keep}"
+            raise ValueError(msg)
+        if isinstance(subset, str):
+            subset = [subset]
+        return self._from_compliant_dataframe(
+            self._compliant_frame.unique(
+                subset=subset, keep=keep, maintain_order=maintain_order
+            )
+        )
 
     def filter(
         self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
@@ -4436,25 +4429,21 @@ def unique(
         self,
         subset: str | list[str] | None = None,
         *,
-        keep: Literal["any", "first", "last", "none"] = "any",
-        maintain_order: bool = False,
+        keep: Literal["any", "none"] = "any",
+        maintain_order: bool | None = None,
     ) -> Self:
         """Drop duplicate rows from this LazyFrame.
 
         Arguments:
             subset: Column name(s) to consider when identifying duplicate rows.
                 If set to `None`, use all columns.
-            keep: {'first', 'last', 'any', 'none'}
+            keep: {'any', 'none'}
                 Which of the duplicate rows to keep.
 
                 * 'any': Does not give any guarantee of which row is kept.
                     This allows more optimizations.
                 * 'none': Don't keep duplicate rows.
-                * 'first': Keep first unique row.
-                * 'last': Keep last unique row.
-            maintain_order: Keep the same order as the original DataFrame. This may be more
-                expensive to compute. Settings this to `True` blocks the possibility
-                to run on the streaming engine for Polars.
+            maintain_order: Has no effect and is kept around only for backwards-compatibility.
 
         Returns:
             LazyFrame: LazyFrame with unique rows.
@@ -4494,7 +4483,26 @@
             │ 1   ┆ a   ┆ b   │
             └─────┴─────┴─────┘
         """
-        return super().unique(subset, keep=keep, maintain_order=maintain_order)
+        if keep not in {"any", "none"}:
+            msg = (
+                "narwhals.LazyFrame makes no assumptions about row order, so only "
+                f"'any' and 'none' are supported for `keep` in `unique`. Got: {keep}."
+            )
+            raise ValueError(msg)
+        if maintain_order:
+            msg = "`maintain_order=True` is not supported for LazyFrame.unique."
+            raise ValueError(msg)
+        if maintain_order is not None:
+            msg = (
+                "`maintain_order` has no effect and is only kept around for backwards-compatibility. "
+                "You can safely remove this argument."
+ ) + warn(message=msg, category=UserWarning, stacklevel=find_stacklevel()) + if isinstance(subset, str): + subset = [subset] + return self._from_compliant_dataframe( + self._compliant_frame.unique(subset=subset, keep=keep) + ) def filter( self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any diff --git a/tests/frame/unique_test.py b/tests/frame/unique_test.py index c8079f593..96d5a8c2d 100644 --- a/tests/frame/unique_test.py +++ b/tests/frame/unique_test.py @@ -1,5 +1,8 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise +from typing import Any + import pytest import narwhals.stable.v1 as nw @@ -17,6 +20,7 @@ ("last", {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]}), ("any", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}), ("none", {"a": [2], "b": [6], "z": [9]}), + ("foo", {"a": [2], "b": [6], "z": [9]}), ], ) def test_unique( @@ -27,14 +31,29 @@ def test_unique( ) -> None: df_raw = constructor(data) df = nw.from_native(df_raw) + if isinstance(df, nw.LazyFrame) and keep in {"first", "last"}: + context: Any = pytest.raises(ValueError, match="row order") + elif keep == "foo": + context = pytest.raises(ValueError, match=": foo") + else: + context = does_not_raise() - result = df.unique(subset, keep=keep, maintain_order=True) # type: ignore[arg-type] - assert_equal_data(result, expected) + with context: + result = df.unique(subset, keep=keep).sort("z") # type: ignore[arg-type] + assert_equal_data(result, expected) +@pytest.mark.filterwarnings("ignore:.*backwards-compatibility:UserWarning") def test_unique_none(constructor: Constructor) -> None: df_raw = constructor(data) df = nw.from_native(df_raw) - result = df.unique(maintain_order=True) + result = df.unique(maintain_order=False).sort("z") assert_equal_data(result, data) + + if isinstance(df, nw.LazyFrame): + with pytest.raises(ValueError, match="not supported"): + result = df.unique(maintain_order=True).sort("z") + else: + result = df.unique(maintain_order=True) + assert_equal_data(result, data) diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index 3b9fa12f3..99682b8f7 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -548,35 +548,34 @@ def test_rename(pyspark_constructor: Constructor) -> None: ("none", {"a": [2], "b": [6], "z": [9]}), ], ) -@pytest.mark.filterwarnings("ignore:Argument `maintain_order=True` is unused") def test_unique( pyspark_constructor: Constructor, subset: str | list[str] | None, keep: str, expected: dict[str, list[float]], ) -> None: - context = ( - does_not_raise() - if keep == "any" - else pytest.raises( + if keep == "any": + context: Any = does_not_raise() + elif keep == "none": + context = pytest.raises( ValueError, match=r"`LazyFrame.unique` with PySpark backend only supports `keep='any'`.", ) - ) + else: + context = pytest.raises(ValueError, match=f": {keep}") with context: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(pyspark_constructor(data)) - result = df.unique(subset, keep=keep, maintain_order=True) # type: ignore[arg-type] + result = df.unique(subset, keep=keep).sort("z") # type: ignore[arg-type] assert_equal_data(result, expected) -@pytest.mark.filterwarnings("ignore:Argument `maintain_order=True` is unused") def test_unique_none(pyspark_constructor: Constructor) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(pyspark_constructor(data)) - result = df.unique(maintain_order=True) + result = df.unique().sort("z") assert_equal_data(result, 
data) From b628780c7905552bea80dea4510d53f55ea29493 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 1 Jan 2025 21:42:36 +0000 Subject: [PATCH 08/13] skip changelog(deps): bump astral-sh/setup-uv from 4 to 5 (#1701) Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 4 to 5. - [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/v4...v5) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/check_docs_build.yml | 2 +- .github/workflows/check_tpch_queries.yml | 2 +- .github/workflows/downstream_tests.yml | 14 +++++++------- .github/workflows/downstream_tests_slow.yml | 2 +- .github/workflows/extremes.yml | 8 ++++---- .github/workflows/pytest.yml | 6 +++--- .github/workflows/random_ci_pytest.yml | 2 +- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/check_docs_build.yml b/.github/workflows/check_docs_build.yml index c85fa35d9..42d243163 100644 --- a/.github/workflows/check_docs_build.yml +++ b/.github/workflows/check_docs_build.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 8924421f6..723fa6e80 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 2c5602258..548251ddc 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -66,7 +66,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -127,7 +127,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -164,7 +164,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -243,7 +243,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -283,7 +283,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" 
cache-suffix: ${{ matrix.python-version }} @@ -323,7 +323,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/downstream_tests_slow.yml b/.github/workflows/downstream_tests_slow.yml index e26598d82..45a93990a 100644 --- a/.github/workflows/downstream_tests_slow.yml +++ b/.github/workflows/downstream_tests_slow.yml @@ -21,7 +21,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 71b07d5f3..91563d840 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -19,7 +19,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -55,7 +55,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -93,7 +93,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -131,7 +131,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index b02b74c07..3f015d405 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -18,7 +18,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -43,7 +43,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} @@ -68,7 +68,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} diff --git a/.github/workflows/random_ci_pytest.yml b/.github/workflows/random_ci_pytest.yml index 67f416c21..4ec50da06 100644 --- a/.github/workflows/random_ci_pytest.yml +++ b/.github/workflows/random_ci_pytest.yml @@ -17,7 +17,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v4 + uses: astral-sh/setup-uv@v5 with: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} From 554517cd8b2b4c092ed025e8b194075f5348bab6 Mon Sep 17 00:00:00 2001 From: Lucas Nelson Date: Thu, 2 Jan 2025 04:07:48 -0600 Subject: [PATCH 09/13] feat: add `sum` and `sum_horizontal` for `SparkLike` (#1693) * feat: include sum method * feat: include sum_horizontal method * docs: update examples with PySpark * docs: remove pyspark references * feat: coalesce nulls to zero for addition * test: copy tests from expr_and_series suite * feat: handle 
NaN cases * chore: remove notebook * coalesce, sort in output * sumh sort, sum_expr constructor --------- Co-authored-by: FBruzzesi --- docs/installation.md | 4 +-- narwhals/_spark_like/expr.py | 8 +++++ narwhals/_spark_like/namespace.py | 27 +++++++++++++++++ tests/spark_like_test.py | 49 +++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 1acc11774..a406b6295 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -91,5 +91,5 @@ Let's learn about what you just did, and what Narwhals can do for you! !!! info - These examples are using pandas, Polars and PyArrow, however Narwhals supports - other dataframe libraries (See [supported libraries](extending.md)). + These examples are using pandas, Polars, and PyArrow, however Narwhals + supports other dataframe libraries (See [supported libraries](extending.md)). diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 3d09a2427..4887e8001 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -233,6 +233,14 @@ def _min(_input: Column) -> Column: return self._from_call(_min, "min", returns_scalar=True) + def sum(self) -> Self: + def _sum(_input: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + return F.sum(_input) + + return self._from_call(_sum, "sum", returns_scalar=True) + def std(self: Self, ddof: int) -> Self: from functools import partial diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index d150e7541..d34867b00 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -66,3 +66,30 @@ def col(self, *column_names: str) -> SparkLikeExpr: return SparkLikeExpr.from_column_names( *column_names, backend_version=self._backend_version, version=self._version ) + + def sum_horizontal(self, *exprs: IntoSparkLikeExpr) -> SparkLikeExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: SparkLikeLazyFrame) -> list[Column]: + import pyspark.sql.functions as F # noqa: N812 + + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [ + reduce( + operator.add, + (F.coalesce(col, F.lit(0)) for col in cols), + ).alias(col_name) + ] + + return SparkLikeExpr( # type: ignore[abstract] + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="sum_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index 99682b8f7..44335c6d4 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -297,6 +297,45 @@ def test_allh_all(pyspark_constructor: Constructor) -> None: assert_equal_data(result, expected) +# copied from tests/expr_and_series/sum_horizontal_test.py +@pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) +def test_sumh(pyspark_constructor: Constructor, col_expr: Any) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df = nw.from_native(pyspark_constructor(data)) + result = df.with_columns(horizontal_sum=nw.sum_horizontal(col_expr, nw.col("b"))) + expected = { + "a": [1, 3, 2], + "b": [4, 4, 6], + "z": [7.0, 8.0, 9.0], + "horizontal_sum": [5, 7, 8], + } + assert_equal_data(result, expected) + + +def test_sumh_nullable(pyspark_constructor: 
Constructor) -> None:
+    data = {"a": [1, 8, 3], "b": [4, 5, None], "idx": [0, 1, 2]}
+    expected = {"hsum": [5, 13, 3]}
+
+    df = nw.from_native(pyspark_constructor(data))
+    result = df.select("idx", hsum=nw.sum_horizontal("a", "b")).sort("idx").drop("idx")
+    assert_equal_data(result, expected)
+
+
+def test_sumh_all(pyspark_constructor: Constructor) -> None:
+    data = {"a": [1, 2, 3], "b": [10, 20, 30]}
+    df = nw.from_native(pyspark_constructor(data))
+    result = df.select(nw.sum_horizontal(nw.all()))
+    expected = {
+        "a": [11, 22, 33],
+    }
+    assert_equal_data(result, expected)
+    result = df.select(c=nw.sum_horizontal(nw.all()))
+    expected = {
+        "c": [11, 22, 33],
+    }
+    assert_equal_data(result, expected)
+
+
 # copied from tests/expr_and_series/count_test.py
 def test_count(pyspark_constructor: Constructor) -> None:
     data = {"a": [1, 3, 2], "b": [4, None, 6], "z": [7.0, None, None]}
@@ -347,6 +386,16 @@ def test_expr_min_expr(pyspark_constructor: Constructor) -> None:
     assert_equal_data(result, expected)
 
 
+# copied from tests/expr_and_series/sum_test.py
+@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").sum(), nw.sum("a", "b", "z")])
+def test_expr_sum_expr(pyspark_constructor: Constructor, expr: nw.Expr) -> None:
+    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
+    df = nw.from_native(pyspark_constructor(data))
+    result = df.select(expr)
+    expected = {"a": [6], "b": [14], "z": [24.0]}
+    assert_equal_data(result, expected)
+
+
 # copied from tests/expr_and_series/std_test.py
 def test_std(pyspark_constructor: Constructor) -> None:
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}

From f58fc4377f1ba91987d1847d9d302c0581fd980f Mon Sep 17 00:00:00 2001
From: Marco Edward Gorelli
Date: Thu, 2 Jan 2025 10:25:04 +0000
Subject: [PATCH 10/13] feat: show native object (if possible) in repr (#1702)

---
 narwhals/dataframe.py | 38 +++++------------
 narwhals/series.py | 14 +------
 narwhals/stable/v1/__init__.py | 11 ++---
 narwhals/utils.py | 33 +++++++++++++++
 tests/repr_test.py | 73 ++++++++++++++++++++++++++++++++++
 5 files changed, 123 insertions(+), 46 deletions(-)
 create mode 100644 tests/repr_test.py

diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
index b508ded18..33aa35a22 100644
--- a/narwhals/dataframe.py
+++ b/narwhals/dataframe.py
@@ -19,6 +19,7 @@
 from narwhals.translate import to_native
 from narwhals.utils import find_stacklevel
 from narwhals.utils import flatten
+from narwhals.utils import generate_repr
 from narwhals.utils import is_sequence_but_not_str
 from narwhals.utils import parse_version
 
@@ -414,18 +415,7 @@ def __array__(self, dtype: Any = None, copy: bool | None = None) -> np.ndarray:
         return self._compliant_frame.__array__(dtype, copy=copy)
 
     def __repr__(self) -> str:  # pragma: no cover
-        header = " Narwhals DataFrame "
-        length = len(header)
-        return (
-            "┌"
-            + "─" * length
-            + "┐\n"
-            + f"|{header}|\n"
-            + "| Use `.to_native` to see native output |\n"
-            + "└"
-            + "─" * length
-            + "┘"
-        )
+        return generate_repr("Narwhals DataFrame", self.to_native().__repr__())
 
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         """Export a DataFrame via the Arrow PyCapsule Interface.
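
As an aside (not part of the patch itself): a minimal sketch of what the new `generate_repr`-based output looks like for a small pandas-backed frame. The exact expected strings are pinned down by `tests/repr_test.py`, added later in this commit.

```python
# Illustration only: behaviour of the repr introduced in this commit.
import pandas as pd

import narwhals as nw

df = nw.from_native(pd.DataFrame({"a": [1, 2, 3], "b": ["fdaf", "fda", "cf"]}))

# When the native repr fits within the terminal width, it is boxed under a
# "Narwhals DataFrame" header with a dashed separator; otherwise the repr
# falls back to the old "Use `.to_native` to see native output" placeholder.
print(repr(df))
```
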
@@ -3581,18 +3571,7 @@ def __init__(
             raise AssertionError(msg)
 
     def __repr__(self) -> str:  # pragma: no cover
-        header = " Narwhals LazyFrame "
-        length = len(header)
-        return (
-            "┌"
-            + "─" * length
-            + "┐\n"
-            + f"|{header}|\n"
-            + "| Use `.to_native` to see native output |\n"
-            + "└"
-            + "─" * length
-            + "┘"
-        )
+        return generate_repr("Narwhals LazyFrame", self.to_native().__repr__())
 
     @property
     def implementation(self) -> Implementation:
@@ -3640,11 +3619,12 @@ def collect(self) -> DataFrame[Any]:
         ...     }
         ... )
         >>> lf = nw.from_native(lf_pl)
-        >>> lf
-        ┌───────────────────────────────────────┐
-        | Narwhals LazyFrame |
-        | Use `.to_native` to see native output |
-        └───────────────────────────────────────┘
+        >>> lf  # doctest:+ELLIPSIS
+        ┌─────────────────────────────┐
+        | Narwhals LazyFrame |
+        |-----------------------------|
+        |<LazyFrame at ...
         >>> df = lf.group_by("a").agg(nw.all().sum()).collect()
         >>> df.to_native().sort("a")
         shape: (3, 3)
diff --git a/narwhals/series.py b/narwhals/series.py
index 8f15ff0ce..de0e64396 100644
--- a/narwhals/series.py
+++ b/narwhals/series.py
@@ -15,6 +15,7 @@
 from narwhals.dtypes import _validate_dtype
 from narwhals.typing import IntoSeriesT
 from narwhals.utils import _validate_rolling_arguments
+from narwhals.utils import generate_repr
 from narwhals.utils import parse_version
 
 if TYPE_CHECKING:
@@ -404,18 +405,7 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se
         return function(self, *args, **kwargs)
 
     def __repr__(self) -> str:  # pragma: no cover
-        header = " Narwhals Series "
-        length = len(header)
-        return (
-            "┌"
-            + "─" * length
-            + "┐\n"
-            + f"|{header}|\n"
-            + "| Use `.to_native()` to see native output |\n"
-            + "└"
-            + "─" * length
-            + "┘"
-        )
+        return generate_repr("Narwhals Series", self.to_native().__repr__())
 
     def __len__(self) -> int:
         return len(self._compliant_series)
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py
index f7705713f..22afc687d 100644
--- a/narwhals/stable/v1/__init__.py
+++ b/narwhals/stable/v1/__init__.py
@@ -430,11 +430,12 @@ def collect(self) -> DataFrame[Any]:
         ...     }
         ... )
         >>> lf = nw.from_native(lf_pl)
-        >>> lf
-        ┌───────────────────────────────────────┐
-        | Narwhals LazyFrame |
-        | Use `.to_native` to see native output |
-        └───────────────────────────────────────┘
+        >>> lf  # doctest:+ELLIPSIS
+        ┌─────────────────────────────┐
+        | Narwhals LazyFrame |
+        |-----------------------------|
+        |<LazyFrame at ...
         >>> df = lf.group_by("a").agg(nw.all().sum()).collect()
         >>> df.to_native().sort("a")
         shape: (3, 3)
diff --git a/narwhals/utils.py b/narwhals/utils.py
index b6337cb8e..2125d46c4 100644
--- a/narwhals/utils.py
+++ b/narwhals/utils.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import re
 from enum import Enum
 from enum import auto
@@ -960,3 +961,35 @@ def _validate_rolling_arguments(
         min_periods = window_size
 
     return window_size, min_periods
+
+
+def generate_repr(header: str, native_repr: str) -> str:
+    try:
+        terminal_width = os.get_terminal_size().columns
+    except OSError:
+        terminal_width = 80
+    native_lines = native_repr.splitlines()
+    max_native_width = max(len(line) for line in native_lines)
+
+    if max_native_width + 2 < terminal_width:
+        length = max(max_native_width, len(header))
+        output = f"┌{'─'*length}┐\n"
+        header_extra = length - len(header)
+        output += (
+            f"|{' '*(header_extra//2)}{header}{' '*(header_extra//2 + header_extra%2)}|\n"
+        )
+        output += f"|{'-'*(length)}|\n"
+        start_extra = (length - max_native_width) // 2
+        end_extra = (length - max_native_width) // 2 + (length - max_native_width) % 2
+        for line in native_lines:
+            output += f"|{' '*(start_extra)}{line}{' '*(end_extra + max_native_width - len(line))}|\n"
+        output += f"└{'─' * length}┘"
+        return output
+
+    diff = 39 - len(header)
+    return (
+        f"┌{'─' * (39)}┐\n"
+        f"|{' '*(diff//2)}{header}{' '*(diff//2+diff%2)}|\n"
+        "| Use `.to_native` to see native output |\n└"
+        f"{'─' * 39}┘"
+    )
diff --git a/tests/repr_test.py b/tests/repr_test.py
new file mode 100644
index 000000000..40cd51dca
--- /dev/null
+++ b/tests/repr_test.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+
+import narwhals.stable.v1 as nw
+
+
+def test_repr() -> None:
+    duckdb = pytest.importorskip("duckdb")
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["fdaf", "fda", "cf"]})
+    result = nw.from_native(df).__repr__()
+    expected = (
+        "┌──────────────────┐\n"
+        "|Narwhals DataFrame|\n"
+        "|------------------|\n"
+        "| a b |\n"
+        "| 0 1 fdaf |\n"
+        "| 1 2 fda |\n"
+        "| 2 3 cf |\n"
+        "└──────────────────┘"
+    )
+    assert result == expected
+    result = nw.from_native(df).lazy().__repr__()
+    expected = (
+        "┌──────────────────┐\n"
+        "|Narwhals LazyFrame|\n"
+        "|------------------|\n"
+        "| a b |\n"
+        "| 0 1 fdaf |\n"
+        "| 1 2 fda |\n"
+        "| 2 3 cf |\n"
+        "└──────────────────┘"
+    )
+    assert result == expected
+    result = nw.from_native(df)["a"].__repr__()
+    expected = (
+        "┌─────────────────────┐\n"
+        "| Narwhals Series |\n"
+        "|---------------------|\n"
+        "|0 1 |\n"
+        "|1 2 |\n"
+        "|2 3 |\n"
+        "|Name: a, dtype: int64|\n"
+        "└─────────────────────┘"
+    )
+    assert result == expected
+    result = nw.from_native(duckdb.table("df")).__repr__()
+    expected = (
+        "┌───────────────────┐\n"
+        "|Narwhals DataFrame |\n"
+        "|-------------------|\n"
+        "|┌───────┬─────────┐|\n"
+        "|│ a │ b │|\n"
+        "|│ int64 │ varchar │|\n"
+        "|├───────┼─────────┤|\n"
+        "|│ 1 │ fdaf │|\n"
+        "|│ 2 │ fda │|\n"
+        "|│ 3 │ cf │|\n"
+        "|└───────┴─────────┘|\n"
+        "└───────────────────┘"
+    )
+    assert result == expected
+    # Make something wider than the terminal size
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["fdaf" * 100, "fda", "cf"]})
"cf"]}) + result = nw.from_native(duckdb.table("df")).__repr__() + expected = ( + "┌───────────────────────────────────────┐\n" + "| Narwhals DataFrame |\n" + "| Use `.to_native` to see native output |\n" + "└───────────────────────────────────────┘" + ) + assert result == expected From a03b3ec1ff985f49981a250151133a955b43b5b7 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 2 Jan 2025 10:26:05 +0000 Subject: [PATCH 11/13] test: allow to run tests for Polars[gpu] (#1698) --- CONTRIBUTING.md | 8 +++++--- tests/expr_and_series/sample_test.py | 18 +++++++----------- tests/expr_and_series/str/to_datetime_test.py | 5 +---- tests/group_by_test.py | 7 +------ tests/read_scan_test.py | 8 ++++---- tests/tpch_q1_test.py | 6 ++---- tests/utils.py | 12 ++++++++++-- 7 files changed, 30 insertions(+), 34 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b61ed663b..0f8a6eb0b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -133,9 +133,11 @@ If you add code that should be tested, please add tests. - To run unit tests and doctests at the same time, run `pytest tests narwhals --cov=narwhals --doctest-modules` - To run tests multiprocessed, you may also want to use [pytest-xdist](https://github.com/pytest-dev/pytest-xdist) (optional) - To choose which backends to run tests with you, you can use the `--constructors` flag: - - to only run tests for pandas, Polars, and PyArrow, use `pytest --constructors=pandas,pyarrow,polars` - - to run tests for all CPU constructors, use `pytest --all-cpu-constructors` - - by default, tests run for pandas, pandas (PyArrow dtypes), PyArrow, and Polars. + - To only run tests for pandas, Polars, and PyArrow, use `pytest --constructors=pandas,pyarrow,polars` + - To run tests for all CPU constructors, use `pytest --all-cpu-constructors` + - By default, tests run for pandas, pandas (PyArrow dtypes), PyArrow, and Polars. + - To run tests using `cudf.pandas`, run `NARWHALS_DEFAULT_CONSTRUCTORS=pandas python -m cudf.pandas -m pytest` + - To run tests using `polars[gpu]`, run `NARWHALS_POLARS_GPU=1 pytest --constructors=polars[lazy]` If you want to have less surprises when opening a PR, you can take advantage of [nox](https://nox.thea.codes/en/stable/index.html) to run the entire CI/CD test suite locally in your operating system. 
diff --git a/tests/expr_and_series/sample_test.py b/tests/expr_and_series/sample_test.py index e8985e561..009acc3c9 100644 --- a/tests/expr_and_series/sample_test.py +++ b/tests/expr_and_series/sample_test.py @@ -46,17 +46,13 @@ def test_sample_with_seed( size, n = 100, 10 df = nw.from_native(constructor({"a": list(range(size))})).lazy() expected = {"res1": [True], "res2": [False]} - result = ( - df.select( - seed1=nw.col("a").sample(n=n, seed=123), - seed2=nw.col("a").sample(n=n, seed=123), - seed3=nw.col("a").sample(n=n, seed=42), - ) - .select( - res1=(nw.col("seed1") == nw.col("seed2")).all(), - res2=(nw.col("seed1") == nw.col("seed3")).all(), - ) - .collect() + result = df.select( + seed1=nw.col("a").sample(n=n, seed=123), + seed2=nw.col("a").sample(n=n, seed=123), + seed3=nw.col("a").sample(n=n, seed=42), + ).select( + res1=(nw.col("seed1") == nw.col("seed2")).all(), + res2=(nw.col("seed1") == nw.col("seed3")).all(), ) assert_equal_data(result, expected) diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 8bab09559..388ef23db 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -130,10 +130,7 @@ def test_to_datetime_infer_fmt_from_date(constructor: Constructor) -> None: data = {"z": ["2020-01-01", "2020-01-02", None]} expected = [datetime(2020, 1, 1), datetime(2020, 1, 2), None] result = ( - nw.from_native(constructor(data)) - .lazy() - .select(nw.col("z").str.to_datetime()) - .collect() + nw.from_native(constructor(data)).lazy().select(nw.col("z").str.to_datetime()) ) assert_equal_data(result, {"z": expected}) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 188c17c76..a0a7bee41 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -31,9 +31,7 @@ def test_group_by_complex() -> None: assert_equal_data(result, expected) lf = nw.from_native(df_lazy).lazy() - result = nw.to_native( - lf.group_by("a").agg((nw.col("b") - nw.col("c").mean()).mean()).sort("a") - ) + result = lf.group_by("a").agg((nw.col("b") - nw.col("c").mean()).mean()).sort("a") assert_equal_data(result, expected) @@ -220,7 +218,6 @@ def test_group_by_simple_named(constructor: Constructor) -> None: b_min=nw.col("b").min(), b_max=nw.col("b").max(), ) - .collect() .sort("a") ) expected = { @@ -240,7 +237,6 @@ def test_group_by_simple_unnamed(constructor: Constructor) -> None: nw.col("b").min(), nw.col("c").max(), ) - .collect() .sort("a") ) expected = { @@ -260,7 +256,6 @@ def test_group_by_multiple_keys(constructor: Constructor) -> None: c_min=nw.col("c").min(), c_max=nw.col("c").max(), ) - .collect() .sort("a") ) expected = { diff --git a/tests/read_scan_test.py b/tests/read_scan_test.py index 89bbcdce8..dbb2cf624 100644 --- a/tests/read_scan_test.py +++ b/tests/read_scan_test.py @@ -60,7 +60,7 @@ def test_scan_csv( df = nw.from_native(constructor(data)) native_namespace = nw.get_native_namespace(df) result = nw.scan_csv(filepath, native_namespace=native_namespace) - assert_equal_data(result.collect(), data) + assert_equal_data(result, data) assert isinstance(result, nw.LazyFrame) @@ -74,7 +74,7 @@ def test_scan_csv_v1( df = nw_v1.from_native(constructor(data)) native_namespace = nw_v1.get_native_namespace(df) result = nw_v1.scan_csv(filepath, native_namespace=native_namespace) - assert_equal_data(result.collect(), data) + assert_equal_data(result, data) assert isinstance(result, nw_v1.LazyFrame) @@ -136,7 +136,7 @@ def test_scan_parquet( df = 
nw.from_native(constructor(data)) native_namespace = nw.get_native_namespace(df) result = nw.scan_parquet(filepath, native_namespace=native_namespace) - assert_equal_data(result.collect(), data) + assert_equal_data(result, data) assert isinstance(result, nw.LazyFrame) @@ -151,7 +151,7 @@ def test_scan_parquet_v1( df = nw_v1.from_native(constructor(data)) native_namespace = nw_v1.get_native_namespace(df) result = nw_v1.scan_parquet(filepath, native_namespace=native_namespace) - assert_equal_data(result.collect(), data) + assert_equal_data(result, data) assert isinstance(result, nw_v1.LazyFrame) diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py index 3d762cbb9..fd2a7d24c 100644 --- a/tests/tpch_q1_test.py +++ b/tests/tpch_q1_test.py @@ -66,7 +66,6 @@ def test_q1(library: str, request: pytest.FixtureRequest) -> None: ) .sort(["l_returnflag", "l_linestatus"]) ) - result = query_result.collect().to_dict(as_series=False) expected = { "l_returnflag": ["A", "N", "N", "R"], "l_linestatus": ["F", "F", "O", "F"], @@ -89,7 +88,7 @@ def test_q1(library: str, request: pytest.FixtureRequest) -> None: "avg_disc": [0.05039473684210526, 0.02, 0.05537414965986395, 0.04507042253521127], "count_order": [76, 1, 147, 71], } - assert_equal_data(result, expected) + assert_equal_data(query_result, expected) @pytest.mark.parametrize( @@ -193,7 +192,6 @@ def test_q1_w_pandas_agg_generic_path() -> None: ) .sort(["l_returnflag", "l_linestatus"]) ) - result = query_result.collect().to_dict(as_series=False) expected = { "l_returnflag": ["A", "N", "N", "R"], "l_linestatus": ["F", "F", "O", "F"], @@ -216,4 +214,4 @@ def test_q1_w_pandas_agg_generic_path() -> None: "avg_disc": [0.05039473684210526, 0.02, 0.05537414965986395, 0.04507042253521127], "count_order": [76, 1, 147, 71], } - assert_equal_data(result, expected) + assert_equal_data(query_result, expected) diff --git a/tests/utils.py b/tests/utils.py index 60933046b..8ad8ee03e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import math +import os import sys import warnings from typing import Any @@ -69,10 +70,17 @@ def _sort_dict_by_key( def assert_equal_data(result: Any, expected: dict[str, Any]) -> None: is_pyspark = ( hasattr(result, "_compliant_frame") - and result._compliant_frame._implementation is Implementation.PYSPARK + and result.implementation is Implementation.PYSPARK ) + if hasattr(result, "collect"): - result = result.collect() + if result.implementation is Implementation.POLARS and os.environ.get( + "NARWHALS_POLARS_GPU", False + ): # pragma: no cover + result = result.to_native().collect(engine="gpu") + else: + result = result.collect() + if hasattr(result, "columns"): for key in result.columns: assert key in expected, (key, expected) From 9c3aa53242a32160021f8b6f4d58ca6413f7ff12 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 2 Jan 2025 10:26:30 +0000 Subject: [PATCH 12/13] chore: use `None` instead of `float('nan')` to check for null values in tests (#1697) --- tests/expr_and_series/fill_null_test.py | 8 ++-- tests/expr_and_series/max_horizontal_test.py | 2 +- tests/expr_and_series/mean_horizontal_test.py | 2 +- tests/expr_and_series/min_horizontal_test.py | 2 +- tests/expr_and_series/rolling_mean_test.py | 13 ++++--- tests/expr_and_series/rolling_std_test.py | 31 +++++++++++---- tests/expr_and_series/rolling_sum_test.py | 12 +++--- tests/expr_and_series/rolling_var_test.py | 8 ++-- tests/expr_and_series/skew_test.py | 4 +- tests/expr_and_series/unary_test.py | 38 
++++++++++--------- tests/expr_and_series/when_test.py | 11 +++--- tests/frame/drop_nulls_test.py | 4 +- tests/frame/join_test.py | 18 ++++----- tests/frame/pivot_test.py | 4 +- tests/frame/sort_test.py | 4 +- tests/group_by_test.py | 2 +- tests/hypothesis/join_test.py | 4 +- tests/spark_like_test.py | 16 ++++---- tests/utils.py | 8 +++- 19 files changed, 107 insertions(+), 84 deletions(-) diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 32e4b9cdd..57f767d4d 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -136,7 +136,7 @@ def test_fill_null_limits(constructor: Constructor) -> None: nw.col("a", "b").fill_null(strategy="forward", limit=2) ) expected_forward = { - "a": [1, 1, 1, float("nan"), 5, 6, 6, 6, float("nan"), 10], + "a": [1, 1, 1, None, 5, 6, 6, 6, None, 10], "b": ["a", "a", "a", None, "b", "c", "c", "c", None, "d"], } assert_equal_data(result_forward, expected_forward) @@ -146,7 +146,7 @@ def test_fill_null_limits(constructor: Constructor) -> None: ) expected_backward = { - "a": [1, float("nan"), 5, 5, 5, 6, float("nan"), 10, 10, 10], + "a": [1, None, 5, 5, 5, 6, None, 10, 10, 10], "b": ["a", None, "b", "b", "b", "c", None, "d", "d", "d"], } assert_equal_data(result_backward, expected_backward) @@ -203,7 +203,7 @@ def test_fill_null_series_limits(constructor_eager: ConstructorEager) -> None: "ignore", message="The 'downcast' keyword in fillna is deprecated" ) expected_forward = { - "a_forward": [0.0, 1, 1, float("nan"), 2, 2, float("nan"), 3], + "a_forward": [0.0, 1, 1, None, 2, 2, None, 3], "b_forward": ["", "a", "a", None, "c", "c", None, "e"], } result_forward = df.select( @@ -214,7 +214,7 @@ def test_fill_null_series_limits(constructor_eager: ConstructorEager) -> None: assert_equal_data(result_forward, expected_forward) expected_backward = { - "a_backward": [0.0, 1, float("nan"), 2, 2, float("nan"), 3, 3], + "a_backward": [0.0, 1, None, 2, 2, None, 3, 3], "b_backward": ["", "a", None, "c", "c", None, "e", "e"], } diff --git a/tests/expr_and_series/max_horizontal_test.py b/tests/expr_and_series/max_horizontal_test.py index 3becb36be..c86e11318 100644 --- a/tests/expr_and_series/max_horizontal_test.py +++ b/tests/expr_and_series/max_horizontal_test.py @@ -9,7 +9,7 @@ from tests.utils import assert_equal_data data = {"a": [1, 3, None, None], "b": [4, None, 6, None], "z": [3, 1, None, None]} -expected_values = [4, 3, 6, float("nan")] +expected_values = [4, 3, 6, None] @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) diff --git a/tests/expr_and_series/mean_horizontal_test.py b/tests/expr_and_series/mean_horizontal_test.py index 31b4b2109..485bf1750 100644 --- a/tests/expr_and_series/mean_horizontal_test.py +++ b/tests/expr_and_series/mean_horizontal_test.py @@ -14,7 +14,7 @@ def test_meanh(constructor: Constructor, col_expr: Any) -> None: data = {"a": [1, 3, None, None], "b": [4, None, 6, None]} df = nw.from_native(constructor(data)) result = df.select(horizontal_mean=nw.mean_horizontal(col_expr, nw.col("b"))) - expected = {"horizontal_mean": [2.5, 3.0, 6.0, float("nan")]} + expected = {"horizontal_mean": [2.5, 3.0, 6.0, None]} assert_equal_data(result, expected) diff --git a/tests/expr_and_series/min_horizontal_test.py b/tests/expr_and_series/min_horizontal_test.py index 5fb7fce97..787e3e2a4 100644 --- a/tests/expr_and_series/min_horizontal_test.py +++ b/tests/expr_and_series/min_horizontal_test.py @@ -9,7 +9,7 @@ from tests.utils import assert_equal_data data = {"a": 
[1, 3, None, None], "b": [4, None, 6, None], "z": [3, 1, None, None]} -expected_values = [1, 1, 6, float("nan")] +expected_values = [1, 1, 6, None] @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) diff --git a/tests/expr_and_series/rolling_mean_test.py b/tests/expr_and_series/rolling_mean_test.py index 33c817bf3..a6dd41935 100644 --- a/tests/expr_and_series/rolling_mean_test.py +++ b/tests/expr_and_series/rolling_mean_test.py @@ -1,6 +1,7 @@ from __future__ import annotations import random +from typing import Any import hypothesis.strategies as st import pandas as pd @@ -16,15 +17,15 @@ data = {"a": [None, 1, 2, None, 4, 6, 11]} -kwargs_and_expected = { - "x1": {"kwargs": {"window_size": 3}, "expected": [float("nan")] * 6 + [7.0]}, +kwargs_and_expected: dict[str, dict[str, Any]] = { + "x1": {"kwargs": {"window_size": 3}, "expected": [None] * 6 + [7.0]}, "x2": { "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": [float("nan"), 1.0, 1.5, 1.5, 3.0, 5.0, 7.0], + "expected": [None, 1.0, 1.5, 1.5, 3.0, 5.0, 7.0], }, "x3": { "kwargs": {"window_size": 2, "min_periods": 1}, - "expected": [float("nan"), 1.0, 1.5, 2.0, 4.0, 5.0, 8.5], + "expected": [None, 1.0, 1.5, 2.0, 4.0, 5.0, 8.5], }, "x4": { "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, @@ -52,7 +53,7 @@ def test_rolling_mean_expr( df = nw.from_native(constructor(data)) result = df.select( **{ - name: nw.col("a").rolling_mean(**values["kwargs"]) # type: ignore[arg-type] + name: nw.col("a").rolling_mean(**values["kwargs"]) for name, values in kwargs_and_expected.items() } ) @@ -69,7 +70,7 @@ def test_rolling_mean_series(constructor_eager: ConstructorEager) -> None: result = df.select( **{ - name: df["a"].rolling_mean(**values["kwargs"]) # type: ignore[arg-type] + name: df["a"].rolling_mean(**values["kwargs"]) for name, values in kwargs_and_expected.items() } ) diff --git a/tests/expr_and_series/rolling_std_test.py b/tests/expr_and_series/rolling_std_test.py index 3fdba9493..b937f8430 100644 --- a/tests/expr_and_series/rolling_std_test.py +++ b/tests/expr_and_series/rolling_std_test.py @@ -1,8 +1,8 @@ from __future__ import annotations +from math import sqrt from typing import Any -import numpy as np import pytest import narwhals.stable.v1 as nw @@ -17,32 +17,49 @@ { "name": "x1", "kwargs": {"window_size": 3}, - "expected": np.sqrt([float("nan"), float("nan"), 1 / 3, 1, 4 / 3, 7 / 3, 3]), + "expected": [ + sqrt(x) if x is not None else x + for x in [None, None, 1 / 3, 1, 4 / 3, 7 / 3, 3] + ], }, { "name": "x2", "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": np.sqrt([float("nan"), 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3]), + "expected": [ + sqrt(x) if x is not None else x + for x in [None, 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3] + ], }, { "name": "x3", "kwargs": {"window_size": 2, "min_periods": 1}, - "expected": np.sqrt([float("nan"), 0.5, 0.5, 2.0, 2.0, 4.5, 4.5]), + "expected": [ + sqrt(x) if x is not None else x for x in [None, 0.5, 0.5, 2.0, 2.0, 4.5, 4.5] + ], }, { "name": "x4", "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, - "expected": np.sqrt([1 / 3, 11 / 12, 4 / 5, 17 / 10, 2.0, 2.25, 3]), + "expected": [ + sqrt(x) if x is not None else x + for x in [1 / 3, 11 / 12, 4 / 5, 17 / 10, 2.0, 2.25, 3] + ], }, { "name": "x5", "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, - "expected": np.sqrt([0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3]), + "expected": [ + sqrt(x) if x is not None else x + for x in [0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3] + ], }, { "name": "x6", 
"kwargs": {"window_size": 3, "ddof": 2}, - "expected": np.sqrt([float("nan"), float("nan"), 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0]), + "expected": [ + sqrt(x) if x is not None else x + for x in [None, None, 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0] + ], }, ) diff --git a/tests/expr_and_series/rolling_sum_test.py b/tests/expr_and_series/rolling_sum_test.py index fae22552b..8c4537e49 100644 --- a/tests/expr_and_series/rolling_sum_test.py +++ b/tests/expr_and_series/rolling_sum_test.py @@ -18,15 +18,15 @@ data = {"a": [None, 1, 2, None, 4, 6, 11]} -kwargs_and_expected = { - "x1": {"kwargs": {"window_size": 3}, "expected": [float("nan")] * 6 + [21]}, +kwargs_and_expected: dict[str, dict[str, Any]] = { + "x1": {"kwargs": {"window_size": 3}, "expected": [None] * 6 + [21]}, "x2": { "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": [float("nan"), 1.0, 3.0, 3.0, 6.0, 10.0, 21.0], + "expected": [None, 1.0, 3.0, 3.0, 6.0, 10.0, 21.0], }, "x3": { "kwargs": {"window_size": 2, "min_periods": 1}, - "expected": [float("nan"), 1.0, 3.0, 2.0, 4.0, 10.0, 17.0], + "expected": [None, 1.0, 3.0, 2.0, 4.0, 10.0, 17.0], }, "x4": { "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, @@ -54,7 +54,7 @@ def test_rolling_sum_expr( df = nw.from_native(constructor(data)) result = df.select( **{ - name: nw.col("a").rolling_sum(**values["kwargs"]) # type: ignore[arg-type] + name: nw.col("a").rolling_sum(**values["kwargs"]) for name, values in kwargs_and_expected.items() } ) @@ -71,7 +71,7 @@ def test_rolling_sum_series(constructor_eager: ConstructorEager) -> None: result = df.select( **{ - name: df["a"].rolling_sum(**values["kwargs"]) # type: ignore[arg-type] + name: df["a"].rolling_sum(**values["kwargs"]) for name, values in kwargs_and_expected.items() } ) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 32767c990..37475e76a 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -23,17 +23,17 @@ { "name": "x1", "kwargs": {"window_size": 3}, - "expected": [float("nan"), float("nan"), 1 / 3, 1, 4 / 3, 7 / 3, 3], + "expected": [None, None, 1 / 3, 1, 4 / 3, 7 / 3, 3], }, { "name": "x2", "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": [float("nan"), 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3], + "expected": [None, 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3], }, { "name": "x3", "kwargs": {"window_size": 2, "min_periods": 1}, - "expected": [float("nan"), 0.5, 0.5, 2.0, 2.0, 4.5, 4.5], + "expected": [None, 0.5, 0.5, 2.0, 2.0, 4.5, 4.5], }, { "name": "x4", @@ -48,7 +48,7 @@ { "name": "x6", "kwargs": {"window_size": 3, "ddof": 2}, - "expected": [float("nan"), float("nan"), 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0], + "expected": [None, None, 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0], }, ) diff --git a/tests/expr_and_series/skew_test.py b/tests/expr_and_series/skew_test.py index b2029d08e..849496807 100644 --- a/tests/expr_and_series/skew_test.py +++ b/tests/expr_and_series/skew_test.py @@ -13,9 +13,9 @@ ("data", "expected"), [ ([], None), - ([1], float("nan")), + ([1], None), ([1, 2], 0.0), - ([0.0, 0.0, 0.0], float("nan")), + ([0.0, 0.0, 0.0], None), ([1, 2, 3, 2, 1], 0.343622), ], ) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 3a580b726..f2f9c33ff 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -93,7 +93,7 @@ def test_unary_two_elements(constructor: Constructor) -> None: "b_nunique": [2], "b_skew": [0.0], "c_nunique": [2], - "c_skew": [float("nan")], 
+ "c_skew": [None], } assert_equal_data(result, expected) @@ -115,21 +115,23 @@ def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: "b_nunique": [2], "b_skew": [0.0], "c_nunique": [2], - "c_skew": [float("nan")], + "c_skew": [None], } assert_equal_data(result, expected) def test_unary_one_element(constructor: Constructor) -> None: - data = {"a": [1], "b": [2], "c": [float("nan")]} + data = {"a": [1], "b": [2], "c": [None]} # Dask runs into a divide by zero RuntimeWarning for 1 element skew. context = ( pytest.warns(RuntimeWarning, match="invalid value encountered in scalar divide") if "dask" in str(constructor) else does_not_raise() ) - with context: - result = nw.from_native(constructor(data)).select( + result = ( + nw.from_native(constructor(data)) + .with_columns(nw.col("c").cast(nw.Float64)) + .select( a_nunique=nw.col("a").n_unique(), a_skew=nw.col("a").skew(), b_nunique=nw.col("b").n_unique(), @@ -137,19 +139,21 @@ def test_unary_one_element(constructor: Constructor) -> None: c_nunique=nw.col("c").n_unique(), c_skew=nw.col("c").skew(), ) - expected = { - "a_nunique": [1], - "a_skew": [float("nan")], - "b_nunique": [1], - "b_skew": [float("nan")], - "c_nunique": [1], - "c_skew": [float("nan")], - } + ) + expected = { + "a_nunique": [1], + "a_skew": [None], + "b_nunique": [1], + "b_skew": [None], + "c_nunique": [1], + "c_skew": [None], + } + with context: assert_equal_data(result, expected) def test_unary_one_element_series(constructor_eager: ConstructorEager) -> None: - data = {"a": [1], "b": [2], "c": [float("nan")]} + data = {"a": [1], "b": [2], "c": [None]} df = nw.from_native(constructor_eager(data)) result = { "a_nunique": [df["a"].n_unique()], @@ -161,10 +165,10 @@ def test_unary_one_element_series(constructor_eager: ConstructorEager) -> None: } expected = { "a_nunique": [1], - "a_skew": [float("nan")], + "a_skew": [None], "b_nunique": [1], - "b_skew": [float("nan")], + "b_skew": [None], "c_nunique": [1], - "c_skew": [float("nan")], + "c_skew": [None], } assert_equal_data(result, expected) diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index 3cef177fa..8648ae4fb 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -21,7 +21,7 @@ def test_when(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(value=3).alias("a_when")) expected = { - "a_when": [3, np.nan, np.nan], + "a_when": [3, None, None], } assert_equal_data(result, expected) @@ -41,7 +41,7 @@ def test_multiple_conditions(constructor: Constructor) -> None: nw.when(nw.col("a") < 3, nw.col("c") < 5.0).then(3).alias("a_when") ) expected = { - "a_when": [3, np.nan, np.nan], + "a_when": [3, None, None], } assert_equal_data(result, expected) @@ -65,7 +65,7 @@ def test_value_numpy_array( nw.when(nw.col("a") == 1).then(np.asanyarray([3, 4, 5])).alias("a_when") ) expected = { - "a_when": [3, np.nan, np.nan], + "a_when": [3, None, None], } assert_equal_data(result, expected) @@ -77,7 +77,7 @@ def test_value_series(constructor_eager: ConstructorEager) -> None: assert isinstance(s, nw.Series) result = df.select(nw.when(nw.col("a") == 1).then(s).alias("a_when")) expected = { - "a_when": [3, np.nan, np.nan], + "a_when": [3, None, None], } assert_equal_data(result, expected) @@ -86,7 +86,7 @@ def test_value_expression(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(nw.col("a") + 
9).alias("a_when")) expected = { - "a_when": [10, np.nan, np.nan], + "a_when": [10, None, None], } assert_equal_data(result, expected) @@ -98,7 +98,6 @@ def test_otherwise_numpy_array( request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - import numpy as np result = df.select( nw.when(nw.col("a") == 1).then(-1).otherwise(np.array([0, 9, 10])).alias("a_when") diff --git a/tests/frame/drop_nulls_test.py b/tests/frame/drop_nulls_test.py index 680cbd4c4..bb55439eb 100644 --- a/tests/frame/drop_nulls_test.py +++ b/tests/frame/drop_nulls_test.py @@ -24,8 +24,8 @@ def test_drop_nulls(constructor: Constructor) -> None: @pytest.mark.parametrize( ("subset", "expected"), [ - ("a", {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), - (["a"], {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), + ("a", {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), + (["a"], {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), (["a", "b"], {"a": [2.0, 4.0], "b": [3.0, 5.0]}), ], ) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index c743893d0..1abe2b90f 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -235,22 +235,20 @@ def test_left_join(constructor: Constructor) -> None: } df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) - result = df_left.join(df_right, left_on="bob", right_on="co", how="left").select( # type: ignore[arg-type] - nw.all().fill_null(float("nan")) - ) + result = df_left.join(df_right, left_on="bob", right_on="co", how="left") # type: ignore[arg-type] result = result.sort("index") result = result.drop("index_right") expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], - "antananarivo_right": [1, 2, float("nan")], + "antananarivo_right": [1, 2, None], "index": [0, 1, 2], } result_on_list = df_left.join( df_right, # type: ignore[arg-type] on=["antananarivo", "index"], how="left", - ).select(nw.all().fill_null(float("nan"))) + ) result_on_list = result_on_list.sort("index") expected_on_list = { "antananarivo": [1, 2, 3], @@ -312,15 +310,15 @@ def test_left_join_overlapping_column(constructor: Constructor) -> None: left_on="antananarivo", right_on="d", how="left", - ).select(nw.all().fill_null(float("nan"))) + ) result = result.sort("index") result = result.drop("index_right") expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], "d": [1, 4, 2], - "antananarivo_right": [1.0, 3.0, float("nan")], - "c": [4.0, 6.0, float("nan")], + "antananarivo_right": [1.0, 3.0, None], + "c": [4.0, 6.0, None], "index": [0, 1, 2], } assert_equal_data(result, expected) @@ -397,7 +395,7 @@ def test_joinasof_numeric( expected_forward = { "antananarivo": [1, 5, 10], "val": ["a", "b", "c"], - "val_right": [1, 6, float("nan")], + "val_right": [1, 6, None], } expected_nearest = { "antananarivo": [1, 5, 10], @@ -523,7 +521,7 @@ def test_joinasof_by( "antananarivo": [1, 5, 7, 10], "bob": ["D", "D", "C", "A"], "c": [9, 2, 1, 1], - "d": [1, 3, float("nan"), 4], + "d": [1, 3, None, 4], } assert_equal_data(result, expected) assert_equal_data(result_by, expected) diff --git a/tests/frame/pivot_test.py b/tests/frame/pivot_test.py index 98ef7466f..0e3860292 100644 --- a/tests/frame/pivot_test.py +++ b/tests/frame/pivot_test.py @@ -271,7 +271,7 @@ def test_pivot_no_index( expected = { "ix": [1, 1, 2, 2], "bar": ["x", "y", "w", "z"], - "a": [1.0, float("nan"), float("nan"), 3.0], - "b": [float("nan"), 2.0, 4.0, float("nan")], + "a": [1.0, None, None, 3.0], + "b": [None, 2.0, 4.0, None], } assert_equal_data(result, 
expected) diff --git a/tests/frame/sort_test.py b/tests/frame/sort_test.py index 4e12cc95a..5147c6f56 100644 --- a/tests/frame/sort_test.py +++ b/tests/frame/sort_test.py @@ -29,8 +29,8 @@ def test_sort(constructor: Constructor) -> None: @pytest.mark.parametrize( ("nulls_last", "expected"), [ - (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, float("nan")]}), - (False, {"a": [-1, 0, 2, 0], "b": [float("nan"), 3, 2, 1]}), + (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, None]}), + (False, {"a": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}), ], ) def test_sort_nulls( diff --git a/tests/group_by_test.py b/tests/group_by_test.py index a0a7bee41..3c57ce027 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -288,7 +288,7 @@ def test_key_with_nulls( .sort("a") .with_columns(nw.col("b").cast(nw.Float64)) ) - expected = {"b": [4.0, 5, float("nan")], "len": [1, 1, 1], "a": [1, 2, 3]} + expected = {"b": [4.0, 5, None], "len": [1, 1, 1], "a": [1, 2, 3]} assert_equal_data(result, expected) diff --git a/tests/hypothesis/join_test.py b/tests/hypothesis/join_test.py index 5b498db65..7f1cd8103 100644 --- a/tests/hypothesis/join_test.py +++ b/tests/hypothesis/join_test.py @@ -161,7 +161,7 @@ def test_left_join( # pragma: no cover left_on=left_key, right_on=right_key, ) - ).select(pl.all().fill_null(float("nan"))) + ) assert_equal_data( result_pd.to_dict(as_series=False), result_pl.to_dict(as_series=False) ) @@ -174,7 +174,7 @@ def test_left_join( # pragma: no cover left_on=left_key, right_on=right_key, ) - .select(nw.all().cast(nw.Float64).fill_null(float("nan"))) + .select(nw.all().cast(nw.Float64)) .pipe(lambda df: df.sort(df.columns)) ) assert_equal_data( diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index 44335c6d4..27777139c 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -235,8 +235,8 @@ def test_sort(pyspark_constructor: Constructor) -> None: @pytest.mark.parametrize( ("nulls_last", "expected"), [ - (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, float("nan")]}), - (False, {"a": [-1, 0, 2, 0], "b": [float("nan"), 3, 2, 1]}), + (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, None]}), + (False, {"a": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}), ], ) def test_sort_nulls( @@ -338,7 +338,7 @@ def test_sumh_all(pyspark_constructor: Constructor) -> None: # copied from tests/expr_and_series/count_test.py def test_count(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, None, 6], "z": [7.0, None, None]} + data = {"a": [1, 2, 3], "b": [4, None, 6], "z": [7.0, None, None]} df = nw.from_native(pyspark_constructor(data)) result = df.select(nw.col("a", "b", "z").count()) expected = {"a": [3], "b": [2], "z": [1]} @@ -560,8 +560,8 @@ def test_drop_nulls(pyspark_constructor: Constructor) -> None: @pytest.mark.parametrize( ("subset", "expected"), [ - ("a", {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), - (["a"], {"a": [1, 2.0, 4.0], "b": [float("nan"), 3.0, 5.0]}), + ("a", {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), + (["a"], {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), (["a", "b"], {"a": [2.0, 4.0], "b": [3.0, 5.0]}), ], ) @@ -831,7 +831,7 @@ def test_left_join(pyspark_constructor: Constructor) -> None: expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], - "antananarivo_right": [1, 2, float("nan")], + "antananarivo_right": [1, 2, None], "idx": [0, 1, 2], } result_on_list = df_left.join( @@ -912,8 +912,8 @@ def test_left_join_overlapping_column(pyspark_constructor: Constructor) -> None: "antananarivo": [1, 2, 3], "bob": [4, 5, 6], "d": [1, 4, 2], - 
"antananarivo_right": [1.0, 3.0, float("nan")], - "c": [4.0, 6.0, float("nan")], + "antananarivo_right": [1.0, 3.0, None], + "c": [4.0, 6.0, None], "idx": [0, 1, 2], } assert_equal_data(result, expected) diff --git a/tests/utils.py b/tests/utils.py index 8ad8ee03e..e7c9c7d89 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -98,8 +98,12 @@ def assert_equal_data(result: Any, expected: dict[str, Any]) -> None: for i, (lhs, rhs) in enumerate(zip_strict(result_value, expected_value)): if isinstance(lhs, float) and not math.isnan(lhs): are_equivalent_values = math.isclose(lhs, rhs, rel_tol=0, abs_tol=1e-6) - elif isinstance(lhs, float) and math.isnan(lhs) and rhs is not None: - are_equivalent_values = math.isnan(rhs) # pragma: no cover + elif isinstance(lhs, float) and math.isnan(lhs): + are_equivalent_values = rhs is None or math.isnan(rhs) + elif isinstance(rhs, float) and math.isnan(rhs): + are_equivalent_values = lhs is None or math.isnan(lhs) + elif lhs is None: + are_equivalent_values = rhs is None elif pd.isna(lhs): are_equivalent_values = pd.isna(rhs) else: From 44d449df787dd90b75a65b959d4de9497996500b Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 2 Jan 2025 11:30:53 +0100 Subject: [PATCH 13/13] docs: `Series` method' docstrings (#1699) --- narwhals/dataframe.py | 5 +- narwhals/series.py | 2682 +++++++++++++++++++++++--------- narwhals/stable/v1/__init__.py | 104 +- 3 files changed, 1995 insertions(+), 796 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 33aa35a22..0e401d464 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -3577,9 +3577,8 @@ def __repr__(self) -> str: # pragma: no cover def implementation(self) -> Implementation: """Return implementation of native frame. - This can be useful when you need to some special-casing for - some libraries for features outside of Narwhals' scope - for - example, when dealing with pandas' Period Dtype. + This can be useful when you need to use special-casing for features outside of + Narwhals' scope - for example, when dealing with pandas' Period Dtype. Returns: Implementation. diff --git a/narwhals/series.py b/narwhals/series.py index de0e64396..4203cdb74 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -79,9 +79,8 @@ def __init__( def implementation(self) -> Implementation: """Return implementation of native Series. - This can be useful when you need to some special-casing for - some libraries for features outside of Narwhals' scope - for - example, when dealing with pandas' Period Dtype. + This can be useful when you need to use special-casing for features outside of + Narwhals' scope - for example, when dealing with pandas' Period Dtype. Returns: Implementation. @@ -89,14 +88,19 @@ def implementation(self) -> Implementation: Examples: >>> import narwhals as nw >>> import pandas as pd + >>> s_native = pd.Series([1, 2, 3]) >>> s = nw.from_native(s_native, series_only=True) + >>> s.implementation + >>> s.implementation.is_pandas() True + >>> s.implementation.is_pandas_like() True + >>> s.implementation.is_polars() False """ @@ -125,16 +129,17 @@ def __getitem__(self: Self, idx: int | slice | Sequence[int]) -> Any | Self: A single element if `idx` is an integer, else a subset of the Series. 
Examples: + >>> from typing import Any >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> from typing import Any - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: @@ -142,12 +147,15 @@ def __getitem__(self: Self, idx: int | slice | Sequence[int]) -> Any | Self: ... s = nw.from_native(s_native, series_only=True) ... return s[0] - We can then pass either pandas, Polars, or any supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_get_first_item`: >>> agnostic_get_first_item(s_pd) np.int64(1) + >>> agnostic_get_first_item(s_pl) 1 + >>> agnostic_get_first_item(s_pa) 1 @@ -161,6 +169,7 @@ def __getitem__(self: Self, idx: int | slice | Sequence[int]) -> Any | Self: 0 1 1 2 dtype: int64 + >>> agnostic_slice(s_pl) # doctest:+NORMALIZE_WHITESPACE shape: (2,) Series: '' [i64] @@ -168,7 +177,8 @@ def __getitem__(self: Self, idx: int | slice | Sequence[int]) -> Any | Self: 1 2 ] - >>> agnostic_slice(s_pa) + + >>> agnostic_slice(s_pa) # doctest:+ELLIPSIS [ [ @@ -220,26 +230,31 @@ def to_native(self) -> IntoSeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_to_native(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_native`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_native(s_pd) 0 1 1 2 2 3 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_to_native(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -247,6 +262,16 @@ def to_native(self) -> IntoSeriesT: 2 3 ] + + >>> agnostic_to_native(s_pa) # doctest:+ELLIPSIS + + [ + [ + 1, + 2, + 3 + ] + ] """ return self._compliant_series._native_series # type: ignore[no-any-return] @@ -284,26 +309,31 @@ def scatter(self, indices: int | Sequence[int], values: Any) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_scatter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
 
-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_scatter`:
 
-            >>> my_library_agnostic_function(df_pd)
+            >>> agnostic_scatter(df_pd)
            a b
            0 999 4
            1 888 5
            2 3 6
-            >>> my_library_agnostic_function(df_pl)
+
+            >>> agnostic_scatter(df_pl)
            shape: (3, 2)
            ┌─────┬─────┐
            │ a ┆ b │
            │ --- ┆ --- │
            │ i64 ┆ i64 │
            ╞═════╪═════╡
            │ 999 ┆ 4 │
            │ 888 ┆ 5 │
            │ 3 ┆ 6 │
            └─────┴─────┘
+
+            >>> agnostic_scatter(df_pa)
+            pyarrow.Table
+            a: int64
+            b: int64
+            ----
+            a: [[999,888,3]]
+            b: [[4,5,6]]
        """
        return self._from_compliant_series(
            self._compliant_series.scatter(indices, self._extract_native(values))
        )
@@ -329,23 +367,31 @@ def shape(self) -> tuple[int]:
 
        Examples:
            >>> import pandas as pd
            >>> import polars as pl
+            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> from narwhals.typing import IntoSeries
-            >>> s = [1, 2, 3]
-            >>> s_pd = pd.Series(s)
-            >>> s_pl = pl.Series(s)
+
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
 
            We define a library agnostic function:
 
-            >>> def my_library_agnostic_function(s_native: IntoSeries) -> tuple[int]:
+            >>> def agnostic_shape(s_native: IntoSeries) -> tuple[int]:
            ...     s = nw.from_native(s_native, series_only=True)
            ...     return s.shape
 
-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_shape`:
+
+            >>> agnostic_shape(s_pd)
+            (3,)
 
-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_shape(s_pl)
            (3,)
-            >>> my_library_agnostic_function(s_pl)
+
+            >>> agnostic_shape(s_pa)
            (3,)
        """
        return self._compliant_series.shape  # type: ignore[no-any-return]
@@ -372,35 +418,48 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se
        Examples:
            >>> import polars as pl
            >>> import pandas as pd
+            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> from narwhals.typing import IntoSeriesT
-            >>> s_pd = pd.Series([1, 2, 3, 4])
-            >>> s_pl = pl.Series([1, 2, 3, 4])
 
-            Lets define a function to pipe into
-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            Let's define a function to pipe into:
+
+            >>> def agnostic_pipe(s_native: IntoSeriesT) -> IntoSeriesT:
            ...     s = nw.from_native(s_native, series_only=True)
            ...     return s.pipe(lambda x: x + 2).to_native()
 
-            Now apply it to the series
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_pipe`:
 
-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_pipe(s_pd)
            0 3
            1 4
            2 5
-            3 6
            dtype: int64
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
-            shape: (4,)
+
+            >>> agnostic_pipe(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (3,)
            Series: '' [i64]
            [
            3
            4
            5
-            6
            ]
+            >>> agnostic_pipe(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                3,
+                4,
+                5
+              ]
+            ]
        """
        return function(self, *args, **kwargs)
@@ -419,25 +478,33 @@ def len(self) -> int:
            The number of elements in the Series.
 
        Examples:
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries + >>> data = [1, 2, None] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function that computes the len of the series: - >>> def my_library_agnostic_function(s_native: IntoSeries) -> int: + >>> def agnostic_len(s_native: IntoSeries) -> int: ... s = nw.from_native(s_native, series_only=True) ... return s.len() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_len`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_len(s_pd) 3 - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_len(s_pl) + 3 + + >>> agnostic_len(s_pa) 3 """ return len(self._compliant_series) @@ -452,23 +519,31 @@ def dtype(self: Self) -> DType: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> nw.dtypes.DType: + >>> def agnostic_dtype(s_native: IntoSeriesT) -> nw.dtypes.DType: ... s = nw.from_native(s_native, series_only=True) ... return s.dtype - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dtype`: + + >>> agnostic_dtype(s_pd) + Int64 - >>> my_library_agnostic_function(s_pd) + >>> agnostic_dtype(s_pl) Int64 - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_dtype(s_pa) Int64 """ return self._compliant_series.dtype # type: ignore[no-any-return] @@ -483,23 +558,27 @@ def name(self) -> str: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s, name="foo") - >>> s_pl = pl.Series("foo", s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data, name="foo") + >>> s_pl = pl.Series("foo", data) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries) -> str: + >>> def agnostic_name(s_native: IntoSeries) -> str: ... s = nw.from_native(s_native, series_only=True) ... return s.name - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas or Polars + to `agnostic_name`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_name(s_pd) 'foo' - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_name(s_pl) 'foo' """ return self._compliant_series.name # type: ignore[no-any-return] @@ -561,25 +640,27 @@ def ewm_mean( >>> import polars as pl >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = [1, 2, 3] >>> s_pd = pd.Series(name="a", data=data) >>> s_pl = pl.Series(name="a", values=data) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_ewm_mean(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
            ...     return s.ewm_mean(com=1, ignore_nulls=False).to_native()
 
-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas or Polars
+            to `agnostic_ewm_mean`:
 
-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_ewm_mean(s_pd)
            0 1.000000
            1 1.666667
            2 2.428571
            Name: a, dtype: float64
 
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_ewm_mean(s_pl)  # doctest: +NORMALIZE_WHITESPACE
            shape: (3,)
            Series: 'a' [f64]
            [
            1.0
            1.666667
            2.428571
            ]
@@ -612,26 +693,31 @@ def cast(self: Self, dtype: DType | type[DType]) -> Self:
        Examples:
            >>> import pandas as pd
            >>> import polars as pl
+            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> from narwhals.typing import IntoSeriesT
-            >>> s = [True, False, True]
-            >>> s_pd = pd.Series(s)
-            >>> s_pl = pl.Series(s)
+
+            >>> data = [True, False, True]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
 
            We define a dataframe-agnostic function:
 
-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_cast(s_native: IntoSeriesT) -> IntoSeriesT:
            ...     s = nw.from_native(s_native, series_only=True)
            ...     return s.cast(nw.Int64).to_native()
 
-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_cast`:
 
-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_cast(s_pd)
            0 1
            1 0
            2 1
            dtype: int64
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_cast(s_pl)  # doctest: +NORMALIZE_WHITESPACE
            shape: (3,)
            Series: '' [i64]
            [
            1
            0
            1
            ]
+
+            >>> agnostic_cast(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                1,
+                0,
+                1
+              ]
+            ]
        """
        _validate_dtype(dtype)
        return self._from_compliant_series(self._compliant_series.cast(dtype))
@@ -652,27 +748,32 @@ def to_frame(self) -> DataFrame[Any]:
        Examples:
            >>> import pandas as pd
            >>> import polars as pl
+            >>> import pyarrow as pa
            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeries, IntoDataFrame
-            >>> s = [1, 2, 3]
-            >>> s_pd = pd.Series(s, name="a")
-            >>> s_pl = pl.Series("a", s)
+            >>> from narwhals.typing import IntoDataFrame
+            >>> from narwhals.typing import IntoSeries
+
+            >>> data = [1, 2]
+            >>> s_pd = pd.Series(data, name="a")
+            >>> s_pl = pl.Series("a", data)
+            >>> s_pa = pa.chunked_array([data])
 
            We define a library agnostic function:
 
-            >>> def my_library_agnostic_function(s_native: IntoSeries) -> IntoDataFrame:
+            >>> def agnostic_to_frame(s_native: IntoSeries) -> IntoDataFrame:
            ...     s = nw.from_native(s_native, series_only=True)
            ...
return s.to_frame().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_frame`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_frame(s_pd) a 0 1 1 2 - 2 3 - >>> my_library_agnostic_function(s_pl) - shape: (3, 1) + + >>> agnostic_to_frame(s_pl) + shape: (2, 1) ┌─────┐ │ a │ │ --- │ @@ -680,8 +781,13 @@ def to_frame(self) -> DataFrame[Any]: ╞═════╡ │ 1 │ │ 2 │ - │ 3 │ └─────┘ + + >>> agnostic_to_frame(s_pa) + pyarrow.Table + : int64 + ---- + : [[1,2]] """ return self._dataframe( self._compliant_series.to_frame(), @@ -703,23 +809,31 @@ def to_list(self) -> list[Any]: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s, name="a") - >>> s_pl = pl.Series("a", s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_to_list(s_native: IntoSeries): ... s = nw.from_native(s_native, series_only=True) ... return s.to_list() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_list`: + + >>> agnostic_to_list(s_pd) + [1, 2, 3] - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_list(s_pl) [1, 2, 3] - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_to_list(s_pa) [1, 2, 3] """ return self._compliant_series.to_list() # type: ignore[no-any-return] @@ -733,23 +847,31 @@ def mean(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_mean(s_native: IntoSeries) -> float: ... s = nw.from_native(s_native, series_only=True) ... return s.mean() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_mean`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_mean(s_pd) np.float64(2.0) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_mean(s_pl) + 2.0 + + >>> agnostic_mean(s_pa) 2.0 """ return self._compliant_series.mean() @@ -769,24 +891,28 @@ def median(self) -> Any: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [5, 3, 8] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + + >>> data = [5, 3, 8] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_median(s_native: IntoSeries) -> float: ... s = nw.from_native(s_native, series_only=True) ... 
return s.median() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_median(s_pd) np.float64(5.0) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_median(s_pl) 5.0 - >>> my_library_agnostic_function(s_pa) + + >>> agnostic_median(s_pa) 5.0 """ return self._compliant_series.median() @@ -802,22 +928,29 @@ def skew(self: Self) -> Any: >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw - >>> s = [1, 1, 2, 10, 100] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.array(s) + >>> from narwhals.typing import IntoSeries + + >>> data = [1, 1, 2, 10, 100] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> @nw.narwhalify - ... def func(s): + >>> def agnostic_skew(s_native: IntoSeries) -> float: + ... s = nw.from_native(s_native, series_only=True) ... return s.skew() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_skew`: - >>> func(s_pd) + >>> agnostic_skew(s_pd) np.float64(1.4724267269058975) - >>> func(s_pl) + + >>> agnostic_skew(s_pl) + 1.4724267269058975 + + >>> agnostic_skew(s_pa) 1.4724267269058975 Notes: @@ -835,25 +968,32 @@ def count(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_count(s_native: IntoSeries) -> int: ... s = nw.from_native(s_native, series_only=True) ... return s.count() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_count`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_count(s_pd) np.int64(3) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_count(s_pl) 3 + >>> agnostic_count(s_pa) + 3 """ return self._compliant_series.count() @@ -869,23 +1009,31 @@ def any(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [False, True, False] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [False, True, False] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_any(s_native: IntoSeries) -> bool: ... s = nw.from_native(s_native, series_only=True) ... 
return s.any() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_any`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_any(s_pd) np.True_ - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_any(s_pl) + True + + >>> agnostic_any(s_pa) True """ return self._compliant_series.any() @@ -899,25 +1047,32 @@ def all(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [True, False, True] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [False, True, False] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_all(s_native: IntoSeries) -> bool: ... s = nw.from_native(s_native, series_only=True) ... return s.all() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_all`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_all(s_pd) np.False_ - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_all(s_pl) False + >>> agnostic_all(s_pa) + False """ return self._compliant_series.all() @@ -930,23 +1085,31 @@ def min(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_min(s_native: IntoSeries): ... s = nw.from_native(s_native, series_only=True) ... return s.min() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_min`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_min(s_pd) np.int64(1) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_min(s_pl) + 1 + + >>> agnostic_min(s_pa) 1 """ return self._compliant_series.min() @@ -960,23 +1123,31 @@ def max(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_max(s_native: IntoSeries): ... s = nw.from_native(s_native, series_only=True) ... 
return s.max() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_max`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_max(s_pd) np.int64(3) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_max(s_pl) + 3 + + >>> agnostic_max(s_pa) 3 """ return self._compliant_series.max() @@ -990,10 +1161,11 @@ def arg_min(self) -> int: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: @@ -1001,13 +1173,15 @@ def arg_min(self) -> int: ... s = nw.from_native(s_native, series_only=True) ... return s.arg_min() - We can then pass either any supported library such as pandas, Polars, - or PyArrow: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_min`: >>> agnostic_arg_min(s_pd) np.int64(0) + >>> agnostic_arg_min(s_pl) 0 + >>> agnostic_arg_min(s_pa) 0 """ @@ -1022,10 +1196,11 @@ def arg_max(self) -> int: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: @@ -1033,13 +1208,15 @@ def arg_max(self) -> int: ... s = nw.from_native(s_native, series_only=True) ... return s.arg_max() - We can then pass either any supported library such as pandas, Polars, - or PyArrow: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_max`: >>> agnostic_arg_max(s_pd) np.int64(2) + >>> agnostic_arg_max(s_pl) 2 + >>> agnostic_arg_max(s_pa) 2 """ @@ -1054,23 +1231,31 @@ def sum(self) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_sum(s_native: IntoSeries): ... s = nw.from_native(s_native, series_only=True) ... 
return s.sum() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sum`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_sum(s_pd) np.int64(6) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_sum(s_pl) + 6 + + >>> agnostic_sum(s_pa) 6 """ return self._compliant_series.sum() @@ -1088,23 +1273,31 @@ def std(self, *, ddof: int = 1) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_std(s_native: IntoSeries) -> float: ... s = nw.from_native(s_native, series_only=True) ... return s.std() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_std`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_std(s_pd) np.float64(1.0) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_std(s_pl) + 1.0 + + >>> agnostic_std(s_pa) 1.0 """ return self._compliant_series.std(ddof=ddof) @@ -1119,24 +1312,32 @@ def var(self, *, ddof: int = 1) -> Any: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def agnostic_var(s_native: IntoSeries): + >>> def agnostic_var(s_native: IntoSeries) -> float: ... s = nw.from_native(s_native, series_only=True) ... return s.var() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_var`: >>> agnostic_var(s_pd) np.float64(1.0) + >>> agnostic_var(s_pl) 1.0 + + >>> agnostic_var(s_pa) + 1.0 """ return self._compliant_series.var(ddof=ddof) @@ -1155,27 +1356,31 @@ def clip( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def clip_lower(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_clip_lower(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.clip(2).to_native()

-        We can then pass either pandas or Polars to `clip_lower`:
+        We can then pass any supported library such as pandas, Polars, or
+        PyArrow to `agnostic_clip_lower`:

-            >>> clip_lower(s_pd)
+            >>> agnostic_clip_lower(s_pd)
             0    2
             1    2
             2    3
             dtype: int64
-            >>> clip_lower(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_clip_lower(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: '' [i64]
             [
@@ -1184,20 +1389,32 @@ def clip(
                3
             ]

+            >>> agnostic_clip_lower(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                2,
+                2,
+                3
+              ]
+            ]
+
         We define another library agnostic function:

-            >>> def clip_upper(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_clip_upper(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.clip(upper_bound=2).to_native()

-        We can then pass either pandas or Polars to `clip_upper`:
+        We can then pass any supported library such as pandas, Polars, or
+        PyArrow to `agnostic_clip_upper`:

-            >>> clip_upper(s_pd)
+            >>> agnostic_clip_upper(s_pd)
             0    1
             1    2
             2    2
             dtype: int64
-            >>> clip_upper(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_clip_upper(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: '' [i64]
             [
@@ -1206,21 +1423,33 @@ def clip(
                2
             ]

+            >>> agnostic_clip_upper(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                1,
+                2,
+                2
+              ]
+            ]
+
         We can have both at the same time

-            >>> s = [-1, 1, -3, 3, -5, 5]
-            >>> s_pd = pd.Series(s)
-            >>> s_pl = pl.Series(s)
+            >>> data = [-1, 1, -3, 3, -5, 5]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])

         We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_clip(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.clip(-1, 3).to_native()

-        We can pass either pandas or Polars to `func`:
+        We can then pass any supported library such as pandas, Polars, or
+        PyArrow to `agnostic_clip`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_clip(s_pd)
             0   -1
             1    1
             2   -1
@@ -1228,7 +1457,8 @@ def clip(
             4   -1
             5    3
             dtype: int64
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_clip(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (6,)
             Series: '' [i64]
             [
@@ -1239,6 +1469,19 @@ def clip(
               -1
                3
             ]
+
+            >>> agnostic_clip(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                -1,
+                1,
+                -1,
+                3,
+                -1,
+                3
+              ]
+            ]
         """
         return self._from_compliant_series(
             self._compliant_series.clip(lower_bound=lower_bound, upper_bound=upper_bound)
         )
@@ -1256,25 +1499,31 @@ def is_in(self, other: Any) -> Self:
         Examples:
             >>> import pandas as pd
             >>> import polars as pl
+            >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
-            >>> s_pd = pd.Series([1, 2, 3])
-            >>> s_pl = pl.Series([1, 2, 3])
+
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data)
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])

         We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_is_in(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ... 
return s.is_in([3, 2, 8]).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_in`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_is_in(s_pd) 0 False 1 True 2 True dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_is_in(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] [ @@ -1282,6 +1531,16 @@ def is_in(self, other: Any) -> Self: true true ] + + >>> agnostic_is_in(s_pa) # doctest: +ELLIPSIS + + [ + [ + false, + true, + true + ] + ] """ return self._from_compliant_series( self._compliant_series.is_in(self._extract_native(other)) @@ -1296,31 +1555,45 @@ def arg_true(self) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = [1, None, None, 2] - >>> s_pd = pd.Series(data, name="a") - >>> s_pl = pl.Series("a", data) + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_arg_true(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_null().arg_true().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_true`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_arg_true(s_pd) 1 1 2 2 - Name: a, dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + dtype: int64 + + >>> agnostic_arg_true(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) - Series: 'a' [u32] + Series: '' [u32] [ 1 2 ] + + >>> agnostic_arg_true(s_pa) # doctest: +ELLIPSIS + + [ + [ + 1, + 2 + ] + ] """ return self._from_compliant_series(self._compliant_series.arg_true()) @@ -1341,9 +1614,11 @@ def drop_nulls(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s_pd = pd.Series([2, 4, None, 3, 5]) - >>> s_pl = pl.Series([2, 4, None, 3, 5]) - >>> s_pa = pa.chunked_array([[2, 4, None, 3, 5]]) + + >>> data = [2, 4, None, 3, 5] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: @@ -1351,7 +1626,8 @@ def drop_nulls(self) -> Self: ... s = nw.from_native(s_native, series_only=True) ... 
return s.drop_nulls().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_drop_nulls`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_drop_nulls`: >>> agnostic_drop_nulls(s_pd) 0 2.0 @@ -1359,6 +1635,7 @@ def drop_nulls(self) -> Self: 3 3.0 4 5.0 dtype: float64 + >>> agnostic_drop_nulls(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] @@ -1368,6 +1645,7 @@ def drop_nulls(self) -> Self: 3 5 ] + >>> agnostic_drop_nulls(s_pa) # doctest: +ELLIPSIS [ @@ -1390,26 +1668,31 @@ def abs(self) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [2, -4, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [2, -4, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_abs(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.abs().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_abs`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_abs(s_pd) 0 2 1 4 2 3 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_abs(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1417,6 +1700,16 @@ def abs(self) -> Self: 4 3 ] + + >>> agnostic_abs(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2, + 4, + 3 + ] + ] """ return self._from_compliant_series(self._compliant_series.abs()) @@ -1432,26 +1725,31 @@ def cum_sum(self: Self, *, reverse: bool = False) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [2, 4, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [2, 4, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_cum_sum(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.cum_sum().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_sum`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_cum_sum(s_pd) 0 2 1 6 2 9 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_cum_sum(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1459,6 +1757,16 @@ def cum_sum(self: Self, *, reverse: bool = False) -> Self: 6 9 ] + + >>> agnostic_cum_sum(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2, + 6, + 9 + ] + ] """ return self._from_compliant_series( self._compliant_series.cum_sum(reverse=reverse) @@ -1478,26 +1786,31 @@ def unique(self, *, maintain_order: bool = False) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [2, 4, 4, 6] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [2, 4, 4, 6] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_unique(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.unique(maintain_order=True).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_unique`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_unique(s_pd) 0 2 1 4 2 6 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_unique(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1505,6 +1818,16 @@ def unique(self, *, maintain_order: bool = False) -> Self: 4 6 ] + + >>> agnostic_unique(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2, + 4, + 6 + ] + ] """ return self._from_compliant_series( self._compliant_series.unique(maintain_order=maintain_order) @@ -1528,26 +1851,31 @@ def diff(self) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [2, 4, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [2, 4, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_diff(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.diff().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_diff`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_diff(s_pd) 0 NaN 1 2.0 2 -1.0 dtype: float64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_diff(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1555,6 +1883,16 @@ def diff(self) -> Self: 2 -1 ] + + >>> agnostic_diff(s_pa) # doctest: +ELLIPSIS + + [ + [ + null, + 2, + -1 + ] + ] """ return self._from_compliant_series(self._compliant_series.diff()) @@ -1580,26 +1918,31 @@ def shift(self, n: int) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [2, 4, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [2, 4, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_shift(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.shift(1).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_shift`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_shift(s_pd) 0 NaN 1 2.0 2 4.0 dtype: float64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_shift(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -1607,6 +1950,16 @@ def shift(self, n: int) -> Self: 2 4 ] + + >>> agnostic_shift(s_pa) # doctest: +ELLIPSIS + + [ + [ + null, + 2, + 4 + ] + ] """ return self._from_compliant_series(self._compliant_series.shift(n)) @@ -1636,29 +1989,34 @@ def sample( The results are not consistent across libraries. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT - >>> s_pd = pd.Series([1, 2, 3, 4]) - >>> s_pl = pl.Series([1, 2, 3, 4]) + >>> data = [1, 2, 3, 4] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_sample(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.sample(fraction=1.0, with_replacement=True).to_native()

-        We can then pass either pandas or Polars to `func`:
+        We can then pass any supported library such as pandas, Polars, or
+        PyArrow to `agnostic_sample`:

-            >>> my_library_agnostic_function(s_pd)  # doctest: +SKIP
+            >>> agnostic_sample(s_pd)  # doctest: +SKIP
                a
             2  3
             1  2
             3  4
             3  4
-            >>> my_library_agnostic_function(s_pl)  # doctest: +SKIP
+
+            >>> agnostic_sample(s_pl)  # doctest: +SKIP
             shape: (4,)
             Series: '' [i64]
             [
                2
@@ -1667,6 +2025,17 @@ def sample(
                3
                4
             ]
+
+            >>> agnostic_sample(s_pa)  # doctest: +SKIP
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                1,
+                4,
+                3,
+                4
+              ]
+            ]
         """
         return self._from_compliant_series(
             self._compliant_series.sample(
@@ -1708,25 +2077,28 @@ def alias(self, name: str) -> Self:
             >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
-            >>> s = [1, 2, 3]
-            >>> s_pd = pd.Series(s, name="foo")
-            >>> s_pl = pl.Series("foo", s)
-            >>> s_pa = pa.chunked_array([s])
+
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data, name="foo")
+            >>> s_pl = pl.Series("foo", data)
+            >>> s_pa = pa.chunked_array([data])

         We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_alias(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.alias("bar").to_native()

-        We can then pass any supported library such as pandas, Polars, or PyArrow:
+        We can then pass any supported library such as pandas, Polars, or
+        PyArrow to `agnostic_alias`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_alias(s_pd)
             0    1
             1    2
             2    3
             Name: bar, dtype: int64
+
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_alias(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: 'bar' [i64]
             [
@@ -1734,7 +2106,8 @@ def alias(self, name: str) -> Self:
                2
                3
             ]
-            >>> my_library_agnostic_function(s_pa)  # doctest: +ELLIPSIS
+
+            >>> agnostic_alias(s_pa)  # doctest: +ELLIPSIS
             <pyarrow.lib.ChunkedArray object at ...>
             [
               [
@@ -1782,25 +2155,28 @@ def rename(self, name: str) -> Self:
             >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
-            >>> s = [1, 2, 3]
-            >>> s_pd = pd.Series(s, name="foo")
-            >>> s_pl = pl.Series("foo", s)
-            >>> s_pa = pa.chunked_array([s])
+
+            >>> data = [1, 2, 3]
+            >>> s_pd = pd.Series(data, name="foo")
+            >>> s_pl = pl.Series("foo", data)
+            >>> s_pa = pa.chunked_array([data])

         We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_rename(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.rename("bar").to_native()

-        We can then pass any supported library such as pandas, Polars, or PyArrow:
+        We can then pass any supported library such as pandas, Polars, or
+        PyArrow to `agnostic_rename`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_rename(s_pd)
             0    1
             1    2
             2    3
             Name: bar, dtype: int64
+
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_rename(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: 'bar' [i64]
             [
@@ -1808,7 +2184,8 @@ def rename(self, name: str) -> Self:
                2
                3
             ]
-            >>> my_library_agnostic_function(s_pa)  # doctest: +ELLIPSIS
+
+            >>> agnostic_rename(s_pa)  # doctest: +ELLIPSIS
             <pyarrow.lib.ChunkedArray object at ...>
             [
               [
@@ -1844,32 +2221,36 @@ def replace_strict(
             A new Series with values replaced according to the mapping.
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [3, 0, 1, 2]}) - >>> df_pl = pl.DataFrame({"a": [3, 0, 1, 2]}) - >>> df_pa = pa.table({"a": [3, 0, 1, 2]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = {"a": [3, 0, 1, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define dataframe-agnostic functions: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_replace_strict(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.replace_strict( ... [0, 1, 2, 3], ["zero", "one", "two", "three"], return_dtype=nw.String ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_replace_strict`: - >>> my_library_agnostic_function(df_pd["a"]) + >>> agnostic_replace_strict(df_pd["a"]) 0 three 1 zero 2 one 3 two Name: a, dtype: object - >>> my_library_agnostic_function(df_pl["a"]) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_replace_strict(df_pl["a"]) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: 'a' [str] [ @@ -1878,7 +2259,8 @@ def replace_strict( "one" "two" ] - >>> my_library_agnostic_function(df_pa["a"]) + + >>> agnostic_replace_strict(df_pa["a"]) [ [ @@ -1914,11 +2296,14 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [5, None, 1, 2] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [5, None, 1, 2] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define library agnostic functions: @@ -1930,7 +2315,8 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: ... s = nw.from_native(s_native, series_only=True) ... 
return s.sort(descending=True).to_native() - We can then pass either pandas or Polars to `agnostic_sort`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sort` and `agnostic_sort_descending`: >>> agnostic_sort(s_pd) 1 NaN @@ -1938,6 +2324,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: 3 2.0 0 5.0 dtype: float64 + >>> agnostic_sort(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] @@ -1947,12 +2334,25 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: 2 5 ] + + >>> agnostic_sort(s_pa) # doctest: +ELLIPSIS + + [ + [ + null, + 1, + 2, + 5 + ] + ] + >>> agnostic_sort_descending(s_pd) 1 NaN 0 5.0 3 2.0 2 1.0 dtype: float64 + >>> agnostic_sort_descending(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] @@ -1962,6 +2362,17 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: 2 1 ] + + >>> agnostic_sort_descending(s_pa) # doctest: +ELLIPSIS + + [ + [ + null, + 5, + 2, + 1 + ] + ] """ return self._from_compliant_series( self._compliant_series.sort(descending=descending, nulls_last=nulls_last) @@ -1981,13 +2392,14 @@ def is_null(self) -> Self: Examples: >>> import pandas as pd >>> import polars as pl - >>> import narwhals as nw >>> import pyarrow as pa + >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [1, 2, None] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + + >>> data = [1, 2, None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: @@ -1995,13 +2407,15 @@ def is_null(self) -> Self: ... s = nw.from_native(s_native, series_only=True) ... return s.is_null().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_is_null`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_null`: >>> agnostic_is_null(s_pd) 0 False 1 False 2 True dtype: bool + >>> agnostic_is_null(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] @@ -2010,6 +2424,7 @@ def is_null(self) -> Self: false true ] + >>> agnostic_is_null(s_pa) # doctest:+ELLIPSIS [ @@ -2049,10 +2464,11 @@ def fill_null( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [1, 2, None] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + + >>> data = [1, 2, None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: @@ -2060,13 +2476,15 @@ def fill_null( ... s = nw.from_native(s_native, series_only=True) ... 
return s.fill_null(5).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_fill_null`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_fill_null`: >>> agnostic_fill_null(s_pd) 0 1.0 1 2.0 2 5.0 dtype: float64 + >>> agnostic_fill_null(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] @@ -2075,6 +2493,7 @@ def fill_null( 2 5 ] + >>> agnostic_fill_null(s_pa) # doctest:+ELLIPSIS [ @@ -2096,6 +2515,7 @@ def fill_null( 1 2.0 2 2.0 dtype: float64 + >>> agnostic_fill_null_with_strategy(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] @@ -2104,6 +2524,7 @@ def fill_null( 2 2 ] + >>> agnostic_fill_null_with_strategy(s_pa) # doctest:+ELLIPSIS [ @@ -2147,27 +2568,33 @@ def is_between( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s_pd = pd.Series([1, 2, 3, 4, 5]) - >>> s_pl = pl.Series([1, 2, 3, 4, 5]) + + >>> data = [1, 2, 3, 4, 5] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_between(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_between(2, 4, "right").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_between`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_is_between(s_pd) 0 False 1 False 2 True 3 True 4 False dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_is_between(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: '' [bool] [ @@ -2177,6 +2604,18 @@ def is_between( true false ] + + >>> agnostic_is_between(s_pa) # doctest: +ELLIPSIS + + [ + [ + false, + false, + true, + true, + false + ] + ] """ return self._from_compliant_series( self._compliant_series.is_between( @@ -2195,23 +2634,31 @@ def n_unique(self) -> int: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 2, 3] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [1, 2, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_n_unique(s_native: IntoSeries) -> int: ... s = nw.from_native(s_native, series_only=True) ... return s.n_unique() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_n_unique`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_n_unique(s_pd) 3 - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_n_unique(s_pl) + 3 + + >>> agnostic_n_unique(s_pa) 3 """ return self._compliant_series.n_unique() # type: ignore[no-any-return] @@ -2223,26 +2670,34 @@ def to_numpy(self) -> np.ndarray: NumPy ndarray representation of the Series. 
Examples: + >>> import numpy as np >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> import numpy as np >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s, name="a") - >>> s_pl = pl.Series("a", s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data, name="a") + >>> s_pl = pl.Series("a", data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries) -> np.ndarray: + >>> def agnostic_to_numpy(s_native: IntoSeries) -> np.ndarray: ... s = nw.from_native(s_native, series_only=True) ... return s.to_numpy() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_numpy`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_numpy(s_pd) array([1, 2, 3]...) - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_to_numpy(s_pl) + array([1, 2, 3]...) + + >>> agnostic_to_numpy(s_pa) array([1, 2, 3]...) """ return self._compliant_series.to_numpy() @@ -2256,30 +2711,41 @@ def to_pandas(self) -> pd.Series: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeries - >>> s = [1, 2, 3] - >>> s_pd = pd.Series(s, name="a") - >>> s_pl = pl.Series("a", s) + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data, name="a") + >>> s_pl = pl.Series("a", data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries) -> pd.Series: + >>> def agnostic_to_pandas(s_native: IntoSeries) -> pd.Series: ... s = nw.from_native(s_native, series_only=True) ... return s.to_pandas() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_pandas`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_pandas(s_pd) 0 1 1 2 2 3 Name: a, dtype: int64 - >>> my_library_agnostic_function(s_pl) + + >>> agnostic_to_pandas(s_pl) 0 1 1 2 2 3 Name: a, dtype: int64 + + >>> agnostic_to_pandas(s_pa) + 0 1 + 1 2 + 2 3 + Name: , dtype: int64 """ return self._compliant_series.to_pandas() @@ -2416,26 +2882,31 @@ def filter(self, other: Any) -> Self: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> s = [4, 10, 15, 34, 50] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) + + >>> data = [4, 10, 15, 34, 50] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_filter(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.filter(s > 10).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_filter`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_filter(s_pd) 2 15 3 34 4 50 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_filter(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -2443,6 +2914,16 @@ def filter(self, other: Any) -> Self: 34 50 ] + + >>> agnostic_filter(s_pa) # doctest: +ELLIPSIS + + [ + [ + 15, + 34, + 50 + ] + ] """ return self._from_compliant_series( self._compliant_series.filter(self._extract_native(other)) @@ -2456,28 +2937,34 @@ def is_duplicated(self: Self) -> Self: A new Series with boolean values indicating duplicated rows. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl - >>> s_pd = pd.Series([1, 2, 3, 1]) - >>> s_pl = pl.Series([1, 2, 3, 1]) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [1, 2, 3, 1] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_duplicated(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_duplicated().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_duplicated`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_duplicated(s_pd) 0 True 1 False 2 False 3 True dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_is_duplicated(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -2486,6 +2973,17 @@ def is_duplicated(self: Self) -> Self: false true ] + + >>> agnostic_is_duplicated(s_pa) # doctest: +ELLIPSIS + + [ + [ + true, + false, + false, + true + ] + ] """ return self._from_compliant_series(self._compliant_series.is_duplicated()) @@ -2496,29 +2994,35 @@ def is_empty(self: Self) -> bool: A boolean indicating if the series is empty. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries Let's define a dataframe-agnostic function that filters rows in which "foo" values are greater than 10, and then checks if the result is empty or not: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_is_empty(s_native: IntoSeries) -> bool: ... s = nw.from_native(s_native, series_only=True) ... 
return s.filter(s > 10).is_empty() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_empty`: - >>> s_pd = pd.Series([1, 2, 3]) - >>> s_pl = pl.Series([1, 2, 3]) - >>> my_library_agnostic_function(s_pd), my_library_agnostic_function(s_pl) - (True, True) + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + >>> agnostic_is_empty(s_pd), agnostic_is_empty(s_pl), agnostic_is_empty(s_pa) + (True, True, True) - >>> s_pd = pd.Series([100, 2, 3]) - >>> s_pl = pl.Series([100, 2, 3]) - >>> my_library_agnostic_function(s_pd), my_library_agnostic_function(s_pl) - (False, False) + >>> data = [100, 2, 3] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + >>> agnostic_is_empty(s_pd), agnostic_is_empty(s_pl), agnostic_is_empty(s_pa) + (False, False, False) """ return self._compliant_series.is_empty() # type: ignore[no-any-return] @@ -2529,29 +3033,34 @@ def is_unique(self: Self) -> Self: A new Series with boolean values indicating unique rows. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl - >>> s_pd = pd.Series([1, 2, 3, 1]) - >>> s_pl = pl.Series([1, 2, 3, 1]) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [1, 2, 3, 1] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_unique(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_unique().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_unique`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_unique(s_pd) 0 False 1 True 2 True 3 False dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_unique(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -2560,6 +3069,16 @@ def is_unique(self: Self) -> Self: true false ] + >>> agnostic_is_unique(s_pa) # doctest: +ELLIPSIS + + [ + [ + false, + true, + true, + false + ] + ] """ return self._from_compliant_series(self._compliant_series.is_unique()) @@ -2575,29 +3094,33 @@ def null_count(self: Self) -> int: The number of null values in the Series. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> s = [1, None, None] - >>> s_pd = pd.Series(s) - >>> s_pl = pl.Series(s) - >>> s_pa = pa.chunked_array([s]) + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries + + >>> data = [1, None, None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function that returns the null count of the series: - >>> def agnostic_null_count(s_native: IntoSeries): + >>> def agnostic_null_count(s_native: IntoSeries) -> int: ... s = nw.from_native(s_native, series_only=True) ... 
return s.null_count() - We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_null_count`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_null_count`: >>> agnostic_null_count(s_pd) np.int64(2) + >>> agnostic_null_count(s_pl) 2 + >>> agnostic_null_count(s_pa) 2 """ @@ -2610,22 +3133,27 @@ def is_first_distinct(self: Self) -> Self: A new Series with boolean values indicating the first occurrence of each distinct value. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl - >>> s_pd = pd.Series([1, 1, 2, 3, 2]) - >>> s_pl = pl.Series([1, 1, 2, 3, 2]) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [1, 1, 2, 3, 2] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_first_distinct(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_first_distinct().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_first_distinct`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_first_distinct(s_pd) 0 True 1 False 2 True @@ -2633,7 +3161,7 @@ def is_first_distinct(self: Self) -> Self: 4 False dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_first_distinct(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: '' [bool] [ @@ -2643,6 +3171,18 @@ def is_first_distinct(self: Self) -> Self: true false ] + + >>> agnostic_is_first_distinct(s_pa) # doctest: +ELLIPSIS + + [ + [ + true, + false, + true, + true, + false + ] + ] """ return self._from_compliant_series(self._compliant_series.is_first_distinct()) @@ -2653,22 +3193,27 @@ def is_last_distinct(self: Self) -> Self: A new Series with boolean values indicating the last occurrence of each distinct value. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl - >>> s_pd = pd.Series([1, 1, 2, 3, 2]) - >>> s_pl = pl.Series([1, 1, 2, 3, 2]) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [1, 1, 2, 3, 2] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_last_distinct(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.is_last_distinct().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_last_distinct`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_last_distinct(s_pd) 0 False 1 True 2 False @@ -2676,7 +3221,7 @@ def is_last_distinct(self: Self) -> Self: 4 True dtype: bool - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_last_distinct(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: '' [bool] [ @@ -2686,6 +3231,18 @@ def is_last_distinct(self: Self) -> Self: true true ] + + >>> agnostic_is_last_distinct(s_pa) # doctest: +ELLIPSIS + + [ + [ + false, + true, + false, + true, + true + ] + ] """ return self._from_compliant_series(self._compliant_series.is_last_distinct()) @@ -2699,30 +3256,40 @@ def is_sorted(self: Self, *, descending: bool = False) -> bool: A boolean indicating if the Series is sorted. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries + >>> unsorted_data = [1, 3, 2] >>> sorted_data = [3, 2, 1] Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function( - ... s_native: IntoSeries, descending: bool = False - ... ): + >>> def agnostic_is_sorted(s_native: IntoSeries, descending: bool = False): ... s = nw.from_native(s_native, series_only=True) ... return s.is_sorted(descending=descending) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_sorted`: + + >>> agnostic_is_sorted(pd.Series(unsorted_data)) + False + + >>> agnostic_is_sorted(pd.Series(sorted_data), descending=True) + True - >>> my_library_agnostic_function(pl.Series(unsorted_data)) + >>> agnostic_is_sorted(pl.Series(unsorted_data)) False - >>> my_library_agnostic_function(pl.Series(sorted_data), descending=True) + + >>> agnostic_is_sorted(pl.Series(sorted_data), descending=True) True - >>> my_library_agnostic_function(pd.Series(unsorted_data)) + + >>> agnostic_is_sorted(pa.chunked_array([unsorted_data])) False - >>> my_library_agnostic_function(pd.Series(sorted_data), descending=True) + + >>> agnostic_is_sorted(pa.chunked_array([sorted_data]), descending=True) True """ return self._compliant_series.is_sorted(descending=descending) # type: ignore[no-any-return] @@ -2751,28 +3318,34 @@ def value_counts( - Either count or proportion as second column, depending on normalize parameter. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries, IntoDataFrame >>> import pandas as pd >>> import polars as pl - >>> s_pd = pd.Series([1, 1, 2, 3, 2], name="s") - >>> s_pl = pl.Series(values=[1, 1, 2, 3, 2], name="s") + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> from narwhals.typing import IntoSeries + + >>> data = [1, 1, 2, 3, 2] + >>> s_pd = pd.Series(data, name="s") + >>> s_pl = pl.Series(values=data, name="s") + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries) -> IntoDataFrame: + >>> def agnostic_value_counts(s_native: IntoSeries) -> IntoDataFrame: ... s = nw.from_native(s_native, series_only=True) ... 
return s.value_counts(sort=True).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_value_counts`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_value_counts(s_pd) s count 0 1 2 1 2 2 2 3 1 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_value_counts(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3, 2) ┌─────┬───────┐ │ s ┆ count │ @@ -2783,6 +3356,14 @@ def value_counts( │ 2 ┆ 2 │ │ 3 ┆ 1 │ └─────┴───────┘ + + >>> agnostic_value_counts(s_pa) + pyarrow.Table + : int64 + count: int64 + ---- + : [[1,2,3]] + count: [[2,2,1]] """ return self._dataframe( self._compliant_series.value_counts( @@ -2809,30 +3390,37 @@ def quantile( The quantile value. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries + >>> data = list(range(50)) >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeries): + >>> def agnostic_quantile(s_native: IntoSeries) -> list[float]: ... s = nw.from_native(s_native, series_only=True) ... return [ ... s.quantile(quantile=q, interpolation="nearest") ... for q in (0.1, 0.25, 0.5, 0.75, 0.9) ... ] - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_quantile`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_quantile(s_pd) [np.int64(5), np.int64(12), np.int64(24), np.int64(37), np.int64(44)] - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_quantile(s_pl) [5.0, 12.0, 25.0, 37.0, 44.0] + + >>> agnostic_quantile(s_pa) + [5, 12, 24, 37, 44] """ return self._compliant_series.quantile( quantile=quantile, interpolation=interpolation @@ -2852,20 +3440,19 @@ def zip_with(self: Self, mask: Self, other: Self) -> Self: A new Series with values selected from self or other based on the mask. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl - >>> s1_pl = pl.Series([1, 2, 3, 4, 5]) - >>> s2_pl = pl.Series([5, 4, 3, 2, 1]) - >>> mask_pl = pl.Series([True, False, True, False, True]) - >>> s1_pd = pd.Series([1, 2, 3, 4, 5]) - >>> s2_pd = pd.Series([5, 4, 3, 2, 1]) - >>> mask_pd = pd.Series([True, False, True, False, True]) + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + + >>> data = [1, 2, 3, 4, 5] + >>> other = [5, 4, 3, 2, 1] + >>> mask = [True, False, True, False, True] Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function( + >>> def agnostic_zip_with( ... s1_native: IntoSeriesT, mask_native: IntoSeriesT, s2_native: IntoSeriesT ... ) -> IntoSeriesT: ... s1 = nw.from_native(s1_native, series_only=True) @@ -2873,10 +3460,13 @@ def zip_with(self: Self, mask: Self, other: Self) -> Self: ... s2 = nw.from_native(s2_native, series_only=True) ... return s1.zip_with(mask, s2).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_zip_with`: - >>> my_library_agnostic_function( - ... 
s1_pl, mask_pl, s2_pl + >>> agnostic_zip_with( + ... s1_native=pl.Series(data), + ... mask_native=pl.Series(mask), + ... s2_native=pl.Series(other), ... ) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: '' [i64] @@ -2887,13 +3477,34 @@ def zip_with(self: Self, mask: Self, other: Self) -> Self: 2 5 ] - >>> my_library_agnostic_function(s1_pd, mask_pd, s2_pd) + + >>> agnostic_zip_with( + ... s1_native=pd.Series(data), + ... mask_native=pd.Series(mask), + ... s2_native=pd.Series(other), + ... ) 0 1 1 4 2 3 3 2 4 5 dtype: int64 + + >>> agnostic_zip_with( + ... s1_native=pa.chunked_array([data]), + ... mask_native=pa.chunked_array([mask]), + ... s2_native=pa.chunked_array([other]), + ... ) # doctest: +ELLIPSIS + + [ + [ + 1, + 4, + 3, + 2, + 5 + ] + ] """ return self._from_compliant_series( self._compliant_series.zip_with( @@ -2911,30 +3522,34 @@ def item(self: Self, index: int | None = None) -> Any: The scalar value of the Series or the element at the given index. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries Let's define a dataframe-agnostic function that returns item at given index - >>> def my_library_agnostic_function(s_native: IntoSeries, index=None): + >>> def agnostic_item(s_native: IntoSeries, index=None): ... s = nw.from_native(s_native, series_only=True) ... return s.item(index) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_item`: >>> ( - ... my_library_agnostic_function(pl.Series("a", [1]), None), - ... my_library_agnostic_function(pd.Series([1]), None), + ... agnostic_item(pl.Series("a", [1]), None), + ... agnostic_item(pd.Series([1]), None), + ... agnostic_item(pa.chunked_array([[1]]), None), ... ) - (1, np.int64(1)) + (1, np.int64(1), 1) >>> ( - ... my_library_agnostic_function(pl.Series("a", [9, 8, 7]), -1), - ... my_library_agnostic_function(pl.Series([9, 8, 7]), -2), + ... agnostic_item(pl.Series("a", [9, 8, 7]), -1), + ... agnostic_item(pl.Series([9, 8, 7]), -2), + ... agnostic_item(pa.chunked_array([[9, 8, 7]]), -3), ... ) - (7, 8) + (7, 8, 9) """ return self._compliant_series.item(index=index) @@ -2948,29 +3563,33 @@ def head(self: Self, n: int = 10) -> Self: A new Series containing the first n characters of each string. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = list(range(10)) >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function that returns the first 3 rows: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_head(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
return s.head(3).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_head`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_head(s_pd) 0 0 1 1 2 2 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_head(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -2978,6 +3597,16 @@ def head(self: Self, n: int = 10) -> Self: 1 2 ] + + >>> agnostic_head(s_pa) # doctest: +ELLIPSIS + + [ + [ + 0, + 1, + 2 + ] + ] """ return self._from_compliant_series(self._compliant_series.head(n)) @@ -2991,28 +3620,33 @@ def tail(self: Self, n: int = 10) -> Self: A new Series with the last n rows. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = list(range(10)) >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function that returns the last 3 rows: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_tail(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.tail(3).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_tail`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_tail(s_pd) 7 7 8 8 9 9 dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_tail(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [i64] [ @@ -3020,6 +3654,16 @@ def tail(self: Self, n: int = 10) -> Self: 8 9 ] + + >>> agnostic_tail(s_pa) # doctest: +ELLIPSIS + + [ + [ + 7, + 8, + 9 + ] + ] """ return self._from_compliant_series(self._compliant_series.tail(n)) @@ -3041,29 +3685,33 @@ def round(self: Self, decimals: int = 0) -> Self: Polars and Arrow round away from 0 (e.g. -0.5 to -1.0, 0.5 to 1.0, 1.5 to 2.0, 2.5 to 3.0, etc..). Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1.12345, 2.56789, 3.901234] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) Let's define a dataframe-agnostic function that rounds to the first decimal: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_round(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
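+ ... # round(1) keeps one decimal place; per the Notes above, half-way
+ ... # values can round differently between pandas and Polars/Arrow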
return s.round(1).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_round`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_round(s_pd) 0 1.1 1 2.6 2 3.9 dtype: float64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_round(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [f64] [ @@ -3071,6 +3719,16 @@ def round(self: Self, decimals: int = 0) -> Self: 2.6 3.9 ] + + >>> agnostic_round(s_pa) # doctest: +ELLIPSIS + + [ + [ + 1.1, + 2.6, + 3.9 + ] + ] """ return self._from_compliant_series(self._compliant_series.round(decimals)) @@ -3091,37 +3749,42 @@ def to_dummies( between NaN and Null, whereas pandas doesn't. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeries, IntoDataFrame >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> from narwhals.typing import IntoSeries + >>> data = [1, 2, 3] >>> s_pd = pd.Series(data, name="a") >>> s_pl = pl.Series("a", data) + >>> s_pa = pa.chunked_array([data]) - Let's define a dataframe-agnostic function that rounds to the first decimal: + Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function( + >>> def agnostic_to_dummies( ... s_native: IntoSeries, drop_first: bool = False ... ) -> IntoDataFrame: ... s = nw.from_native(s_native, series_only=True) ... return s.to_dummies(drop_first=drop_first).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_dummies`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_dummies(s_pd) a_1 a_2 a_3 0 1 0 0 1 0 1 0 2 0 0 1 - >>> my_library_agnostic_function(s_pd, drop_first=True) + >>> agnostic_to_dummies(s_pd, drop_first=True) a_2 a_3 0 0 0 1 1 0 2 0 1 - >>> my_library_agnostic_function(s_pl) + >>> agnostic_to_dummies(s_pl) shape: (3, 3) ┌─────┬─────┬─────┐ │ a_1 ┆ a_2 ┆ a_3 │ @@ -3132,7 +3795,8 @@ def to_dummies( │ 0 ┆ 1 ┆ 0 │ │ 0 ┆ 0 ┆ 1 │ └─────┴─────┴─────┘ - >>> my_library_agnostic_function(s_pl, drop_first=True) + + >>> agnostic_to_dummies(s_pl, drop_first=True) shape: (3, 2) ┌─────┬─────┐ │ a_2 ┆ a_3 │ @@ -3143,6 +3807,23 @@ def to_dummies( │ 1 ┆ 0 │ │ 0 ┆ 1 │ └─────┴─────┘ + + >>> agnostic_to_dummies(s_pa) + pyarrow.Table + _1: int8 + _2: int8 + _3: int8 + ---- + _1: [[1,0,0]] + _2: [[0,1,0]] + _3: [[0,0,1]] + >>> agnostic_to_dummies(s_pa, drop_first=True) + pyarrow.Table + _2: int8 + _3: int8 + ---- + _2: [[0,1,0]] + _3: [[0,0,1]] """ return self._dataframe( self._compliant_series.to_dummies(separator=separator, drop_first=drop_first), @@ -3160,33 +3841,48 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: A new Series with every nth value starting from the offset. 
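+
+ Notes:
+ For eager backends this selects the same rows as the stride slice
+ `data[offset::n]` would on a plain Python list:
+
+ >>> [1, 2, 3, 4][1::2]
+ [2, 4]
+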
Examples:
- >>> import narwhals as nw
- >>> from narwhals.typing import IntoSeriesT
>>> import pandas as pd
>>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeriesT
+
>>> data = [1, 2, 3, 4]
- >>> s_pd = pd.Series(name="a", data=data)
- >>> s_pl = pl.Series(name="a", values=data)
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])

Let's define a dataframe-agnostic function that gathers every 2 rows,
starting from an offset of 1:

- >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+ >>> def agnostic_gather_every(s_native: IntoSeriesT) -> IntoSeriesT:
... s = nw.from_native(s_native, series_only=True)
... return s.gather_every(n=2, offset=1).to_native()

- >>> my_library_agnostic_function(s_pd)
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_gather_every`:
+
+ >>> agnostic_gather_every(s_pd)
1 2
3 4
- Name: a, dtype: int64
+ dtype: int64

- >>> my_library_agnostic_function(s_pl) # doctest:+NORMALIZE_WHITESPACE
+ >>> agnostic_gather_every(s_pl) # doctest:+NORMALIZE_WHITESPACE
shape: (2,)
- Series: 'a' [i64]
+ Series: '' [i64]
[
2
4
]
+
+ >>> agnostic_gather_every(s_pa) # doctest:+ELLIPSIS
+ <pyarrow.lib.ChunkedArray object at ...>
+ [
+ [
+ 2,
+ 4
+ ]
+ ]
"""
return self._from_compliant_series(
self._compliant_series.gather_every(n=n, offset=offset)
@@ -3199,22 +3895,36 @@ def to_arrow(self: Self) -> pa.Array:
A PyArrow Array containing the data from the Series.

Examples:
- >>> import narwhals as nw
- >>> from narwhals.typing import IntoSeries
- >>> import pyarrow as pa
>>> import pandas as pd
>>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> from narwhals.typing import IntoSeries
+
>>> data = [1, 2, 3, 4]
- >>> s_pd = pd.Series(name="a", data=data)
- >>> s_pl = pl.Series(name="a", values=data)
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])

Let's define a dataframe-agnostic function that converts to arrow:

- >>> def my_library_agnostic_function(s_native: IntoSeries) -> pa.Array:
+ >>> def agnostic_to_arrow(s_native: IntoSeries) -> pa.Array:
... s = nw.from_native(s_native, series_only=True)
... return s.to_arrow()

- >>> my_library_agnostic_function(s_pd) # doctest:+NORMALIZE_WHITESPACE
+ We can then pass any supported library such as pandas, Polars, or
+ PyArrow to `agnostic_to_arrow`:
+
+ >>> agnostic_to_arrow(s_pd) # doctest:+NORMALIZE_WHITESPACE
+ <pyarrow.lib.Int64Array object at ...>
+ [
+ 1,
+ 2,
+ 3,
+ 4
+ ]
+
+ >>> agnostic_to_arrow(s_pl) # doctest:+NORMALIZE_WHITESPACE
<pyarrow.lib.Int64Array object at ...>
[
1,
2,
3,
4
]

- >>> my_library_agnostic_function(s_pl) # doctest:+NORMALIZE_WHITESPACE
+ >>> agnostic_to_arrow(s_pa) # doctest:+NORMALIZE_WHITESPACE
<pyarrow.lib.Int64Array object at ...>
[
1,
2,
3,
4
]
"""
return self._from_compliant_series(self._compliant_series.to_arrow())

@@ -3245,33 +3955,45 @@ def mode(self: Self) -> Self:

Examples:
>>> import pandas as pd
>>> import polars as pl
+ >>> import pyarrow as pa
>>> import narwhals as nw
>>> from narwhals.typing import IntoSeriesT
+
>>> data = [1, 1, 2, 2, 3]
- >>> s_pd = pd.Series(name="a", data=data)
- >>> s_pl = pl.Series(name="a", values=data)
+ >>> s_pd = pd.Series(data)
+ >>> s_pl = pl.Series(data)
+ >>> s_pa = pa.chunked_array([data])

We define a library agnostic function:

- >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+ >>> def agnostic_mode(s_native: IntoSeriesT) -> IntoSeriesT:
... s = nw.from_native(s_native, series_only=True)
... 
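+ ... # mode() may return several values in arbitrary order, so sort()
+ ... # keeps the doctest output deterministic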
return s.mode().sort().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_mode`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_mode(s_pd) 0 1 1 2 - Name: a, dtype: int64 + dtype: int64 - >>> my_library_agnostic_function(s_pl) # doctest:+NORMALIZE_WHITESPACE + >>> agnostic_mode(s_pl) # doctest:+NORMALIZE_WHITESPACE shape: (2,) - Series: 'a' [i64] + Series: '' [i64] [ 1 2 ] + + >>> agnostic_mode(s_pa) # doctest:+ELLIPSIS + + [ + [ + 1, + 2 + ] + ] """ return self._from_compliant_series(self._compliant_series.mode()) @@ -3287,31 +4009,31 @@ def is_finite(self: Self) -> Self: Expression of `Boolean` data type. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [float("nan"), float("inf"), 2.0, None] We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_is_finite(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.is_finite().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_finite`: - >>> my_library_agnostic_function(pd.Series(data)) + >>> agnostic_is_finite(pd.Series(data)) 0 False 1 False 2 True 3 False dtype: bool - >>> my_library_agnostic_function( - ... pl.Series(data) - ... ) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_is_finite(pl.Series(data)) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [bool] [ @@ -3321,7 +4043,7 @@ def is_finite(self: Self) -> Self: null ] - >>> my_library_agnostic_function(pa.chunked_array([data])) # doctest: +ELLIPSIS + >>> agnostic_is_finite(pa.chunked_array([data])) # doctest: +ELLIPSIS [ [ @@ -3344,28 +4066,31 @@ def cum_count(self: Self, *, reverse: bool = False) -> Self: A new Series with the cumulative count of non-null values. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = ["x", "k", None, "d"] We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_cum_count(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
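+ ... # reverse=True counts the non-null values remaining from each
+ ... # position to the end of the Series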
return s.cum_count(reverse=True).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_count`: - >>> my_library_agnostic_function(pd.Series(data)) + >>> agnostic_cum_count(pd.Series(data)) 0 3 1 2 2 1 3 1 dtype: int64 - >>> my_library_agnostic_function(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + + >>> agnostic_cum_count(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE shape: (4,) Series: '' [u32] [ @@ -3374,7 +4099,8 @@ def cum_count(self: Self, *, reverse: bool = False) -> Self: 1 1 ] - >>> my_library_agnostic_function(pa.chunked_array([data])) # doctest:+ELLIPSIS + + >>> agnostic_cum_count(pa.chunked_array([data])) # doctest:+ELLIPSIS [ [ @@ -3400,28 +4126,31 @@ def cum_min(self: Self, *, reverse: bool = False) -> Self: A new Series with the cumulative min of non-null values. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [3, 1, None, 2] We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_cum_min(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.cum_min().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_min`: - >>> my_library_agnostic_function(pd.Series(data)) + >>> agnostic_cum_min(pd.Series(data)) 0 3.0 1 1.0 2 NaN 3 1.0 dtype: float64 - >>> my_library_agnostic_function(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + + >>> agnostic_cum_min(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] [ @@ -3430,7 +4159,8 @@ def cum_min(self: Self, *, reverse: bool = False) -> Self: null 1 ] - >>> my_library_agnostic_function(pa.chunked_array([data])) # doctest:+ELLIPSIS + + >>> agnostic_cum_min(pa.chunked_array([data])) # doctest:+ELLIPSIS [ [ @@ -3456,28 +4186,31 @@ def cum_max(self: Self, *, reverse: bool = False) -> Self: A new Series with the cumulative max of non-null values. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1, 3, None, 2] We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_cum_max(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
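+ ... # nulls neither update nor reset the running maximum; they stay
+ ... # null (NaN in pandas) in the output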
return s.cum_max().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_max`: - >>> my_library_agnostic_function(pd.Series(data)) + >>> agnostic_cum_max(pd.Series(data)) 0 1.0 1 3.0 2 NaN 3 3.0 dtype: float64 - >>> my_library_agnostic_function(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + + >>> agnostic_cum_max(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] [ @@ -3486,7 +4219,8 @@ def cum_max(self: Self, *, reverse: bool = False) -> Self: null 3 ] - >>> my_library_agnostic_function(pa.chunked_array([data])) # doctest:+ELLIPSIS + + >>> agnostic_cum_max(pa.chunked_array([data])) # doctest:+ELLIPSIS [ [ @@ -3512,28 +4246,31 @@ def cum_prod(self: Self, *, reverse: bool = False) -> Self: A new Series with the cumulative product of non-null values. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1, 3, None, 2] We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_cum_prod(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.cum_prod().to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_prod`: - >>> my_library_agnostic_function(pd.Series(data)) + >>> agnostic_cum_prod(pd.Series(data)) 0 1.0 1 3.0 2 NaN 3 6.0 dtype: float64 - >>> my_library_agnostic_function(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + + >>> agnostic_cum_prod(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] [ @@ -3542,7 +4279,8 @@ def cum_prod(self: Self, *, reverse: bool = False) -> Self: null 6 ] - >>> my_library_agnostic_function(pa.chunked_array([data])) # doctest:+ELLIPSIS + + >>> agnostic_cum_prod(pa.chunked_array([data])) # doctest:+ELLIPSIS [ [ @@ -3590,11 +4328,12 @@ def rolling_sum( A new series. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1.0, 2.0, 3.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) @@ -3606,7 +4345,8 @@ def rolling_sum( ... s = nw.from_native(s_native, series_only=True) ... return s.rolling_sum(window_size=2).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_sum`: >>> agnostic_rolling_sum(s_pd) 0 NaN @@ -3683,11 +4423,12 @@ def rolling_mean( A new series. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1.0, 2.0, 3.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) @@ -3699,7 +4440,8 @@ def rolling_mean( ... s = nw.from_native(s_native, series_only=True) ... 
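+ ... # each window averages a value with its predecessor; the first
+ ... # position has no complete window, hence the leading null/NaN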
return s.rolling_mean(window_size=2).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_mean`: >>> agnostic_rolling_mean(s_pd) 0 NaN @@ -3778,11 +4520,12 @@ def rolling_var( A new series. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1.0, 3.0, 1.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) @@ -3794,7 +4537,8 @@ def rolling_var( ... s = nw.from_native(s_native, series_only=True) ... return s.rolling_var(window_size=2, min_periods=1).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_var`: >>> agnostic_rolling_var(s_pd) 0 NaN @@ -3871,11 +4615,12 @@ def rolling_std( A new series. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> data = [1.0, 3.0, 1.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) @@ -3887,7 +4632,8 @@ def rolling_std( ... s = nw.from_native(s_native, series_only=True) ... return s.rolling_std(window_size=2, min_periods=1).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_std`: >>> agnostic_rolling_std(s_pd) 0 NaN @@ -3971,32 +4717,46 @@ def get_categories(self: Self) -> SeriesT: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["apple", "mango", "mango"] >>> s_pd = pd.Series(data, dtype="category") >>> s_pl = pl.Series(data, dtype=pl.Categorical) + >>> s_pa = pa.chunked_array([data]).dictionary_encode() We define a dataframe-agnostic function to get unique categories from column 'fruits': - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_get_categories(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
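+ ... # returns the distinct categories only, one row per category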
return s.cat.get_categories().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_get_categories`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_get_categories(s_pd) 0 apple 1 mango dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_get_categories(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [str] [ "apple" "mango" ] + + >>> agnostic_get_categories(s_pa) # doctest: +ELLIPSIS + + [ + [ + "apple", + "mango" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.cat.get_categories() @@ -4016,21 +4776,25 @@ def len_chars(self: Self) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["foo", "Café", "345", "東京", None] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_len_chars(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.str.len_chars().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_len_chars`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_len_chars(s_pd) 0 3.0 1 4.0 2 3.0 @@ -4038,7 +4802,7 @@ def len_chars(self: Self) -> SeriesT: 4 NaN dtype: float64 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_len_chars(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: '' [u32] [ @@ -4048,6 +4812,18 @@ def len_chars(self: Self) -> SeriesT: 2 null ] + + >>> agnostic_len_chars(s_pa) # doctest: +ELLIPSIS + + [ + [ + 3, + 4, + 3, + 2, + null + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.len_chars() @@ -4070,33 +4846,46 @@ def replace( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["123abc", "abc abc123"] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_replace(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... s = s.str.replace("abc", "") ... 
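+ ... # only the first "abc" in each string was replaced; replace_all
+ ... # (below) removes every occurrence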
return s.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_replace`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_replace(s_pd) 0 123 1 abc123 dtype: object - >>> my_library_agnostic_function(s_pl) # doctest:+NORMALIZE_WHITESPACE + >>> agnostic_replace(s_pl) # doctest:+NORMALIZE_WHITESPACE shape: (2,) Series: '' [str] [ "123" " abc123" ] + + >>> agnostic_replace(s_pa) # doctest: +ELLIPSIS + + [ + [ + "123", + " abc123" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.replace( @@ -4120,33 +4909,46 @@ def replace_all( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["123abc", "abc abc123"] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_replace_all(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... s = s.str.replace_all("abc", "") ... return s.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_replace_all`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_replace_all(s_pd) 0 123 1 123 dtype: object - >>> my_library_agnostic_function(s_pl) # doctest:+NORMALIZE_WHITESPACE + >>> agnostic_replace_all(s_pl) # doctest:+NORMALIZE_WHITESPACE shape: (2,) Series: '' [str] [ "123" " 123" ] + + >>> agnostic_replace_all(s_pa) # doctest: +ELLIPSIS + + [ + [ + "123", + " 123" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.replace_all( @@ -4166,33 +4968,46 @@ def strip_chars(self: Self, characters: str | None = None) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["apple", "\nmango"] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_strip_chars(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... s = s.str.strip_chars() ... 
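+ ... # with no argument, strip_chars trims leading and trailing
+ ... # whitespace, including the leading newline in "\nmango"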
return s.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_strip_chars`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_strip_chars(s_pd) 0 apple 1 mango dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_strip_chars(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [str] [ "apple" "mango" ] + + >>> agnostic_strip_chars(s_pa) # doctest: +ELLIPSIS + + [ + [ + "apple", + "mango" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.strip_chars(characters) @@ -4210,27 +5025,31 @@ def starts_with(self: Self, prefix: str) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["apple", "mango", None] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_starts_with(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.str.starts_with("app").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_starts_with`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_starts_with(s_pd) 0 True 1 False 2 None dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_starts_with(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] [ @@ -4238,6 +5057,16 @@ def starts_with(self: Self, prefix: str) -> SeriesT: false null ] + + >>> agnostic_starts_with(s_pa) # doctest: +ELLIPSIS + + [ + [ + true, + false, + null + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.starts_with(prefix) @@ -4255,27 +5084,31 @@ def ends_with(self: Self, suffix: str) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["apple", "mango", None] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_ends_with(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
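+ ... # null values propagate: the None entry stays null rather than
+ ... # becoming False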
return s.str.ends_with("ngo").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_ends_with`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_ends_with(s_pd) 0 False 1 True 2 None dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_ends_with(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3,) Series: '' [bool] [ @@ -4283,6 +5116,16 @@ def ends_with(self: Self, suffix: str) -> SeriesT: true null ] + + >>> agnostic_ends_with(s_pa) # doctest: +ELLIPSIS + + [ + [ + false, + true, + null + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.ends_with(suffix) @@ -4305,10 +5148,11 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> SeriesT: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> pets = ["cat", "dog", "rabbit and parrot", "dove", None] - >>> s_pd = pd.Series(pets) - >>> s_pl = pl.Series(pets) - >>> s_pa = pa.chunked_array([pets]) + + >>> data = ["cat", "dog", "rabbit and parrot", "dove", None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: @@ -4367,28 +5211,32 @@ def slice(self: Self, offset: int, length: int | None = None) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["pear", None, "papaya", "dragonfruit"] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_slice(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.str.slice(4, length=3).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_slice`: - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_slice(s_pd) # doctest: +NORMALIZE_WHITESPACE 0 1 None 2 ya 3 onf dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_slice(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [str] [ @@ -4398,20 +5246,31 @@ def slice(self: Self, offset: int, length: int | None = None) -> SeriesT: "onf" ] + >>> agnostic_slice(s_pa) # doctest: +ELLIPSIS + + [ + [ + "", + null, + "ya", + "onf" + ] + ] + Using negative indexes: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_slice(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
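+ ... # a negative offset counts back from the end of each string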
return s.str.slice(-3).to_native() - >>> my_library_agnostic_function(s_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_slice(s_pd) # doctest: +NORMALIZE_WHITESPACE 0 ear 1 None 2 aya 3 uit dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_slice(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [str] [ @@ -4420,6 +5279,17 @@ def slice(self: Self, offset: int, length: int | None = None) -> SeriesT: "aya" "uit" ] + + >>> agnostic_slice(s_pa) # doctest: +ELLIPSIS + + [ + [ + "ear", + null, + "aya", + "uit" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.slice( @@ -4444,27 +5314,32 @@ def head(self: Self, n: int = 5) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> lyrics = ["Atatata", "taata", "taatatata", "zukkyun"] - >>> s_pd = pd.Series(lyrics) - >>> s_pl = pl.Series(lyrics) + + >>> data = ["Atatata", "taata", "taatatata", "zukkyun"] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_head(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.str.head().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_head`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_head(s_pd) 0 Atata 1 taata 2 taata 3 zukky dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_head(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [str] [ @@ -4473,6 +5348,17 @@ def head(self: Self, n: int = 5) -> SeriesT: "taata" "zukky" ] + + >>> agnostic_head(s_pa) # doctest: +ELLIPSIS + + [ + [ + "Atata", + "taata", + "taata", + "zukky" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.slice(offset=0, length=n) @@ -4495,27 +5381,32 @@ def tail(self: Self, n: int = 5) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT - >>> lyrics = ["Atatata", "taata", "taatatata", "zukkyun"] - >>> s_pd = pd.Series(lyrics) - >>> s_pl = pl.Series(lyrics) + + >>> data = ["Atatata", "taata", "taatatata", "zukkyun"] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_tail(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
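+ ... # n defaults to 5, so this keeps each string's last five characters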
return s.str.tail().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_tail`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_tail(s_pd) 0 atata 1 taata 2 atata 3 kkyun dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_tail(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [str] [ @@ -4524,6 +5415,17 @@ def tail(self: Self, n: int = 5) -> SeriesT: "atata" "kkyun" ] + + >>> agnostic_tail(s_pa) # doctest: +ELLIPSIS + + [ + [ + "atata", + "taata", + "atata", + "kkyun" + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.slice(offset=-n, length=None) @@ -4543,40 +5445,48 @@ def to_uppercase(self) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT - >>> data = {"fruits": ["apple", "mango", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) + >>> from narwhals.typing import IntoSeriesT + + >>> data = ["apple", "mango", None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... upper_col=nw.col("fruits").str.to_uppercase() - ... ).to_native() + >>> def agnostic_to_uppercase(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.str.to_uppercase().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_uppercase`: - >>> my_library_agnostic_function(df_pd) # doctest: +NORMALIZE_WHITESPACE - fruits upper_col - 0 apple APPLE - 1 mango MANGO - 2 None None + >>> agnostic_to_uppercase(s_pd) + 0 APPLE + 1 MANGO + 2 None + dtype: object - >>> my_library_agnostic_function(df_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3, 2) - ┌────────┬───────────┐ - │ fruits ┆ upper_col │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════╪═══════════╡ - │ apple ┆ APPLE │ - │ mango ┆ MANGO │ - │ null ┆ null │ - └────────┴───────────┘ + >>> agnostic_to_uppercase(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [str] + [ + "APPLE" + "MANGO" + null + ] + >>> agnostic_to_uppercase(s_pa) # doctest: +ELLIPSIS + + [ + [ + "APPLE", + "MANGO", + null + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.to_uppercase() @@ -4591,40 +5501,48 @@ def to_lowercase(self) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw - >>> from narwhals.typing import IntoSeriesT, IntoFrameT - >>> data = {"fruits": ["APPLE", "MANGO", None]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) + >>> from narwhals.typing import IntoSeriesT + + >>> data = ["APPLE", "MANGO", None] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: - ... df = nw.from_native(df_native) - ... return df.with_columns( - ... lower_col=nw.col("fruits").str.to_lowercase() - ... 
).to_native() + >>> def agnostic_to_lowercase(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.str.to_lowercase().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_lowercase`: - >>> my_library_agnostic_function(df_pd) # doctest: +NORMALIZE_WHITESPACE - fruits lower_col - 0 APPLE apple - 1 MANGO mango - 2 None None + >>> agnostic_to_lowercase(s_pd) + 0 apple + 1 mango + 2 None + dtype: object + >>> agnostic_to_lowercase(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [str] + [ + "apple" + "mango" + null + ] - >>> my_library_agnostic_function(df_pl) # doctest: +NORMALIZE_WHITESPACE - shape: (3, 2) - ┌────────┬───────────┐ - │ fruits ┆ lower_col │ - │ --- ┆ --- │ - │ str ┆ str │ - ╞════════╪═══════════╡ - │ APPLE ┆ apple │ - │ MANGO ┆ mango │ - │ null ┆ null │ - └────────┴───────────┘ + >>> agnostic_to_lowercase(s_pa) # doctest: +ELLIPSIS + + [ + [ + "apple", + "mango", + null + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.to_lowercase() @@ -4656,6 +5574,7 @@ def to_datetime(self: Self, format: str | None = None) -> SeriesT: # noqa: A002 >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = ["2020-01-01", "2020-01-02"] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) @@ -4663,24 +5582,27 @@ def to_datetime(self: Self, format: str | None = None) -> SeriesT: # noqa: A002 We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_to_datetime(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.str.to_datetime(format="%Y-%m-%d").to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow:: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_datetime`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_to_datetime(s_pd) 0 2020-01-01 1 2020-01-02 dtype: datetime64[ns] - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_to_datetime(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [datetime[μs]] [ 2020-01-01 00:00:00 2020-01-02 00:00:00 ] - >>> my_library_agnostic_function(s_pa) # doctest: +ELLIPSIS + + >>> agnostic_to_datetime(s_pa) # doctest: +ELLIPSIS [ [ @@ -4708,35 +5630,48 @@ def date(self: Self) -> SeriesT: NotImplementedError: If pandas default backend is being used. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)] >>> s_pd = pd.Series(dates).convert_dtypes(dtype_backend="pyarrow") >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_date(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
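+ ... # note the pyarrow-backed pandas Series above: with the default
+ ... # NumPy backend, dt.date() raises NotImplementedError (see Raises)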
return s.dt.date().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_date`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_date(s_pd) 0 2012-01-07 1 2023-03-10 dtype: date32[day][pyarrow] - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_date(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [date] [ 2012-01-07 2023-03-10 ] + + >>> agnostic_date(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2012-01-07, + 2023-03-10 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.date() @@ -4749,34 +5684,48 @@ def year(self: Self) -> SeriesT: A new Series containing the year component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2012, 1, 7), datetime(2023, 3, 10)] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_year(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.year().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_year`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_year(s_pd) 0 2012 1 2023 dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_year(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i32] [ 2012 2023 ] + + >>> agnostic_year(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2012, + 2023 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.year() @@ -4789,34 +5738,47 @@ def month(self: Self) -> SeriesT: A new Series containing the month component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2023, 2, 1), datetime(2023, 8, 3)] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_month(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.month().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_month`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_month(s_pd) 0 2 1 8 dtype: int... 
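+
+ The `int...` in the dtype above is a doctest ellipsis: the integer
+ width of the result differs across pandas versions.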
- >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_month(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i8] [ 2 8 ] + + >>> agnostic_month(s_pa) # doctest: +ELLIPSIS + + [ + [ + 2, + 8 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.month() @@ -4829,34 +5791,48 @@ def day(self: Self) -> SeriesT: A new Series containing the day component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2022, 1, 1), datetime(2022, 1, 5)] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_day(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.day().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_day`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_day(s_pd) 0 1 1 5 dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_day(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i8] [ 1 5 ] + + >>> agnostic_day(s_pa) # doctest: +ELLIPSIS + + [ + [ + 1, + 5 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.day() @@ -4869,34 +5845,48 @@ def hour(self: Self) -> SeriesT: A new Series containing the hour component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_hour(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.hour().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_hour`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_hour(s_pd) 0 5 1 9 dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_hour(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i8] [ 5 9 ] + + >>> agnostic_hour(s_pa) # doctest: +ELLIPSIS + + [ + [ + 5, + 9 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.hour() @@ -4909,34 +5899,48 @@ def minute(self: Self) -> SeriesT: A new Series containing the minute component of each datetime value. 
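+
+ Notes:
+ This is the wall-clock minute component (0-59) of each datetime, not
+ a duration; for timedeltas, see `dt.total_minutes` further below.
+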
Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2022, 1, 1, 5, 3), datetime(2022, 1, 5, 9, 12)] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_minute(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.minute().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_minute`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_minute(s_pd) 0 3 1 12 dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_minute(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i8] [ 3 12 ] + + >>> agnostic_minute(s_pa) # doctest: +ELLIPSIS + + [ + [ + 3, + 12 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.minute() @@ -4949,34 +5953,48 @@ def second(self: Self) -> SeriesT: A new Series containing the second component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [datetime(2022, 1, 1, 5, 3, 10), datetime(2022, 1, 5, 9, 12, 4)] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_second(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.second().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_second`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_second(s_pd) 0 10 1 4 dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_second(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i8] [ 10 4 ] + + >>> agnostic_second(s_pa) # doctest: +ELLIPSIS + + [ + [ + 10, + 4 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.second() @@ -4989,11 +6007,13 @@ def millisecond(self: Self) -> SeriesT: A new Series containing the millisecond component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [ ... datetime(2023, 5, 21, 12, 55, 10, 400000), ... datetime(2023, 5, 21, 12, 55, 10, 600000), @@ -5001,26 +6021,28 @@ def millisecond(self: Self) -> SeriesT: ... datetime(2023, 5, 21, 12, 55, 11, 0), ... datetime(2023, 5, 21, 12, 55, 11, 200000), ... 
] - >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_millisecond(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.millisecond().alias("datetime").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_millisecond`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_millisecond(s_pd) 0 400 1 600 2 800 3 0 4 200 Name: datetime, dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_millisecond(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: 'datetime' [i32] [ @@ -5030,6 +6052,18 @@ def millisecond(self: Self) -> SeriesT: 0 200 ] + + >>> agnostic_millisecond(s_pa) # doctest: +ELLIPSIS + + [ + [ + 400, + 600, + 800, + 0, + 200 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.millisecond() @@ -5042,11 +6076,13 @@ def microsecond(self: Self) -> SeriesT: A new Series containing the microsecond component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [ ... datetime(2023, 5, 21, 12, 55, 10, 400000), ... datetime(2023, 5, 21, 12, 55, 10, 600000), @@ -5054,26 +6090,28 @@ def microsecond(self: Self) -> SeriesT: ... datetime(2023, 5, 21, 12, 55, 11, 0), ... datetime(2023, 5, 21, 12, 55, 11, 200000), ... ] - >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_microsecond(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.microsecond().alias("datetime").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_microsecond`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_microsecond(s_pd) 0 400000 1 600000 2 800000 3 0 4 200000 Name: datetime, dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_microsecond(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: 'datetime' [i32] [ @@ -5083,6 +6121,18 @@ def microsecond(self: Self) -> SeriesT: 0 200000 ] + + >>> agnostic_microsecond(s_pa) # doctest: +ELLIPSIS + + [ + [ + 400000, + 600000, + 800000, + 0, + 200000 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.microsecond() @@ -5095,37 +6145,51 @@ def nanosecond(self: Self) -> SeriesT: A new Series containing the nanosecond component of each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> dates = [ ... datetime(2022, 1, 1, 5, 3, 10, 500000), ... datetime(2022, 1, 5, 9, 12, 4, 60000), ... 
] >>> s_pd = pd.Series(dates) >>> s_pl = pl.Series(dates) + >>> s_pa = pa.chunked_array([dates]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_nanosecond(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.nanosecond().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_nanosecond`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_nanosecond(s_pd) 0 500000000 1 60000000 dtype: int... - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_nanosecond(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i32] [ 500000000 60000000 ] + + >>> agnostic_nanosecond(s_pa) # doctest: +ELLIPSIS + + [ + [ + 500000000, + 60000000 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.nanosecond() @@ -5138,34 +6202,49 @@ def ordinal_day(self: Self) -> SeriesT: A new Series containing the ordinal day (day of year) for each datetime value. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = [datetime(2020, 1, 1), datetime(2020, 8, 3)] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_ordinal_day(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.dt.ordinal_day().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_ordinal_day`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_ordinal_day(s_pd) 0 1 1 216 dtype: int32 - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + + >>> agnostic_ordinal_day(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (2,) Series: '' [i16] [ 1 216 ] + + + >>> agnostic_ordinal_day(s_pa) # doctest: +ELLIPSIS + + [ + [ + 1, + 216 + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.dt.ordinal_day() @@ -5183,34 +6262,48 @@ def total_minutes(self: Self) -> SeriesT: A new Series containing the total number of minutes for each timedelta value. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT + >>> data = [timedelta(minutes=10), timedelta(minutes=20, seconds=40)] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) We define a library agnostic function: - >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_total_minutes(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... 
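+ ... # whole minutes only: the extra 40 seconds in the second value
+ ... # are truncated away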
             ...     return s.dt.total_minutes().to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_minutes`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_total_minutes(s_pd)
             0    10
             1    20
             dtype: int...
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_total_minutes(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [i64]
             [
                10
                20
             ]
+
+            >>> agnostic_total_minutes(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                20
+              ]
+            ]
         """
         return self._narwhals_series._from_compliant_series(
             self._narwhals_series._compliant_series.dt.total_minutes()
         )
@@ -5228,34 +6321,48 @@ def total_seconds(self: Self) -> SeriesT:
             A new Series containing the total number of seconds for each timedelta value.

         Examples:
+            >>> from datetime import timedelta
             >>> import pandas as pd
             >>> import polars as pl
-            >>> from datetime import timedelta
+            >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])

             We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_total_seconds(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.total_seconds().to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_seconds`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_total_seconds(s_pd)
             0    10
             1    20
             dtype: int...
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_total_seconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [i64]
             [
                10
                20
             ]
+
+            >>> agnostic_total_seconds(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                20
+              ]
+            ]
         """
         return self._narwhals_series._from_compliant_series(
             self._narwhals_series._compliant_series.dt.total_seconds()
         )
@@ -5273,37 +6380,51 @@ def total_milliseconds(self: Self) -> SeriesT:
             A new Series containing the total number of milliseconds for each timedelta value.

         Examples:
+            >>> from datetime import timedelta
             >>> import pandas as pd
             >>> import polars as pl
-            >>> from datetime import timedelta
+            >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [
             ...     timedelta(milliseconds=10),
             ...     timedelta(milliseconds=20, microseconds=40),
             ... ]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])

             We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_total_milliseconds(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.total_milliseconds().to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_milliseconds`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_total_milliseconds(s_pd)
             0    10
             1    20
             dtype: int...
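# --- Editor's aside (illustrative sketch, not part of the patch) -----------
# The total_minutes/total_seconds examples above all print whole numbers.
# A minimal sketch, assuming narwhals and pandas are installed, of the
# truncation that the dt.total_* accessors perform on the fractional part:
from datetime import timedelta

import pandas as pd
import narwhals as nw

s = nw.from_native(pd.Series([timedelta(minutes=20, seconds=40)]), series_only=True)
print(s.dt.total_minutes().to_native())  # value 20: whole minutes only, the 40s are dropped
print(s.dt.total_seconds().to_native())  # value 1240: 20 * 60 + 40 whole seconds
# ----------------------------------------------------------------------------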
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_total_milliseconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [i64]
             [
                10
                20
             ]
+
+            >>> agnostic_total_milliseconds(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                20
+              ]
+            ]
         """
         return self._narwhals_series._from_compliant_series(
             self._narwhals_series._compliant_series.dt.total_milliseconds()
         )
@@ -5321,36 +6442,50 @@ def total_microseconds(self: Self) -> SeriesT:
             consider using `fill_null()` in this case.

         Examples:
+            >>> from datetime import timedelta
             >>> import pandas as pd
             >>> import polars as pl
-            >>> from datetime import timedelta
+            >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [
             ...     timedelta(microseconds=10),
             ...     timedelta(milliseconds=1, microseconds=200),
             ... ]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])

             We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_total_microseconds(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.total_microseconds().to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_microseconds`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_total_microseconds(s_pd)
             0      10
             1    1200
             dtype: int...
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_total_microseconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [i64]
             [
-                10
-                1200
+               10
+               1200
+            ]
+
+            >>> agnostic_total_microseconds(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                10,
+                1200
+              ]
             ]
         """
         return self._narwhals_series._from_compliant_series(
@@ -5369,28 +6504,32 @@ def total_nanoseconds(self: Self) -> SeriesT:
             A new Series containing the total number of nanoseconds for each timedelta value.

         Examples:
+            >>> from datetime import datetime
             >>> import pandas as pd
             >>> import polars as pl
-            >>> from datetime import timedelta
+            >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
+
             >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"]
             >>> s_pd = pd.to_datetime(pd.Series(data))
             >>> s_pl = pl.Series(data).str.to_datetime(time_unit="ns")

             We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_total_nanoseconds(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.diff().dt.total_nanoseconds().to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_total_nanoseconds`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_total_nanoseconds(s_pd)
             0    NaN
             1    1.0
             dtype: float64
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_total_nanoseconds(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [i64]
             [
@@ -5446,8 +6585,10 @@ def to_string(self: Self, format: str) -> SeriesT:  # noqa: A002
             >>> from datetime import datetime
             >>> import pandas as pd
             >>> import polars as pl
+            >>> import pyarrow as pa
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [
             ...     datetime(2020, 3, 1),
             ...     datetime(2020, 4, 1),
@@ -5455,22 +6596,24 @@ def to_string(self: Self, format: str) -> SeriesT:  # noqa: A002
             ... ]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])

             We define a dataframe-agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_to_string(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.to_string("%Y/%m/%d").to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_to_string`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_to_string(s_pd)
             0    2020/03/01
             1    2020/04/01
             2    2020/05/01
             dtype: object

-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_to_string(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: '' [str]
             [
@@ -5478,6 +6621,16 @@ def to_string(self: Self, format: str) -> SeriesT:  # noqa: A002
                "2020/04/01"
                "2020/05/01"
             ]
+
+            >>> agnostic_to_string(s_pa)  # doctest: +ELLIPSIS
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+              [
+                "2020/03/01",
+                "2020/04/01",
+                "2020/05/01"
+              ]
+            ]
         """
         return self._narwhals_series._from_compliant_series(
             self._narwhals_series._compliant_series.dt.to_string(format)
         )
@@ -5494,11 +6647,12 @@ def replace_time_zone(self: Self, time_zone: str | None) -> SeriesT:
         Examples:
             >>> from datetime import datetime, timezone
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [
             ...     datetime(2024, 1, 1, tzinfo=timezone.utc),
             ...     datetime(2024, 1, 2, tzinfo=timezone.utc),
             ... ]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
             >>> s_pa = pa.chunked_array([data])
@@ -5509,24 +6663,27 @@ def replace_time_zone(self: Self, time_zone: str | None) -> SeriesT:
             Let's define a dataframe-agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_replace_time_zone(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.replace_time_zone("Asia/Kathmandu").to_native()

-            We can then pass pandas / PyArrow / Polars / any other supported library:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_replace_time_zone`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_replace_time_zone(s_pd)
             0   2024-01-01 00:00:00+05:45
             1   2024-01-02 00:00:00+05:45
             dtype: datetime64[ns, Asia/Kathmandu]
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_replace_time_zone(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [datetime[μs, Asia/Kathmandu]]
             [
                2024-01-01 00:00:00 +0545
                2024-01-02 00:00:00 +0545
             ]
-            >>> my_library_agnostic_function(s_pa)
+
+            >>> agnostic_replace_time_zone(s_pa)
             <pyarrow.lib.ChunkedArray object at ...>
             [
               [
@@ -5553,11 +6710,12 @@ def convert_time_zone(self: Self, time_zone: str) -> SeriesT:
         Examples:
             >>> from datetime import datetime, timezone
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [
             ...     datetime(2024, 1, 1, tzinfo=timezone.utc),
             ...     datetime(2024, 1, 2, tzinfo=timezone.utc),
@@ -5568,24 +6726,27 @@ def convert_time_zone(self: Self, time_zone: str) -> SeriesT:
             Let's define a dataframe-agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_convert_time_zone(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.convert_time_zone("Asia/Kathmandu").to_native()

-            We can then pass pandas / PyArrow / Polars / any other supported library:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_convert_time_zone`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_convert_time_zone(s_pd)
             0   2024-01-01 05:45:00+05:45
             1   2024-01-02 05:45:00+05:45
             dtype: datetime64[ns, Asia/Kathmandu]
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_convert_time_zone(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (2,)
             Series: '' [datetime[μs, Asia/Kathmandu]]
             [
                2024-01-01 05:45:00 +0545
                2024-01-02 05:45:00 +0545
             ]
-            >>> my_library_agnostic_function(s_pa)
+
+            >>> agnostic_convert_time_zone(s_pa)
             <pyarrow.lib.ChunkedArray object at ...>
             [
               [
@@ -5613,11 +6774,12 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series
         Examples:
             >>> from datetime import date
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [date(2001, 1, 1), None, date(2001, 1, 3)]
             >>> s_pd = pd.Series(data, dtype="datetime64[ns]")
             >>> s_pl = pl.Series(data)
@@ -5625,18 +6787,20 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series
             Let's define a dataframe-agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_timestamp(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.dt.timestamp("ms").to_native()

-            We can then pass pandas / PyArrow / Polars / any other supported library:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_timestamp`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_timestamp(s_pd)
             0    9.783072e+11
             1             NaN
             2    9.784800e+11
             dtype: float64
-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+
+            >>> agnostic_timestamp(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: '' [i64]
             [
@@ -5644,7 +6808,8 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> Series
                null
                978480000000
             ]
-            >>> my_library_agnostic_function(s_pa)
+
+            >>> agnostic_timestamp(s_pa)
             <pyarrow.lib.ChunkedArray object at ...>
             [
               [
@@ -5678,11 +6843,12 @@ def len(self: Self) -> SeriesT:
             A new series.
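# --- Editor's aside (illustrative sketch, not part of the patch) -----------
# The two time-zone methods documented above look alike but mean different
# things, which the docstring outputs show only implicitly. A small sketch,
# assuming narwhals and pandas are installed, matching those outputs:
from datetime import datetime, timezone

import pandas as pd
import narwhals as nw

s = nw.from_native(
    pd.Series([datetime(2024, 1, 1, tzinfo=timezone.utc)]), series_only=True
)
# replace_time_zone keeps the wall-clock time and swaps the zone label,
# so it denotes a *different instant*: 2024-01-01 00:00:00+05:45
print(s.dt.replace_time_zone("Asia/Kathmandu").to_native())
# convert_time_zone keeps the instant and recomputes the wall-clock time:
# 2024-01-01 05:45:00+05:45
print(s.dt.convert_time_zone("Asia/Kathmandu").to_native())
# ----------------------------------------------------------------------------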
         Examples:
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [[1, 2], [3, 4, None], None, []]

             Let's define a dataframe-agnostic function:
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py
index 22afc687d..8bf4a4b1e 100644
--- a/narwhals/stable/v1/__init__.py
+++ b/narwhals/stable/v1/__init__.py
@@ -501,27 +501,32 @@ def to_frame(self) -> DataFrame[Any]:
         Examples:
             >>> import pandas as pd
             >>> import polars as pl
+            >>> import pyarrow as pa
             >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeries, IntoDataFrame
-            >>> s = [1, 2, 3]
-            >>> s_pd = pd.Series(s, name="a")
-            >>> s_pl = pl.Series("a", s)
+            >>> from narwhals.typing import IntoDataFrame
+            >>> from narwhals.typing import IntoSeries
+
+            >>> data = [1, 2]
+            >>> s_pd = pd.Series(data, name="a")
+            >>> s_pl = pl.Series("a", data)
+            >>> s_pa = pa.chunked_array([data])

             We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeries) -> IntoDataFrame:
+            >>> def agnostic_to_frame(s_native: IntoSeries) -> IntoDataFrame:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.to_frame().to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_to_frame`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_to_frame(s_pd)
                a
             0  1
             1  2
-            2  3
-            >>> my_library_agnostic_function(s_pl)
-            shape: (3, 1)
+
+            >>> agnostic_to_frame(s_pl)
+            shape: (2, 1)
             ┌─────┐
             │ a   │
             │ --- │
@@ -529,8 +534,13 @@ def to_frame(self) -> DataFrame[Any]:
             ╞═════╡
             │ 1   │
             │ 2   │
-            │ 3   │
             └─────┘
+
+            >>> agnostic_to_frame(s_pa)
+            pyarrow.Table
+            : int64
+            ----
+            : [[1,2]]
         """
         return super().to_frame()  # type: ignore[return-value]

@@ -558,28 +568,34 @@ def value_counts(
             - Either count or proportion as second column, depending on normalize parameter.

         Examples:
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeries, IntoDataFrame
             >>> import pandas as pd
             >>> import polars as pl
-            >>> s_pd = pd.Series([1, 1, 2, 3, 2], name="s")
-            >>> s_pl = pl.Series(values=[1, 1, 2, 3, 2], name="s")
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoDataFrame
+            >>> from narwhals.typing import IntoSeries
+
+            >>> data = [1, 1, 2, 3, 2]
+            >>> s_pd = pd.Series(data, name="s")
+            >>> s_pl = pl.Series(values=data, name="s")
+            >>> s_pa = pa.chunked_array([data])

             Let's define a dataframe-agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeries) -> IntoDataFrame:
+            >>> def agnostic_value_counts(s_native: IntoSeries) -> IntoDataFrame:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.value_counts(sort=True).to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_value_counts`:

-            >>> my_library_agnostic_function(s_pd)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_value_counts(s_pd)
                s  count
             0  1      2
             1  2      2
             2  3      1

-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_value_counts(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3, 2)
             ┌─────┬───────┐
             │ s   ┆ count │
             │ --- ┆ ---   │
@@ -590,6 +606,14 @@ def value_counts(
             │ 2   ┆ 2     │
             │ 3   ┆ 1     │
             └─────┴───────┘
+
+            >>> agnostic_value_counts(s_pa)
+            pyarrow.Table
+            : int64
+            count: int64
+            ----
+            : [[1,2,3]]
+            count: [[2,2,1]]
         """
         return super().value_counts(  # type: ignore[return-value]
             sort=sort, parallel=parallel, name=name, normalize=normalize
         )
@@ -652,25 +676,27 @@ def ewm_mean(
             >>> import polars as pl
             >>> import narwhals as nw
             >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [1, 2, 3]
             >>> s_pd = pd.Series(name="a", data=data)
             >>> s_pl = pl.Series(name="a", values=data)

             We define a library agnostic function:

-            >>> def my_library_agnostic_function(s_native: IntoSeriesT) -> IntoSeriesT:
+            >>> def agnostic_ewm_mean(s_native: IntoSeriesT) -> IntoSeriesT:
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.ewm_mean(com=1, ignore_nulls=False).to_native()

-            We can then pass either pandas or Polars to `func`:
+            We can then pass any supported library such as pandas or Polars
+            to `agnostic_ewm_mean`:

-            >>> my_library_agnostic_function(s_pd)
+            >>> agnostic_ewm_mean(s_pd)
             0    1.000000
             1    1.666667
             2    2.428571
             Name: a, dtype: float64

-            >>> my_library_agnostic_function(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            >>> agnostic_ewm_mean(s_pl)  # doctest: +NORMALIZE_WHITESPACE
             shape: (3,)
             Series: 'a' [f64]
             [
@@ -729,11 +755,12 @@ def rolling_sum(
             A new series.

         Examples:
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [1.0, 2.0, 3.0, 4.0]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
@@ -745,7 +772,8 @@ def rolling_sum(
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.rolling_sum(window_size=2).to_native()

-            We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_rolling_sum`:

             >>> agnostic_rolling_sum(s_pd)
             0    NaN
@@ -821,11 +849,12 @@ def rolling_mean(
             A new series.

         Examples:
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [1.0, 2.0, 3.0, 4.0]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
@@ -837,7 +866,8 @@ def rolling_mean(
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.rolling_mean(window_size=2).to_native()

-            We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_rolling_mean`:

             >>> agnostic_rolling_mean(s_pd)
             0    NaN
@@ -915,11 +945,12 @@ def rolling_var(
             A new series.
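# --- Editor's aside (worked check, not part of the patch) -------------------
# A hand verification of the ewm_mean output shown above. With com=1 the
# smoothing factor is alpha = 1 / (1 + com) = 0.5, and with pandas/Polars'
# default adjust=True each result is a normalized weighted sum of all
# observations seen so far:
alpha = 0.5
num, den, out = 0.0, 0.0, []
for x in [1, 2, 3]:
    num = (1 - alpha) * num + x    # decay the old weighted sum, add the new value
    den = (1 - alpha) * den + 1.0  # decay the old total weight, add weight 1
    out.append(num / den)
print(out)  # [1.0, 1.666..., 2.428...] -- matching the docstring values
# ----------------------------------------------------------------------------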
         Examples:
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [1.0, 3.0, 1.0, 4.0]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
@@ -931,7 +962,8 @@ def rolling_var(
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.rolling_var(window_size=2, min_periods=1).to_native()

-            We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_rolling_var`:

             >>> agnostic_rolling_var(s_pd)
             0    NaN
@@ -1010,11 +1042,12 @@ def rolling_std(
             A new series.

         Examples:
-            >>> import narwhals as nw
-            >>> from narwhals.typing import IntoSeriesT
             >>> import pandas as pd
             >>> import polars as pl
             >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
             >>> data = [1.0, 3.0, 1.0, 4.0]
             >>> s_pd = pd.Series(data)
             >>> s_pl = pl.Series(data)
@@ -1026,7 +1059,8 @@ def rolling_std(
             ...     s = nw.from_native(s_native, series_only=True)
             ...     return s.rolling_std(window_size=2, min_periods=1).to_native()

-            We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`:
+            We can then pass any supported library such as pandas, Polars, or
+            PyArrow to `agnostic_rolling_std`:

             >>> agnostic_rolling_std(s_pd)
             0    NaN
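# --- Editor's aside (illustrative sketch, not part of the patch) -----------
# rolling_std relates to rolling_var exactly as the two docstrings above
# suggest: it is the square root of the rolling sample variance (ddof=1 by
# default). A small sketch, assuming narwhals and pandas are installed:
import pandas as pd
import narwhals as nw

s = nw.from_native(pd.Series([1.0, 3.0, 1.0, 4.0]), series_only=True)
print(s.rolling_var(window_size=2, min_periods=1).to_native())
# e.g. the last window [1.0, 4.0] has mean 2.5 and sample variance
# ((1 - 2.5)**2 + (4 - 2.5)**2) / 1 = 4.5 ...
print(s.rolling_std(window_size=2, min_periods=1).to_native())
# ... whose square root, 4.5 ** 0.5 (about 2.1213), is the rolling std
# ----------------------------------------------------------------------------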