From 1f3cc418ce848807f3882cb7e206fd47ecd4d917 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 5 Jan 2025 08:58:39 +0000 Subject: [PATCH 01/35] chore: simplify definitions of rhs expressions, bump dask minimum to 2024.10 (#1720) --- .github/workflows/extremes.yml | 8 +-- .github/workflows/pytest.yml | 4 +- narwhals/_arrow/expr.py | 36 ------------ narwhals/_dask/expr.py | 72 ----------------------- narwhals/_dask/namespace.py | 35 ++++++----- narwhals/_dask/utils.py | 3 +- narwhals/_pandas_like/expr.py | 37 ------------ narwhals/_polars/expr.py | 12 ---- narwhals/_spark_like/dataframe.py | 2 +- narwhals/_spark_like/expr.py | 2 +- narwhals/dataframe.py | 12 ++-- narwhals/expr.py | 97 ++++++++++++++++++------------- narwhals/typing.py | 12 ++++ pyproject.toml | 2 +- tests/frame/lit_test.py | 3 - tests/selectors_test.py | 6 +- 16 files changed, 110 insertions(+), 233 deletions(-) diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 91563d840..9e7e997b2 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -84,7 +84,7 @@ jobs: not_so_old_versions: strategy: matrix: - python-version: ["3.9"] + python-version: ["3.10"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -99,7 +99,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-not-so-old-versions - run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system + run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system - name: install-reqs run: uv pip install -e ".[dev]" --system - name: show-deps @@ -110,11 +110,11 @@ jobs: echo "$DEPS" | grep 'pandas==2.0.3' echo "$DEPS" | grep 
'polars==0.20.8' echo "$DEPS" | grep 'numpy==1.24.4' - echo "$DEPS" | grep 'pyarrow==14.0.0' + echo "$DEPS" | grep 'pyarrow==15.0.0' echo "$DEPS" | grep 'pyspark==3.4.0' echo "$DEPS" | grep 'scipy==1.8.0' echo "$DEPS" | grep 'scikit-learn==1.3.0' - echo "$DEPS" | grep 'dask==2024.7' + echo "$DEPS" | grep 'dask==2024.10' - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 3f015d405..c89ab2cd7 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -59,7 +59,7 @@ jobs: pytest-full-coverage: strategy: matrix: - python-version: ["3.9", "3.11", "3.13"] + python-version: ["3.11", "3.13"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -78,7 +78,7 @@ jobs: - name: install pyspark run: uv pip install -e ".[pyspark]" --system # PySpark is not yet available on Python3.12+ - if: matrix.python-version == '3.9' || matrix.python-version == '3.11' + if: matrix.python-version != '3.13' - name: install ibis run: uv pip install -e ".[ibis]" --system # Ibis puts upper bounds on dependencies, and requires Python3.10+, diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 04c5c930e..e511f405d 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -163,66 +163,30 @@ def __lt__(self: Self, other: ArrowExpr | Any) -> Self: def __and__(self: Self, other: ArrowExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__and__", other=other) - def __rand__(self: Self, other: ArrowExpr | bool | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__and__(self) # type: ignore[return-value] - def __or__(self: Self, other: ArrowExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__or__", other=other) - def __ror__(self: Self, other: ArrowExpr | bool | Any) -> Self: - other = 
self.__narwhals_namespace__().lit(other, dtype=None) - return other.__or__(self) # type: ignore[return-value] - def __add__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__add__", other=other) - def __radd__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__add__(self) # type: ignore[return-value] - def __sub__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__sub__", other=other) - def __rsub__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__sub__(self) # type: ignore[return-value] - def __mul__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__mul__", other=other) - def __rmul__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mul__(self) # type: ignore[return-value] - def __pow__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__pow__", other=other) - def __rpow__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__pow__(self) # type: ignore[return-value] - def __floordiv__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__floordiv__", other=other) - def __rfloordiv__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__floordiv__(self) # type: ignore[return-value] - def __truediv__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__truediv__", other=other) - def __rtruediv__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__truediv__(self) # type: ignore[return-value] - def __mod__(self: 
Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__mod__", other=other) - def __rmod__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mod__(self) # type: ignore[return-value] - def __invert__(self: Self) -> Self: return reuse_series_implementation(self, "__invert__") diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 7f50dbf74..b3163fef6 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -210,14 +210,6 @@ def __add__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __radd__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__radd__(other), - "__radd__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __sub__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__sub__(other), @@ -226,14 +218,6 @@ def __sub__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rsub__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rsub__(other), - "__rsub__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __mul__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__mul__(other), @@ -242,14 +226,6 @@ def __mul__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rmul__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rmul__(other), - "__rmul__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __truediv__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__truediv__(other), @@ -258,14 +234,6 @@ def __truediv__(self, other: Any) -> Self: 
returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rtruediv__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rtruediv__(other), - "__rtruediv__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __floordiv__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__floordiv__(other), @@ -274,14 +242,6 @@ def __floordiv__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rfloordiv__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rfloordiv__(other), - "__rfloordiv__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __pow__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__pow__(other), @@ -290,14 +250,6 @@ def __pow__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rpow__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rpow__(other), - "__rpow__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __mod__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__mod__(other), @@ -306,14 +258,6 @@ def __mod__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rmod__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rmod__(other), - "__rmod__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __eq__(self, other: DaskExpr) -> Self: # type: ignore[override] return self._from_call( lambda _input, other: _input.__eq__(other), @@ -370,14 +314,6 @@ def __and__(self, other: DaskExpr) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def 
__rand__(self, other: DaskExpr) -> Self: - return self._from_call( - lambda _input, other: _input.__rand__(other), - "__rand__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __or__(self, other: DaskExpr) -> Self: return self._from_call( lambda _input, other: _input.__or__(other), @@ -386,14 +322,6 @@ def __or__(self, other: DaskExpr) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __ror__(self, other: DaskExpr) -> Self: - return self._from_call( - lambda _input, other: _input.__ror__(other), - "__ror__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __invert__(self: Self) -> Self: return self._from_call( lambda _input: _input.__invert__(), diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 126afaae6..d9a1a8ac6 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -66,26 +66,30 @@ def nth(self, *column_indices: int) -> DaskExpr: ) def lit(self, value: Any, dtype: DType | None) -> DaskExpr: - def convert_if_dtype( - series: dask_expr.Series, dtype: DType | type[DType] - ) -> dask_expr.Series: - return ( - series.astype(narwhals_to_native_dtype(dtype, self._version)) - if dtype - else series - ) + import dask.dataframe as dd + import pandas as pd - return DaskExpr( - lambda df: [ - df._native_frame.assign(literal=value)["literal"].pipe( - convert_if_dtype, dtype + def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + return [ + dd.from_pandas( + pd.Series( + [value], + dtype=narwhals_to_native_dtype(dtype, self._version) + if dtype is not None + else None, + name="literal", + ), + npartitions=df._native_frame.npartitions, ) - ], + ] + + return DaskExpr( + func, depth=0, function_name="lit", root_names=None, output_names=["literal"], - returns_scalar=False, + returns_scalar=True, backend_version=self._backend_version, version=self._version, kwargs={}, @@ -414,6 +418,9 @@ 
def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: # `self._otherwise_value` is a scalar and can't be converted to an expression return [value_series.where(condition, self._otherwise_value)] otherwise_series = otherwise_expr(df)[0] + + if otherwise_expr._returns_scalar: # type: ignore[attr-defined] + return [value_series.where(condition, otherwise_series[0])] validate_comparand(condition, otherwise_series) return [value_series.where(condition, otherwise_series)] diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index cb1232496..88d59b532 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -31,7 +31,8 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any: msg = "Multi-output expressions (e.g. `nw.all()` or `nw.col('a', 'b')`) not supported in this context" raise NotImplementedError(msg) result = results[0] - validate_comparand(df._native_frame, result) + if not obj._returns_scalar: + validate_comparand(df._native_frame, result) if obj._returns_scalar: # Return scalar, let Dask do its broadcasting return result[0] diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index df451b492..c681fc487 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -179,68 +179,31 @@ def __lt__(self, other: PandasLikeExpr | Any) -> Self: def __and__(self, other: PandasLikeExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__and__", other=other) - def __rand__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__and__(self) # type: ignore[no-any-return] - def __or__(self, other: PandasLikeExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__or__", other=other) - def __ror__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__or__(self) # type: ignore[no-any-return] - def __add__(self, other: PandasLikeExpr | Any) -> Self: return 
reuse_series_implementation(self, "__add__", other=other) - def __radd__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__add__(self) # type: ignore[no-any-return] - def __sub__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__sub__", other=other) - def __rsub__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__sub__(self) # type: ignore[no-any-return] - def __mul__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__mul__", other=other) - def __rmul__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mul__(self) # type: ignore[no-any-return] - def __truediv__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__truediv__", other=other) - def __rtruediv__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__truediv__(self) # type: ignore[no-any-return] - def __floordiv__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__floordiv__", other=other) - def __rfloordiv__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__floordiv__(self) # type: ignore[no-any-return] - def __pow__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__pow__", other=other) - def __rpow__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__pow__(self) # type: ignore[no-any-return] - def __mod__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__mod__", other=other) - def __rmod__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mod__(self) # type: ignore[no-any-return] - # Unary - def 
__invert__(self) -> Self: return reuse_series_implementation(self, "__invert__") diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 230ce37d5..0e4240010 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -186,27 +186,15 @@ def __or__(self: Self, other: PolarsExpr | bool | Any) -> Self: def __add__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__add__(extract_native(other))) - def __radd__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__radd__(extract_native(other))) - def __sub__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__sub__(extract_native(other))) - def __rsub__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__rsub__(extract_native(other))) - def __mul__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__mul__(extract_native(other))) - def __rmul__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__rmul__(extract_native(other))) - def __pow__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__pow__(extract_native(other))) - def __rpow__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__rpow__(extract_native(other))) - def __invert__(self: Self) -> Self: return self._from_native_expr(self._native_expr.__invert__()) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index f4a779c23..eb7118b23 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -48,7 +48,7 @@ def __native_namespace__(self) -> Any: # pragma: no cover def __narwhals_namespace__(self) -> SparkLikeNamespace: from narwhals._spark_like.namespace import SparkLikeNamespace - return SparkLikeNamespace( + return SparkLikeNamespace( # type: ignore[abstract] backend_version=self._backend_version, version=self._version ) diff 
--git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 4887e8001..1b98fcc46 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -58,7 +58,7 @@ def __narwhals_namespace__(self) -> SparkLikeNamespace: # pragma: no cover # Unused, just for compatibility with PandasLikeExpr from narwhals._spark_like.namespace import SparkLikeNamespace - return SparkLikeNamespace( + return SparkLikeNamespace( # type: ignore[abstract] backend_version=self._backend_version, version=self._version ) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index fcf8648dc..dd786ef3d 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -5015,7 +5015,7 @@ def join_asof( ... datetime(2016, 5, 25, 13, 30, 0, 23), ... datetime(2016, 5, 25, 13, 30, 0, 38), ... datetime(2016, 5, 25, 13, 30, 0, 48), - ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 49), ... datetime(2016, 5, 25, 13, 30, 0, 48), ... ], ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], @@ -5036,7 +5036,7 @@ def join_asof( ... df = nw.from_native(df_native) ... other = nw.from_native(other_native) ... return ( - ... df.sort("datetime") + ... df.sort("datetime", "ticker") ... .join_asof(other, on="datetime", by="ticker") ... .sort("datetime", "ticker") ... 
.collect() @@ -5056,15 +5056,15 @@ def join_asof( │ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │ │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │ - │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000049 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ └────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ >>> agnostic_join_asof_datetime_by_ticker(trades_dask, quotes_dask) datetime ticker price quantity bid ask 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 0 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 - 2 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN - 1 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 - 3 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 1 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.000049 GOOG 720.92 100 720.50 720.93 """ return super().join_asof( other, diff --git a/narwhals/expr.py b/narwhals/expr.py index 6e3cacb02..0ab7ba20e 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -19,14 +19,19 @@ from typing_extensions import Self from narwhals.dtypes import DType + from narwhals.typing import CompliantExpr + from narwhals.typing import CompliantNamespace + from narwhals.typing import CompliantSeriesT_co from narwhals.typing import IntoExpr -def extract_compliant(expr: Expr, other: Any) -> Any: +def extract_compliant( + plx: CompliantNamespace[CompliantSeriesT_co], other: Any +) -> CompliantExpr[CompliantSeriesT_co] | CompliantSeriesT_co | Any: from narwhals.series import Series if isinstance(other, Expr): - return other._to_compliant_expr(expr) + return other._to_compliant_expr(plx) if isinstance(other, Series): return other._compliant_series return other @@ -227,11 +232,12 @@ def __and__(self, other: Any) -> Self: ) def __rand__(self, other: Any) -> 
Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rand__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__and__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __or__(self, other: Any) -> Self: return self.__class__( @@ -239,11 +245,12 @@ def __or__(self, other: Any) -> Self: ) def __ror__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__ror__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__or__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __add__(self, other: Any) -> Self: return self.__class__( @@ -253,11 +260,12 @@ def __add__(self, other: Any) -> Self: ) def __radd__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__radd__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__add__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __sub__(self, other: Any) -> Self: return self.__class__( @@ -267,11 +275,12 @@ def __sub__(self, other: Any) -> Self: ) def __rsub__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rsub__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__sub__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __truediv__(self, other: Any) -> Self: return self.__class__( @@ -281,11 +290,12 @@ def __truediv__(self, other: Any) -> Self: ) def __rtruediv__(self, other: Any) -> Self: - return self.__class__( - lambda plx: 
self._to_compliant_expr(plx).__rtruediv__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__truediv__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __mul__(self, other: Any) -> Self: return self.__class__( @@ -295,11 +305,12 @@ def __mul__(self, other: Any) -> Self: ) def __rmul__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rmul__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__mul__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __le__(self, other: Any) -> Self: return self.__class__( @@ -329,11 +340,12 @@ def __pow__(self, other: Any) -> Self: ) def __rpow__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rpow__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__pow__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __floordiv__(self, other: Any) -> Self: return self.__class__( @@ -343,11 +355,12 @@ def __floordiv__(self, other: Any) -> Self: ) def __rfloordiv__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rfloordiv__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__floordiv__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __mod__(self, other: Any) -> Self: return self.__class__( @@ -357,11 +370,12 @@ def __mod__(self, other: Any) -> Self: ) def __rmod__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rmod__( - 
extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__mod__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) # --- unary --- def __invert__(self) -> Self: @@ -1788,8 +1802,11 @@ def is_in(self, other: Any) -> Self: b: [[true,true,false,false]] """ if isinstance(other, Iterable) and not isinstance(other, (str, bytes)): - other = extract_compliant(self, other) - return self.__class__(lambda plx: self._to_compliant_expr(plx).is_in(other)) + return self.__class__( + lambda plx: self._to_compliant_expr(plx).is_in( + extract_compliant(plx, other) + ) + ) else: msg = "Narwhals `is_in` doesn't accept expressions as an argument, as opposed to Polars. You should provide an iterable instead." raise NotImplementedError(msg) diff --git a/narwhals/typing.py b/narwhals/typing.py index ff29cb57e..859e98dff 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -81,10 +81,22 @@ def __narwhals_namespace__(self) -> CompliantNamespace[CompliantSeriesT_co]: ... def is_null(self) -> Self: ... def alias(self, name: str) -> Self: ... def cast(self, dtype: DType) -> Self: ... + def __and__(self, other: Any) -> Self: ... + def __or__(self, other: Any) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __sub__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __pow__(self, other: Any) -> Self: ... class CompliantNamespace(Protocol, Generic[CompliantSeriesT_co]): def col(self, *column_names: str) -> CompliantExpr[CompliantSeriesT_co]: ... + def lit( + self, value: Any, dtype: DType | None + ) -> CompliantExpr[CompliantSeriesT_co]: ... 
IntoExpr: TypeAlias = Union["Expr", str, "Series[Any]"] diff --git a/pyproject.toml b/pyproject.toml index 45aa54ad8..667a4980c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ pandas = ["pandas>=0.25.3"] polars = ["polars>=0.20.3"] ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] pyarrow = ["pyarrow>=11.0.0"] -dask = ["dask[dataframe]>=2024.7"] +dask = ["dask[dataframe]>=2024.10"] duckdb = ["duckdb>=1.0"] pyspark = ["pyspark>=3.3.0"] dev = [ diff --git a/tests/frame/lit_test.py b/tests/frame/lit_test.py index f51bd5c76..8b3bcd8e2 100644 --- a/tests/frame/lit_test.py +++ b/tests/frame/lit_test.py @@ -82,10 +82,7 @@ def test_lit_operation( col_name: str, expr: nw.Expr, expected_result: list[int], - request: pytest.FixtureRequest, ) -> None: - if "dask_lazy_p2" in str(constructor) and "lit_with_agg" in col_name: - request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 8a2194caf..86bdbac53 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -91,9 +91,9 @@ def test_set_ops( @pytest.mark.parametrize("invalid_constructor", [pd.DataFrame, pa.table]) def test_set_ops_invalid(invalid_constructor: Constructor) -> None: df = nw.from_native(invalid_constructor(data)) - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, ValueError)): df.select(1 - numeric()) - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, ValueError)): df.select(1 | numeric()) - with pytest.raises(NotImplementedError): + with pytest.raises((NotImplementedError, ValueError)): df.select(1 & numeric()) From 9f4b41902bc5f42ba6ff4f00c89d1685c258c267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dea=20Mar=C3=ADa=20L=C3=A9on?= Date: Sun, 5 Jan 2025 10:27:07 +0100 Subject: [PATCH 02/35] tests: remove unnecessary pytest filterwarnings (#1691) --- pyproject.toml 
| 13 ------------- tests/spark_like_test.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 667a4980c..a0a68cf3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,22 +150,9 @@ docstring-code-format = true testpaths = ["tests"] filterwarnings = [ "error", - 'ignore:distutils Version classes are deprecated:DeprecationWarning', - 'ignore:In the future `np.bool`', - 'ignore:make_block is deprecated and will be removed', - 'ignore:np.find_common_type is deprecated', - 'ignore:is_sparse is deprecated and will be removed', - 'ignore:Passing a BlockManager to DataFrame is deprecated', 'ignore:.*defaulting to pandas implementation', 'ignore:.*implementation has mismatches with pandas', - 'ignore:.*Do not use the `random` module inside strategies', 'ignore:.*You are using pyarrow version', - 'ignore:.*but when imported by', - 'ignore:Distributing .*This may take some time', - 'ignore:.*The default coalesce behavior', - 'ignore:is_datetime64tz_dtype is deprecated', - 'ignore: unclosed IntoFrame: # NaN and NULL are not the same in PySpark pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index() From 31158b2b32d17e3b75d9553cc90d620eaea627c5 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 5 Jan 2025 15:53:53 +0000 Subject: [PATCH 03/35] feat: validate library minimum version in compliant objects (#1727) --- narwhals/_arrow/dataframe.py | 2 + narwhals/_arrow/series.py | 2 + narwhals/_dask/dataframe.py | 4 +- narwhals/_duckdb/dataframe.py | 18 +++++++-- narwhals/_ibis/dataframe.py | 18 +++++++-- narwhals/_pandas_like/dataframe.py | 2 + narwhals/_pandas_like/series.py | 2 + narwhals/_polars/dataframe.py | 3 ++ narwhals/_polars/series.py | 2 + narwhals/_spark_like/dataframe.py | 2 + narwhals/dependencies.py | 10 ++--- narwhals/translate.py | 14 ++++++- narwhals/utils.py | 61 ++++++++++++++++++++++++++++++ pyproject.toml | 16 +++++--- tests/expr_and_series/clip_test.py | 3 ++ 
15 files changed, 140 insertions(+), 19 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index c0efa50fe..9e5ce0621 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -22,6 +22,7 @@ from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop from narwhals.utils import scale_bytes +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -56,6 +57,7 @@ def __init__( self._implementation = Implementation.PYARROW self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_namespace__(self: Self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index cf7760d49..046e26e05 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -18,6 +18,7 @@ from narwhals.utils import Implementation from narwhals.utils import generate_temporary_column_name from narwhals.utils import import_dtypes_module +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -54,6 +55,7 @@ def __init__( self._implementation = Implementation.PYARROW self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def _change_version(self: Self, version: Version) -> Self: return self.__class__( diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 6542253a0..5e652a937 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -11,11 +11,13 @@ from narwhals._dask.utils import parse_exprs_and_named_exprs from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals._pandas_like.utils import select_columns_by_name +from narwhals.typing import CompliantLazyFrame from narwhals.utils import 
Implementation from narwhals.utils import flatten from narwhals.utils import generate_temporary_column_name from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -29,7 +31,6 @@ from narwhals._dask.typing import IntoDaskExpr from narwhals.dtypes import DType from narwhals.utils import Version -from narwhals.typing import CompliantLazyFrame class DaskLazyFrame(CompliantLazyFrame): @@ -44,6 +45,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.DASK self._version = version + validate_backend_version(self._implementation, self._backend_version) def __native_namespace__(self: Self) -> ModuleType: if self._implementation is Implementation.DASK: diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 339fca137..73dd055ca 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -6,8 +6,10 @@ from typing import Any from narwhals.dependencies import get_duckdb +from narwhals.utils import Implementation from narwhals.utils import import_dtypes_module from narwhals.utils import parse_version +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -82,9 +84,15 @@ def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType: class DuckDBInterchangeFrame: - def __init__(self, df: Any, version: Version) -> None: + _implementation = Implementation.DUCKDB + + def __init__( + self, df: Any, *, backend_version: tuple[int, ...], version: Version + ) -> None: self._native_frame = df self._version = version + self._backend_version = backend_version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_dataframe__(self) -> Any: return self @@ -147,10 +155,14 @@ def to_arrow(self: Self) -> pa.Table: return self._native_frame.arrow() def _change_version(self: Self, 
version: Version) -> Self: - return self.__class__(self._native_frame, version=version) + return self.__class__( + self._native_frame, version=version, backend_version=self._backend_version + ) def _from_native_frame(self: Self, df: Any) -> Self: - return self.__class__(df, version=self._version) + return self.__class__( + df, version=self._version, backend_version=self._backend_version + ) def collect_schema(self) -> dict[str, DType]: return { diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index f62a31e8b..6fe8997a9 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -5,7 +5,9 @@ from typing import Any from narwhals.dependencies import get_ibis +from narwhals.utils import Implementation from narwhals.utils import import_dtypes_module +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -69,9 +71,15 @@ def native_to_narwhals_dtype(ibis_dtype: Any, version: Version) -> DType: class IbisInterchangeFrame: - def __init__(self, df: Any, version: Version) -> None: + _implementation = Implementation.IBIS + + def __init__( + self, df: Any, *, backend_version: tuple[int, ...], version: Version + ) -> None: self._native_frame = df self._version = version + self._backend_version = backend_version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_dataframe__(self) -> Any: return self @@ -125,10 +133,14 @@ def __getattr__(self, attr: str) -> Any: raise NotImplementedError(msg) def _change_version(self: Self, version: Version) -> Self: - return self.__class__(self._native_frame, version=version) + return self.__class__( + self._native_frame, version=version, backend_version=self._backend_version + ) def _from_native_frame(self: Self, df: Any) -> Self: - return self.__class__(df, version=self._version) + return self.__class__( + df, version=self._version, backend_version=self._backend_version + ) def collect_schema(self) -> dict[str, DType]: 
return { diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index c10aacec5..293f5cefe 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -27,6 +27,7 @@ from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop from narwhals.utils import scale_bytes +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -59,6 +60,7 @@ def __init__( self._implementation = implementation self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_dataframe__(self) -> Self: return self diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index cf8972deb..8a6779828 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -24,6 +24,7 @@ from narwhals.typing import CompliantSeries from narwhals.utils import Implementation from narwhals.utils import import_dtypes_module +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -94,6 +95,7 @@ def __init__( self._implementation = implementation self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def __native_namespace__(self: Self) -> ModuleType: if self._implementation in { diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 760b5f4b6..d5e115284 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -15,6 +15,7 @@ from narwhals.utils import Implementation from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -45,6 +46,7 @@ def __init__( self._backend_version = backend_version self._implementation = 
Implementation.POLARS self._version = version + validate_backend_version(self._implementation, self._backend_version) def __repr__(self: Self) -> str: # pragma: no cover return "PolarsDataFrame" @@ -343,6 +345,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.POLARS self._version = version + validate_backend_version(self._implementation, self._backend_version) def __repr__(self: Self) -> str: # pragma: no cover return "PolarsLazyFrame" diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 30cd90fd5..33572db7c 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -10,6 +10,7 @@ from narwhals._polars.utils import narwhals_to_native_dtype from narwhals._polars.utils import native_to_narwhals_dtype from narwhals.utils import Implementation +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -38,6 +39,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.POLARS self._version = version + validate_backend_version(self._implementation, self._backend_version) def __repr__(self: Self) -> str: # pragma: no cover return "PolarsSeries" diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index eb7118b23..e04da7f57 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -12,6 +12,7 @@ from narwhals.utils import flatten from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from pyspark.sql import DataFrame @@ -37,6 +38,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.PYSPARK self._version = version + validate_backend_version(self._implementation, self._backend_version) def __native_namespace__(self) -> Any: # pragma: no cover if self._implementation is Implementation.PYSPARK: diff 
--git a/narwhals/dependencies.py b/narwhals/dependencies.py index 0c5d11720..43904a0ba 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -87,16 +87,16 @@ def get_duckdb() -> Any: return sys.modules.get("duckdb", None) -def get_dask_expr() -> Any: - """Get dask_expr module (if already imported - else return None).""" - return sys.modules.get("dask_expr", None) - - def get_ibis() -> Any: """Get ibis module (if already imported - else return None).""" return sys.modules.get("ibis", None) +def get_dask_expr() -> Any: + """Get dask_expr module (if already imported - else return None).""" + return sys.modules.get("dask_expr", None) + + def get_pyspark() -> Any: # pragma: no cover """Get pyspark module (if already imported - else return None).""" return sys.modules.get("pyspark", None) diff --git a/narwhals/translate.py b/narwhals/translate.py index 8542a62f0..77c83b548 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -709,8 +709,13 @@ def _from_native_impl( # noqa: PLR0915 else: return native_object raise TypeError(msg) + import duckdb # ignore-banned-import + + backend_version = parse_version(duckdb.__version__) return DataFrame( - DuckDBInterchangeFrame(native_object, version=version), + DuckDBInterchangeFrame( + native_object, version=version, backend_version=backend_version + ), level="interchange", ) @@ -726,8 +731,13 @@ def _from_native_impl( # noqa: PLR0915 ) raise TypeError(msg) return native_object + import ibis # ignore-banned-import + + backend_version = parse_version(ibis.__version__) return DataFrame( - IbisInterchangeFrame(native_object, version=version), + IbisInterchangeFrame( + native_object, version=version, backend_version=backend_version + ), level="interchange", ) diff --git a/narwhals/utils.py b/narwhals/utils.py index 2125d46c4..658c0e7bf 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -16,6 +16,8 @@ from narwhals.dependencies import get_cudf from narwhals.dependencies import get_dask_dataframe +from 
narwhals.dependencies import get_duckdb +from narwhals.dependencies import get_ibis from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars @@ -73,6 +75,10 @@ class Implementation(Enum): """Polars implementation.""" DASK = auto() """Dask implementation.""" + DUCKDB = auto() + """DuckDB implementation.""" + IBIS = auto() + """Ibis implementation.""" UNKNOWN = auto() """Unknown implementation.""" @@ -97,6 +103,8 @@ def from_native_namespace( get_pyspark_sql(): Implementation.PYSPARK, get_polars(): Implementation.POLARS, get_dask_dataframe(): Implementation.DASK, + get_duckdb(): Implementation.DUCKDB, + get_ibis(): Implementation.IBIS, } return mapping.get(native_namespace, Implementation.UNKNOWN) @@ -245,6 +253,59 @@ def is_dask(self) -> bool: """ return self is Implementation.DASK # pragma: no cover + def is_duckdb(self) -> bool: + """Return whether implementation is DuckDB. + + Returns: + Boolean. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"a": [1, 2, 3]}) + >>> df = nw.from_native(df_native) + >>> df.implementation.is_duckdb() + False + """ + return self is Implementation.DUCKDB # pragma: no cover + + def is_ibis(self) -> bool: + """Return whether implementation is Ibis. + + Returns: + Boolean. 
+ + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"a": [1, 2, 3]}) + >>> df = nw.from_native(df_native) + >>> df.implementation.is_ibis() + False + """ + return self is Implementation.IBIS # pragma: no cover + + +MIN_VERSIONS: dict[Implementation, tuple[int, ...]] = { + Implementation.PANDAS: (0, 25, 3), + Implementation.MODIN: (0, 25, 3), + Implementation.CUDF: (24, 10), + Implementation.PYARROW: (11,), + Implementation.PYSPARK: (3, 3), + Implementation.POLARS: (0, 20, 3), + Implementation.DASK: (2024, 10), + Implementation.DUCKDB: (1,), + Implementation.IBIS: (6,), +} + + +def validate_backend_version( + implementation: Implementation, backend_version: tuple[int, ...] +) -> None: + if backend_version < (min_version := MIN_VERSIONS[implementation]): + msg = f"Minimum version of {implementation} supported by Narwhals is {min_version}, found: {backend_version}" + raise ValueError(msg) + def import_dtypes_module(version: Version) -> DTypes: if version is Version.V1: diff --git a/pyproject.toml b/pyproject.toml index a0a68cf3a..c01ebbafa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,15 +21,21 @@ classifiers = [ ] [project.optional-dependencies] -cudf = ["cudf>=24.10.0"] -modin = ["modin"] +# These should be aligned with MIN_VERSIONS in narwhals/utils.py +# Exception: modin, because `modin.__version__` isn't aligned with +# `modin.pandas.__version__`. The latter is the one that we make +# API decisions based on, so that's the one we track internally. 
+# We have yet to determine the minimum Modin version we support +# https://github.com/narwhals-dev/narwhals/issues/817 pandas = ["pandas>=0.25.3"] -polars = ["polars>=0.20.3"] -ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] +modin = ["modin"] +cudf = ["cudf>=24.10.0"] pyarrow = ["pyarrow>=11.0.0"] +pyspark = ["pyspark>=3.3.0"] +polars = ["polars>=0.20.3"] dask = ["dask[dataframe]>=2024.10"] duckdb = ["duckdb>=1.0"] -pyspark = ["pyspark>=3.3.0"] +ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] dev = [ "covdefaults", "pre-commit", diff --git a/tests/expr_and_series/clip_test.py b/tests/expr_and_series/clip_test.py index 838ca6b08..29ed6379b 100644 --- a/tests/expr_and_series/clip_test.py +++ b/tests/expr_and_series/clip_test.py @@ -57,6 +57,9 @@ def test_clip_series_expressified( ) -> None: if "modin_pyarrow" in str(constructor_eager): request.applymarker(pytest.mark.xfail) + if "cudf" in str(constructor_eager): + # https://github.com/rapidsai/cudf/issues/17682 + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3, -4, 5], "lb": [3, 2, 1, 1, 1], "ub": [4, 4, 2, 2, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) From 19418cff123d3137afafa43a6c8d7c933a35dc98 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 5 Jan 2025 19:17:26 +0000 Subject: [PATCH 04/35] fix: nw.lit(date, dtype=nw.Date), loosen Dask minimum back to 2024.8 (#1730) --- narwhals/_dask/utils.py | 2 ++ narwhals/_pandas_like/utils.py | 9 +++++---- narwhals/expr.py | 2 +- narwhals/stable/v1/__init__.py | 2 +- narwhals/utils.py | 2 +- pyproject.toml | 2 +- tests/expr_and_series/arithmetic_test.py | 5 +++++ tests/expr_and_series/binary_test.py | 7 ++++++- tests/{frame => expr_and_series}/lit_test.py | 20 ++++++++++++++++++++ tests/expr_and_series/operators_test.py | 12 +++++++++++- tests/frame/select_test.py | 2 +- tests/series_only/cast_test.py | 12 ------------ tests/tpch_q1_test.py | 3 +++ 13 files changed, 57 
insertions(+), 23 deletions(-) rename tests/{frame => expr_and_series}/lit_test.py (77%) diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 88d59b532..4f2952d0b 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -136,6 +136,8 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> An return "category" if isinstance_or_issubclass(dtype, dtypes.Datetime): return "datetime64[us]" + if isinstance_or_issubclass(dtype, dtypes.Date): + return "date32[day][pyarrow]" if isinstance_or_issubclass(dtype, dtypes.Duration): return "timedelta64[ns]" if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 655e60773..03f025f78 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -637,10 +637,11 @@ def narwhals_to_native_dtype( # noqa: PLR0915 else f"timedelta64[{du_time_unit}]" ) if isinstance_or_issubclass(dtype, dtypes.Date): - if dtype_backend == "pyarrow-nullable": - return "date32[pyarrow]" - msg = "Date dtype only supported for pyarrow-backed data types in pandas" - raise NotImplementedError(msg) + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError: # pragma: no cover + msg = "PyArrow>=11.0.0 is required for `Date` dtype." + return "date32[pyarrow]" if isinstance_or_issubclass(dtype, dtypes.Enum): msg = "Converting to Enum is not (yet) supported" raise NotImplementedError(msg) diff --git a/narwhals/expr.py b/narwhals/expr.py index 0ab7ba20e..aa934a01f 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -7023,7 +7023,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ) -def lit(value: Any, dtype: DType | None = None) -> Expr: +def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: """Return an expression representing a literal value. 
Arguments: diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index ba5117425..5ffc475e5 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -2542,7 +2542,7 @@ def len() -> Expr: return _stableify(nw.len()) -def lit(value: Any, dtype: DType | None = None) -> Expr: +def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: """Return an expression representing a literal value. Arguments: diff --git a/narwhals/utils.py b/narwhals/utils.py index 658c0e7bf..b8e9830e1 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -293,7 +293,7 @@ def is_ibis(self) -> bool: Implementation.PYARROW: (11,), Implementation.PYSPARK: (3, 3), Implementation.POLARS: (0, 20, 3), - Implementation.DASK: (2024, 10), + Implementation.DASK: (2024, 8), Implementation.DUCKDB: (1,), Implementation.IBIS: (6,), } diff --git a/pyproject.toml b/pyproject.toml index c01ebbafa..0c2b4a9be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ cudf = ["cudf>=24.10.0"] pyarrow = ["pyarrow>=11.0.0"] pyspark = ["pyspark>=3.3.0"] polars = ["polars>=0.20.3"] -dask = ["dask[dataframe]>=2024.10"] +dask = ["dask[dataframe]>=2024.8"] duckdb = ["duckdb>=1.0"] ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] dev = [ diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index eb38c6a14..cd82a945e 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -11,6 +11,7 @@ from hypothesis import given import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager @@ -67,6 +68,8 @@ def test_right_arithmetic_expr( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "dask" in str(constructor) and DASK_VERSION < (2024, 10): + request.applymarker(pytest.mark.xfail) if attr == "__rmod__" 
and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): @@ -241,6 +244,8 @@ def test_arithmetic_expr_left_literal( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "dask" in str(constructor) and DASK_VERSION < (2024, 10): + request.applymarker(pytest.mark.xfail) if attr == "__mod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): diff --git a/tests/expr_and_series/binary_test.py b/tests/expr_and_series/binary_test.py index 3693ccebd..0808810bc 100644 --- a/tests/expr_and_series/binary_test.py +++ b/tests/expr_and_series/binary_test.py @@ -1,11 +1,16 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data -def test_expr_binary(constructor: Constructor) -> None: +def test_expr_binary(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "dask" in str(constructor) and DASK_VERSION < (2024, 10): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) result = nw.from_native(df_raw).with_columns( diff --git a/tests/frame/lit_test.py b/tests/expr_and_series/lit_test.py similarity index 77% rename from tests/frame/lit_test.py rename to tests/expr_and_series/lit_test.py index 8b3bcd8e2..f5be7dfbe 100644 --- a/tests/frame/lit_test.py +++ b/tests/expr_and_series/lit_test.py @@ -1,5 +1,6 @@ from __future__ import annotations +from datetime import date from typing import TYPE_CHECKING from typing import Any @@ -7,6 +8,8 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION +from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data @@ -82,10 +85,27 @@ def test_lit_operation( col_name: str, expr: nw.Expr, expected_result: list[int], + request: pytest.FixtureRequest, ) -> None: + 
if ( + "dask" in str(constructor) + and col_name in ("left_lit", "left_scalar") + and DASK_VERSION < (2024, 10) + ): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() result = df.select(expr.alias(col_name)) expected = {col_name: expected_result} assert_equal_data(result, expected) + + +@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow") +def test_date_lit(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "dask" in str(constructor): + # https://github.com/dask/dask/issues/11637 + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1]})) + result = df.with_columns(nw.lit(date(2020, 1, 1), dtype=nw.Date)).collect_schema() + assert result == {"a": nw.Int64, "literal": nw.Date} diff --git a/tests/expr_and_series/operators_test.py b/tests/expr_and_series/operators_test.py index ff01747a6..356d81d5b 100644 --- a/tests/expr_and_series/operators_test.py +++ b/tests/expr_and_series/operators_test.py @@ -3,6 +3,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -75,8 +76,17 @@ def test_logic_operators_expr( ], ) def test_logic_operators_expr_scalar( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], + request: pytest.FixtureRequest, ) -> None: + if ( + "dask" in str(constructor) + and DASK_VERSION < (2024, 10) + and operator in ("__rand__", "__ror__") + ): + request.applymarker(pytest.mark.xfail) data = {"a": [True, True, False, False]} df = nw.from_native(constructor(data)) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 2cb3df91d..d85697249 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -118,7 +118,7 @@ def 
test_missing_columns(constructor: Constructor) -> None: def test_left_to_right_broadcasting( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "dask" in str(constructor) and DASK_VERSION < (2024, 9): + if "dask" in str(constructor) and DASK_VERSION < (2024, 10): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 1, 2], "b": [4, 5, 6]})) result = df.select(nw.col("a") + nw.col("b").sum()) diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py index 10587a084..b4051e503 100644 --- a/tests/series_only/cast_test.py +++ b/tests/series_only/cast_test.py @@ -98,18 +98,6 @@ def test_cast_date_datetime_pandas() -> None: assert df.schema == {"a": nw.Date} -@pytest.mark.skipif( - PANDAS_VERSION < (2, 0, 0), - reason="pyarrow dtype not available", -) -def test_cast_date_datetime_invalid() -> None: - # pandas: pyarrow datetime to date - dfpd = pd.DataFrame({"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)]}) - df = nw.from_native(dfpd) - with pytest.raises(NotImplementedError, match="pyarrow"): - df.select(nw.col("a").cast(nw.Date)) - - @pytest.mark.filterwarnings("ignore: casting period") def test_unknown_to_int() -> None: df = pd.DataFrame({"a": pd.period_range("2000", periods=3, freq="min")}) diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py index fd2a7d24c..cb6d48548 100644 --- a/tests/tpch_q1_test.py +++ b/tests/tpch_q1_test.py @@ -10,6 +10,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import PANDAS_VERSION from tests.utils import assert_equal_data @@ -20,6 +21,8 @@ ) @pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning") def test_q1(library: str, request: pytest.FixtureRequest) -> None: + if library == "dask" and DASK_VERSION < (2024, 10): + request.applymarker(pytest.mark.xfail) if library == "pandas" and PANDAS_VERSION < (1, 5): request.applymarker(pytest.mark.xfail) elif library == "pandas": From 
03f67546064d46d8329ddda32407bdd05922eac1 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Sun, 5 Jan 2025 22:07:59 +0100 Subject: [PATCH 05/35] chore: refactor root_names and output_names tracking (#1731) --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- narwhals/_dask/expr.py | 27 ++-------------- narwhals/_expression_parsing.py | 56 +++++++++++++++++++-------------- narwhals/_spark_like/expr.py | 27 ++-------------- pyproject.toml | 2 ++ 4 files changed, 39 insertions(+), 73 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index b3163fef6..c76593404 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -1,6 +1,5 @@ from __future__ import annotations -from copy import copy from typing import TYPE_CHECKING from typing import Any from typing import Callable @@ -12,6 +11,7 @@ from narwhals._dask.utils import binary_operation_returns_scalar from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import narwhals_to_native_dtype +from narwhals._expression_parsing import infer_new_root_output_names from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import native_to_narwhals_dtype @@ -148,30 +148,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: results.append(result) return results - # Try tracking root and output names by combining them from all - # expressions appearing in args and kwargs. If any anonymous - # expression appears (e.g. nw.all()), then give up on tracking root names - # and just set it to None. 
- root_names = copy(self._root_names) - output_names = self._output_names - for arg in list(kwargs.values()): - if root_names is not None and isinstance(arg, self.__class__): - if arg._root_names is not None: - root_names.extend(arg._root_names) - else: - root_names = None - output_names = None - break - elif root_names is None: - output_names = None - break - - if not ( - (output_names is None and root_names is None) - or (output_names is not None and root_names is not None) - ): # pragma: no cover - msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" - raise AssertionError(msg) + root_names, output_names = infer_new_root_output_names(self, **kwargs) return self.__class__( func, diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 4d51eb719..99bb3bb24 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -125,6 +125,38 @@ def parse_into_expr( raise InvalidIntoExprError.from_invalid_type(type(into_expr)) +def infer_new_root_output_names( + expr: CompliantExpr[Any], **kwargs: Any +) -> tuple[list[str] | None, list[str] | None]: + """Return new root and output names after chaining expressions. + + Try tracking root and output names by combining them from all expressions appearing in kwargs. + If any anonymous expression appears (e.g. nw.all()), then give up on tracking root names + and just set it to None. 
+ """ + root_names = copy(expr._root_names) + output_names = expr._output_names + for arg in list(kwargs.values()): + if root_names is not None and isinstance(arg, expr.__class__): + if arg._root_names is not None: + root_names.extend(arg._root_names) + else: + root_names = None + output_names = None + break + elif root_names is None: + output_names = None + break + + if not ( + (output_names is None and root_names is None) + or (output_names is not None and root_names is not None) + ): # pragma: no cover + msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" + raise AssertionError(msg) + return root_names, output_names + + @overload def reuse_series_implementation( expr: PandasLikeExprT, @@ -201,30 +233,8 @@ def func(df: CompliantDataFrame) -> Sequence[CompliantSeries]: raise AssertionError(msg) return out - # Try tracking root and output names by combining them from all - # expressions appearing in args and kwargs. If any anonymous - # expression appears (e.g. nw.all()), then give up on tracking root names - # and just set it to None. 
- root_names = copy(expr._root_names) - output_names = expr._output_names - for arg in list(kwargs.values()): - if root_names is not None and isinstance(arg, expr.__class__): - if arg._root_names is not None: - root_names.extend(arg._root_names) - else: - root_names = None - output_names = None - break - elif root_names is None: - output_names = None - break + root_names, output_names = infer_new_root_output_names(expr, **kwargs) - if not ( - (output_names is None and root_names is None) - or (output_names is not None and root_names is not None) - ): # pragma: no cover - msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" - raise AssertionError(msg) return plx._create_expr_from_callable( # type: ignore[return-value] func, # type: ignore[arg-type] depth=expr._depth + 1, diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 1b98fcc46..b74aea678 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -1,11 +1,11 @@ from __future__ import annotations -from copy import copy from typing import TYPE_CHECKING from typing import Any from typing import Callable from typing import Sequence +from narwhals._expression_parsing import infer_new_root_output_names from narwhals._spark_like.utils import get_column_name from narwhals._spark_like.utils import maybe_evaluate from narwhals.typing import CompliantExpr @@ -106,30 +106,7 @@ def func(df: SparkLikeLazyFrame) -> list[Column]: results.append(column_result) return results - # Try tracking root and output names by combining them from all - # expressions appearing in args and kwargs. If any anonymous - # expression appears (e.g. nw.all()), then give up on tracking root names - # and just set it to None. 
- root_names = copy(self._root_names) - output_names = self._output_names - for arg in list(kwargs.values()): - if root_names is not None and isinstance(arg, self.__class__): - if arg._root_names is not None: - root_names.extend(arg._root_names) - else: # pragma: no cover - root_names = None - output_names = None - break - elif root_names is None: - output_names = None - break - - if not ( - (output_names is None and root_names is None) - or (output_names is not None and root_names is not None) - ): # pragma: no cover - msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" - raise AssertionError(msg) + root_names, output_names = infer_new_root_output_names(self, **kwargs) return self.__class__( func, diff --git a/pyproject.toml b/pyproject.toml index 0c2b4a9be..43a1dbc12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -159,6 +159,8 @@ filterwarnings = [ 'ignore:.*defaulting to pandas implementation', 'ignore:.*implementation has mismatches with pandas', 'ignore:.*You are using pyarrow version', + # This warning was temporarily raised by pandas but then reverted. + 'ignore:.*Passing a BlockManager to DataFrame:DeprecationWarning', ] xfail_strict = true markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"] From 0800bfa7104c7f7447188e5d95bee75086e93d04 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 5 Jan 2025 22:25:52 +0000 Subject: [PATCH 06/35] fix: fix broken link from warning (#1732) --- narwhals/_pandas_like/group_by.py | 4 ++-- narwhals/_spark_like/group_by.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 3741c7130..0f1000606 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -337,7 +337,7 @@ def agg_pandas( # noqa: PLR0915 "pandas API. If you can, please rewrite your query such that group-by aggregations " "are simple (e.g. 
mean, std, min, max, ...). \n\n" "Please see: " - "https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation.md/", + "https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/", UserWarning, stacklevel=find_stacklevel(), ) @@ -349,7 +349,7 @@ def func(df: Any) -> Any: results_keys = expr(from_dataframe(df)) if not all(len(x) == 1 for x in results_keys): msg = f"Aggregation '{expr._function_name}' failed to aggregate - does your aggregation function return a scalar? \ - \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation.md/" + \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/" raise ValueError(msg) for result_keys in results_keys: diff --git a/narwhals/_spark_like/group_by.py b/narwhals/_spark_like/group_by.py index d53237b59..c7cc52bf1 100644 --- a/narwhals/_spark_like/group_by.py +++ b/narwhals/_spark_like/group_by.py @@ -162,6 +162,6 @@ def agg_pyspark( result_simple = grouped.agg(*agg_columns) except ValueError as exc: # pragma: no cover msg = "Failed to aggregated - does your aggregation function return a scalar? 
\ - \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation.md/" + \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/" raise RuntimeError(msg) from exc return from_dataframe(result_simple) From 8c9525a3c62de8c83f6a8382c94592891e5d2f5c Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 6 Jan 2025 09:17:06 +0100 Subject: [PATCH 07/35] docs: `Expr` method' docstrings (#1733) --- narwhals/expr.py | 2556 ++++++++++++++++++++------------ narwhals/stable/v1/__init__.py | 340 +++-- 2 files changed, 1840 insertions(+), 1056 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index aa934a01f..3e457989a 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -63,23 +63,27 @@ def alias(self, name: str) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [4, 5]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [4, 5]}) + >>> + >>> data = {"a": [1, 2], "b": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_alias(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select((nw.col("b") + 10).alias("c")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_alias`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_alias(df_pd) c 0 14 1 15 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_alias(df_pl) shape: (2, 1) ┌─────┐ │ c │ @@ -89,7 +93,8 @@ def alias(self, name: str) -> Self: │ 14 │ │ 15 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_alias(df_pa) pyarrow.Table c: int64 ---- @@ -110,11 +115,12 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se A new expression. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -122,19 +128,21 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se Lets define a library-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_pipe(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a").pipe(lambda x: x + 1)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_pipe`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_pipe(df_pd) a 0 2 1 3 2 4 3 5 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_pipe(df_pl) shape: (4, 1) ┌─────┐ │ a │ @@ -146,7 +154,8 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se │ 4 │ │ 5 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_pipe(df_pa) pyarrow.Table a: int64 ---- @@ -169,27 +178,29 @@ def cast(self: Self, dtype: DType | type[DType]) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> from datetime import date - >>> df_pd = pd.DataFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}) - >>> df_pl = pl.DataFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}) - >>> df_pa = pa.table({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}) + >>> + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cast(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("foo").cast(nw.Float32), nw.col("bar").cast(nw.UInt8) ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cast`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_cast(df_pd) foo bar 0 1.0 6 1 2.0 7 2 3.0 8 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_cast(df_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -200,7 +211,7 @@ def cast(self: Self, dtype: DType | type[DType]) -> Self: │ 2.0 ┆ 7 │ │ 3.0 ┆ 8 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_cast(df_pa) pyarrow.Table foo: float bar: uint8 @@ -393,22 +404,26 @@ def any(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pl = pl.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pa = pa.table({"a": [True, False], "b": [True, True]}) + >>> + >>> data = {"a": [True, False], "b": [True, True]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_any(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").any()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_any`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_any(df_pd) a b 0 True True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_any(df_pl) shape: (1, 2) ┌──────┬──────┐ │ a ┆ b │ @@ -417,7 +432,8 @@ def any(self) -> Self: ╞══════╪══════╡ │ true ┆ true │ └──────┴──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_any(df_pa) pyarrow.Table a: bool b: bool @@ -439,22 +455,26 @@ def all(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pl = pl.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pa = pa.table({"a": [True, False], "b": [True, True]}) + >>> + >>> data = {"a": [True, False], "b": [True, True]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").all()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all(df_pd) a b 0 False True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_all(df_pl) shape: (1, 2) ┌───────┬──────┐ │ a ┆ b │ @@ -463,7 +483,8 @@ def all(self) -> Self: ╞═══════╪══════╡ │ false ┆ true │ └───────┴──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_all(df_pa) pyarrow.Table a: bool b: bool @@ -530,27 +551,28 @@ def ewm_mean( >>> import polars as pl >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_ewm_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").ewm_mean(com=1, ignore_nulls=False) ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass either pandas or Polars to `agnostic_ewm_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_ewm_mean(df_pd) a 0 1.000000 1 1.666667 2 2.428571 - >>> my_library_agnostic_function(df_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_ewm_mean(df_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3, 1) ┌──────────┐ │ a │ @@ -586,22 +608,26 @@ def mean(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [-1, 0, 1], "b": [2, 4, 6]}) - >>> df_pl = pl.DataFrame({"a": [-1, 0, 1], "b": [2, 4, 6]}) - >>> df_pa = pa.table({"a": [-1, 0, 1], "b": [2, 4, 6]}) + >>> + >>> data = {"a": [-1, 0, 1], "b": [2, 4, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").mean()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean(df_pd) a b 0 0.0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_mean(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -610,7 +636,8 @@ def mean(self) -> Self: ╞═════╪═════╡ │ 0.0 ┆ 4.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_mean(df_pa) pyarrow.Table a: double b: double @@ -635,22 +662,26 @@ def median(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]}) - >>> df_pl = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]}) - >>> df_pa = pa.table({"a": [1, 8, 3], "b": [4, 5, 2]}) + >>> + >>> data = {"a": [1, 8, 3], "b": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").median()).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_median(df_pd) a b 0 3.0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_median(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -659,7 +690,8 @@ def median(self) -> Self: ╞═════╪═════╡ │ 3.0 ┆ 4.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_median(df_pa) pyarrow.Table a: double b: double @@ -674,7 +706,7 @@ def std(self, *, ddof: int = 1) -> Self: Arguments: ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof, - where N represents the number of elements. By default ddof is 1. + where N represents the number of elements. By default ddof is 1. Returns: A new expression. @@ -685,22 +717,25 @@ def std(self, *, ddof: int = 1) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pl = pl.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pa = pa.table({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) + >>> + >>> data = {"a": [20, 25, 60], "b": [1.5, 1, -1.4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_std(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").std(ddof=0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_std`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_std(df_pd) a b 0 17.79513 1.265789 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_std(df_pl) shape: (1, 2) ┌──────────┬──────────┐ │ a ┆ b │ @@ -709,7 +744,7 @@ def std(self, *, ddof: int = 1) -> Self: ╞══════════╪══════════╡ │ 17.79513 ┆ 1.265789 │ └──────────┴──────────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_std(df_pa) pyarrow.Table a: double b: double @@ -736,9 +771,11 @@ def var(self, *, ddof: int = 1) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pl = pl.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pa = pa.table({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) + >>> + >>> data = {"a": [20, 25, 60], "b": [1.5, 1, -1.4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -746,11 +783,13 @@ def var(self, *, ddof: int = 1) -> Self: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").var(ddof=0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_var`: >>> agnostic_var(df_pd) a b 0 316.666667 1.602222 + >>> agnostic_var(df_pl) shape: (1, 2) ┌────────────┬──────────┐ @@ -760,6 +799,7 @@ def var(self, *, ddof: int = 1) -> Self: ╞════════════╪══════════╡ │ 316.666667 ┆ 1.602222 │ └────────────┴──────────┘ + >>> agnostic_var(df_pa) pyarrow.Table a: double @@ -796,6 +836,7 @@ def map_batches( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -803,7 +844,7 @@ def map_batches( Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_map_batches(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a", "b").map_batches( @@ -811,14 +852,15 @@ def map_batches( ... ) ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_map_batches`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_map_batches(df_pd) a b 0 2.0 5.0 1 3.0 6.0 2 4.0 7.0 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_map_batches(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -829,7 +871,7 @@ def map_batches( │ 3.0 ┆ 6.0 │ │ 4.0 ┆ 7.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_map_batches(df_pa) pyarrow.Table a: double b: double @@ -854,22 +896,27 @@ def skew(self: Self) -> Self: >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw - >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) - >>> df_pa = pa.Table.from_pandas(df_pd) + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(nw.col("a", "b").skew()) + >>> def agnostic_skew(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... 
return df.select(nw.col("a", "b").skew()).to_native() - We can then pass pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_skew`: - >>> func(df_pd) + >>> agnostic_skew(df_pd) a b 0 0.0 1.472427 - >>> func(df_pl) + + >>> agnostic_skew(df_pl) shape: (1, 2) ┌─────┬──────────┐ │ a ┆ b │ @@ -878,7 +925,8 @@ def skew(self: Self) -> Self: ╞═════╪══════════╡ │ 0.0 ┆ 1.472427 │ └─────┴──────────┘ - >>> func(df_pa) + + >>> agnostic_skew(df_pa) pyarrow.Table a: double b: double @@ -900,22 +948,25 @@ def sum(self) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [5, 10], "b": [50, 100]}) - >>> df_pl = pl.DataFrame({"a": [5, 10], "b": [50, 100]}) - >>> df_pa = pa.table({"a": [5, 10], "b": [50, 100]}) + >>> + >>> data = {"a": [5, 10], "b": [50, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").sum()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum(df_pd) a b 0 15 150 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_sum(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -924,7 +975,7 @@ def sum(self) -> Expr: ╞═════╪═════╡ │ 15 ┆ 150 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_sum(df_pa) pyarrow.Table a: int64 b: int64 @@ -946,22 +997,26 @@ def min(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 3]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [4, 3]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [4, 3]}) + >>> + >>> data = {"a": [1, 2], "b": [4, 3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.min("a", "b")).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_min`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_min(df_pd) a b 0 1 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_min(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -970,7 +1025,8 @@ def min(self) -> Self: ╞═════╪═════╡ │ 1 ┆ 3 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_min(df_pa) pyarrow.Table a: int64 b: int64 @@ -992,22 +1048,26 @@ def max(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [10, 20], "b": [50, 100]}) - >>> df_pl = pl.DataFrame({"a": [10, 20], "b": [50, 100]}) - >>> df_pa = pa.table({"a": [10, 20], "b": [50, 100]}) + >>> + >>> data = {"a": [10, 20], "b": [50, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max("a", "b")).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_max`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_max(df_pd) a b 0 20 100 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_max(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1016,7 +1076,8 @@ def max(self) -> Self: ╞═════╪═════╡ │ 20 ┆ 100 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_max(df_pa) pyarrow.Table a: int64 b: int64 @@ -1038,9 +1099,11 @@ def arg_min(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pl = pl.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pa = pa.table({"a": [10, 20], "b": [150, 100]}) + >>> + >>> data = {"a": [10, 20], "b": [150, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1050,11 +1113,13 @@ def arg_min(self) -> Self: ... nw.col("a", "b").arg_min().name.suffix("_arg_min") ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_arg_min`: >>> agnostic_arg_min(df_pd) a_arg_min b_arg_min 0 0 1 + >>> agnostic_arg_min(df_pl) shape: (1, 2) ┌───────────┬───────────┐ @@ -1064,6 +1129,7 @@ def arg_min(self) -> Self: ╞═══════════╪═══════════╡ │ 0 ┆ 1 │ └───────────┴───────────┘ + >>> agnostic_arg_min(df_pa) pyarrow.Table a_arg_min: int64 @@ -1086,9 +1152,11 @@ def arg_max(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pl = pl.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pa = pa.table({"a": [10, 20], "b": [150, 100]}) + >>> + >>> data = {"a": [10, 20], "b": [150, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1098,11 +1166,13 @@ def arg_max(self) -> Self: ... nw.col("a", "b").arg_max().name.suffix("_arg_max") ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_arg_max`: >>> agnostic_arg_max(df_pd) a_arg_max b_arg_max 0 1 0 + >>> agnostic_arg_max(df_pl) shape: (1, 2) ┌───────────┬───────────┐ @@ -1112,6 +1182,7 @@ def arg_max(self) -> Self: ╞═══════════╪═══════════╡ │ 1 ┆ 0 │ └───────────┴───────────┘ + >>> agnostic_arg_max(df_pa) pyarrow.Table a_arg_max: int64 @@ -1134,22 +1205,26 @@ def count(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [None, 4, 4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_count(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.all().count()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_count`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_count(df_pd) a b 0 3 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_count(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1158,7 +1233,8 @@ def count(self) -> Self: ╞═════╪═════╡ │ 3 ┆ 2 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_count(df_pa) pyarrow.Table a: int64 b: int64 @@ -1180,22 +1256,25 @@ def n_unique(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]}) - >>> df_pa = pa.table({"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]}) + >>> + >>> data = {"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_n_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").n_unique()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_n_unique`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_n_unique(df_pd) a b 0 5 3 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_n_unique(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1204,7 +1283,7 @@ def n_unique(self) -> Self: ╞═════╪═════╡ │ 5 ┆ 3 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_n_unique(df_pa) pyarrow.Table a: int64 b: int64 @@ -1231,24 +1310,28 @@ def unique(self, *, maintain_order: bool = False) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").unique(maintain_order=True)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_unique`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_unique(df_pd) a b 0 1 2 1 3 4 2 5 6 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_unique(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1259,7 +1342,8 @@ def unique(self, *, maintain_order: bool = False) -> Self: │ 3 ┆ 4 │ │ 5 ┆ 6 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_unique(df_pa) pyarrow.Table a: int64 b: int64 @@ -1283,6 +1367,7 @@ def abs(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, -2], "b": [-3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -1290,17 +1375,19 @@ def abs(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_abs(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").abs()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_abs`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_abs(df_pd) a b 0 1 3 1 2 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_abs(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1310,7 +1397,8 @@ def abs(self) -> Self: │ 1 ┆ 3 │ │ 2 ┆ 4 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_abs(df_pa) pyarrow.Table a: int64 b: int64 @@ -1335,26 +1423,29 @@ def cum_sum(self: Self, *, reverse: bool = False) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").cum_sum()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_cum_sum(df_pd) a b 0 1 2 1 2 6 2 5 10 3 10 16 4 15 22 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_cum_sum(df_pl) shape: (5, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1367,7 +1458,7 @@ def cum_sum(self: Self, *, reverse: bool = False) -> Self: │ 10 ┆ 16 │ │ 15 ┆ 22 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_cum_sum(df_pa) pyarrow.Table a: int64 b: int64 @@ -1395,31 +1486,35 @@ def diff(self) -> Self: nw.col("a").diff().fill_null(0).cast(nw.Int64) Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_diff(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(a_diff=nw.col("a").diff()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_diff`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_diff(df_pd) a_diff 0 NaN 1 0.0 2 2.0 3 2.0 4 0.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_diff(df_pl) shape: (5, 1) ┌────────┐ │ a_diff │ @@ -1432,7 +1527,8 @@ def diff(self) -> Self: │ 2 │ │ 0 │ └────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_diff(df_pa) pyarrow.Table a_diff: int64 ---- @@ -1459,31 +1555,35 @@ def shift(self, n: int) -> Self: nw.col("a").shift(1).fill_null(0).cast(nw.Int64) Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_shift(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(a_shift=nw.col("a").shift(n=1)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_shift`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_shift(df_pd) a_shift 0 NaN 1 1.0 2 1.0 3 3.0 4 5.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_shift(df_pl) shape: (5, 1) ┌─────────┐ │ a_shift │ @@ -1496,7 +1596,8 @@ def shift(self, n: int) -> Self: │ 3 │ │ 5 │ └─────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_shift(df_pa) pyarrow.Table a_shift: int64 ---- @@ -1528,18 +1629,20 @@ def replace_strict( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [3, 0, 1, 2]}) - >>> df_pl = pl.DataFrame({"a": [3, 0, 1, 2]}) - >>> df_pa = pa.table({"a": [3, 0, 1, 2]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [3, 0, 1, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define dataframe-agnostic functions: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_replace_strict(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... b=nw.col("a").replace_strict( @@ -1549,15 +1652,17 @@ def replace_strict( ... ) ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_replace_strict`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_replace_strict(df_pd) a b 0 3 three 1 0 zero 2 1 one 3 2 two - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_replace_strict(df_pl) shape: (4, 2) ┌─────┬───────┐ │ a ┆ b │ @@ -1569,7 +1674,8 @@ def replace_strict( │ 1 ┆ one │ │ 2 ┆ two │ └─────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_replace_strict(df_pa) pyarrow.Table a: int64 b: string @@ -1602,35 +1708,38 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [5, None, 1, 2]}) - >>> df_pl = pl.DataFrame({"a": [5, None, 1, 2]}) - >>> df_pa = pa.table({"a": [5, None, 1, 2]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [5, None, 1, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define dataframe-agnostic functions: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sort(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").sort()).to_native() - >>> def func_descend(df): - ... df = nw.from_native(df) - ... df = df.select(nw.col("a").sort(descending=True)) - ... return nw.to_native(df) + >>> def agnostic_sort_descending(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... 
return df.select(nw.col("a").sort(descending=True)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sort` and `agnostic_sort_descending`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sort(df_pd) a 1 NaN 2 1.0 3 2.0 0 5.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sort(df_pl) shape: (4, 1) ┌──────┐ │ a │ @@ -1642,19 +1751,21 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: │ 2 │ │ 5 │ └──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sort(df_pa) pyarrow.Table a: int64 ---- a: [[null,1,2,5]] - >>> func_descend(df_pd) + >>> agnostic_sort_descending(df_pd) a 1 NaN 0 5.0 3 2.0 2 1.0 - >>> func_descend(df_pl) + + >>> agnostic_sort_descending(df_pl) shape: (4, 1) ┌──────┐ │ a │ @@ -1666,7 +1777,8 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: │ 2 │ │ 1 │ └──────┘ - >>> func_descend(df_pa) + + >>> agnostic_sort_descending(df_pa) pyarrow.Table a: int64 ---- @@ -1701,26 +1813,30 @@ def is_between( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df_pa = pa.table({"a": [1, 2, 3, 4, 5]}) + >>> + >>> data = {"a": [1, 2, 3, 4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_between(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a").is_between(2, 4, "right")).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_between`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_between(df_pd) a 0 False 1 False 2 True 3 True 4 False - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_between(df_pl) shape: (5, 1) ┌───────┐ │ a │ @@ -1733,7 +1849,8 @@ def is_between( │ true │ │ false │ └───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_between(df_pa) pyarrow.Table a: bool ---- @@ -1762,26 +1879,29 @@ def is_in(self, other: Any) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 9, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 9, 10]}) - >>> df_pa = pa.table({"a": [1, 2, 9, 10]}) + >>> + >>> data = {"a": [1, 2, 9, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_in(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.with_columns(b=nw.col("a").is_in([1, 2])).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_in`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_in(df_pd) a b 0 1 True 1 2 True 2 9 False 3 10 False - >>> my_library_agnostic_function(df_pl) + >>> agnostic_is_in(df_pl) shape: (4, 2) ┌─────┬───────┐ │ a ┆ b │ @@ -1793,7 +1913,8 @@ def is_in(self, other: Any) -> Self: │ 9 ┆ false │ │ 10 ┆ false │ └─────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_in(df_pa) pyarrow.Table a: int64 b: bool @@ -1821,32 +1942,36 @@ def filter(self, *predicates: Any) -> Self: A new expression. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]}) - >>> df_pl = pl.DataFrame({"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]}) - >>> df_pa = pa.table({"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]}) + >>> + >>> data = {"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").filter(nw.col("a") > 4), ... nw.col("b").filter(nw.col("b") < 13), ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_filter`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_filter(df_pd) a b 3 5 10 4 6 11 5 7 12 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_filter(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1857,7 +1982,8 @@ def filter(self, *predicates: Any) -> Self: │ 6 ┆ 11 │ │ 7 ┆ 12 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_filter(df_pa) pyarrow.Table a: int64 b: int64 @@ -1888,13 +2014,19 @@ def is_null(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> df_pd = pd.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - ... ) - >>> df_pl = pl.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, None, 3.0, 5.0]} + ... { + ... "a": [2, 4, None, 3, 5], + ... "b": [2.0, 4.0, float("nan"), 3.0, 5.0], + ... } ... ) - >>> df_pa = pa.table({"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, None, 3.0, 5.0]}) + >>> data = { + ... "a": [2, 4, None, 3, 5], + ... "b": [2.0, 4.0, None, 3.0, 5.0], + ... } + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1904,7 +2036,8 @@ def is_null(self) -> Self: ... a_is_null=nw.col("a").is_null(), b_is_null=nw.col("b").is_null() ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_is_null`: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_is_null`: >>> agnostic_is_null(df_pd) a b a_is_null b_is_null @@ -1959,6 +2092,7 @@ def is_nan(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"orig": [0.0, None, 2.0]} >>> df_pd = pd.DataFrame(data).astype({"orig": "Float64"}) >>> df_pl = pl.DataFrame(data) @@ -1973,7 +2107,8 @@ def is_nan(self) -> Self: ... divided_is_nan=(nw.col("orig") / nw.col("orig")).is_nan(), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_self_div_is_nan`: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_self_div_is_nan`: >>> print(agnostic_self_div_is_nan(df_pd)) orig divided divided_is_nan @@ -2002,7 +2137,6 @@ def is_nan(self) -> Self: orig: [[0,null,2]] divided: [[nan,null,1]] divided_is_nan: [[true,null,false]] - """ return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan()) @@ -2018,6 +2152,7 @@ def arg_true(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, None, None, 2]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2025,17 +2160,19 @@ def arg_true(self) -> Self: We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_arg_true(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a").is_null().arg_true()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_true`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_arg_true(df_pd) a 1 1 2 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_arg_true(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -2045,7 +2182,8 @@ def arg_true(self) -> Self: │ 1 │ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_arg_true(df_pa) pyarrow.Table a: int64 ---- @@ -2080,24 +2218,19 @@ def fill_null( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> df_pd = pd.DataFrame( ... { ... "a": [2, 4, None, None, 3, 5], ... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0], ... } ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "a": [2, 4, None, None, 3, 5], - ... "b": [2.0, 4.0, None, None, 3.0, 5.0], - ... } - ... ) - >>> df_pa = pa.table( - ... { - ... "a": [2, 4, None, None, 3, 5], - ... "b": [2.0, 4.0, None, None, 3.0, 5.0], - ... } - ... ) + >>> data = { + ... "a": [2, 4, None, None, 3, 5], + ... "b": [2.0, 4.0, None, None, 3.0, 5.0], + ... } + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -2105,7 +2238,8 @@ def fill_null( ... df = nw.from_native(df_native) ... return df.with_columns(nw.col("a", "b").fill_null(0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_fill_null`: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_fill_null`: >>> agnostic_fill_null(df_pd) a b @@ -2213,12 +2347,12 @@ def drop_nulls(self) -> Self: for reference. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> df_pd = pd.DataFrame({"a": [2.0, 4.0, float("nan"), 3.0, None, 5.0]}) >>> df_pl = pl.DataFrame({"a": [2.0, 4.0, None, 3.0, None, 5.0]}) >>> df_pa = pa.table({"a": [2.0, 4.0, None, 3.0, None, 5.0]}) @@ -2229,7 +2363,8 @@ def drop_nulls(self) -> Self: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").drop_nulls()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_drop_nulls`: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_drop_nulls`: >>> agnostic_drop_nulls(df_pd) a @@ -2237,6 +2372,7 @@ def drop_nulls(self) -> Self: 1 4.0 3 3.0 5 5.0 + >>> agnostic_drop_nulls(df_pl) shape: (4, 1) ┌─────┐ @@ -2249,6 +2385,7 @@ def drop_nulls(self) -> Self: │ 3.0 │ │ 5.0 │ └─────┘ + >>> agnostic_drop_nulls(df_pa) pyarrow.Table a: double @@ -2278,31 +2415,35 @@ def sample( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [1, 2, 3]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3]}) - >>> df_pa = pa.table({"a": [1, 2, 3]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sample(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").sample(fraction=1.0, with_replacement=True) ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sample`: - >>> my_library_agnostic_function(df_pd) # doctest: +SKIP + >>> agnostic_sample(df_pd) # doctest: +SKIP a 2 3 0 1 2 3 - >>> my_library_agnostic_function(df_pl) # doctest: +SKIP + + >>> agnostic_sample(df_pl) # doctest: +SKIP shape: (3, 1) ┌─────┐ │ a │ @@ -2313,7 +2454,8 @@ def sample( │ 3 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) # doctest: +SKIP + + >>> agnostic_sample(df_pa) # doctest: +SKIP pyarrow.Table a: int64 ---- @@ -2337,11 +2479,12 @@ def over(self, *keys: str | Iterable[str]) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [1, 1, 2]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2355,13 +2498,15 @@ def over(self, *keys: str | Iterable[str]) -> Self: ... a_min_per_group=nw.col("a").min().over("b") ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_min_over_b`: >>> agnostic_min_over_b(df_pd) a b a_min_per_group 0 1 1 1 1 2 1 1 2 3 2 3 + >>> agnostic_min_over_b(df_pl) shape: (3, 3) ┌─────┬─────┬─────────────────┐ @@ -2373,6 +2518,7 @@ def over(self, *keys: str | Iterable[str]) -> Self: │ 2 ┆ 1 ┆ 1 │ │ 3 ┆ 2 ┆ 3 │ └─────┴─────┴─────────────────┘ + >>> agnostic_min_over_b(df_pa) pyarrow.Table a: int64 @@ -2395,6 +2541,7 @@ def over(self, *keys: str | Iterable[str]) -> Self: 0 1 1 1 1 2 1 3 2 3 2 3 + >>> agnostic_cum_sum(df_pl) shape: (3, 3) ┌─────┬─────┬─────┐ @@ -2418,11 +2565,12 @@ def is_duplicated(self) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2430,19 +2578,21 @@ def is_duplicated(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_duplicated(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all().is_duplicated()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_duplicated`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_duplicated(df_pd) a b 0 True True 1 False True 2 False False 3 True False - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_duplicated(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ @@ -2454,7 +2604,8 @@ def is_duplicated(self) -> Self: │ false ┆ false │ │ true ┆ false │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_duplicated(df_pa) pyarrow.Table a: bool b: bool @@ -2471,11 +2622,12 @@ def is_unique(self) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2483,19 +2635,21 @@ def is_unique(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all().is_unique()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_unique`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_unique(df_pd) a b 0 False False 1 True False 2 True True 3 False True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_unique(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ @@ -2507,7 +2661,8 @@ def is_unique(self) -> Self: │ true ┆ true │ │ false ┆ true │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_unique(df_pa) pyarrow.Table a: bool b: bool @@ -2529,11 +2684,12 @@ def null_count(self) -> Self: for reference. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2545,11 +2701,13 @@ def null_count(self) -> Self: ... df = nw.from_native(df_native) ... 
return df.select(nw.all().null_count()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_null_count`: + We can then pass any supported library such as Pandas, Polars, or + PyArrow to `agnostic_null_count`: >>> agnostic_null_count(df_pd) a b 0 1 2 + >>> agnostic_null_count(df_pl) shape: (1, 2) ┌─────┬─────┐ @@ -2559,6 +2717,7 @@ def null_count(self) -> Self: ╞═════╪═════╡ │ 1 ┆ 2 │ └─────┴─────┘ + >>> agnostic_null_count(df_pa) pyarrow.Table a: int64 @@ -2576,11 +2735,12 @@ def is_first_distinct(self) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2588,19 +2748,21 @@ def is_first_distinct(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_first_distinct(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.all().is_first_distinct()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_first_distinct`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_first_distinct(df_pd) a b 0 True True 1 True False 2 True True 3 False True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_first_distinct(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ @@ -2612,7 +2774,8 @@ def is_first_distinct(self) -> Self: │ true ┆ true │ │ false ┆ true │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_first_distinct(df_pa) pyarrow.Table a: bool b: bool @@ -2631,11 +2794,12 @@ def is_last_distinct(self) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2643,19 +2807,21 @@ def is_last_distinct(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_last_distinct(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.all().is_last_distinct()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_last_distinct`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_last_distinct(df_pd) a b 0 False False 1 True True 2 True True 3 True True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_last_distinct(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ @@ -2667,7 +2833,8 @@ def is_last_distinct(self) -> Self: │ true ┆ true │ │ true ┆ true │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_last_distinct(df_pa) pyarrow.Table a: bool b: bool @@ -2699,11 +2866,12 @@ def quantile( native 'dask' - method. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": list(range(50)), "b": list(range(50, 100))} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2711,19 +2879,20 @@ def quantile( Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_quantile(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a", "b").quantile(0.5, interpolation="linear") ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_quantile`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_quantile(df_pd) a b 0 24.5 74.5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_quantile(df_pl) shape: (1, 2) ┌──────┬──────┐ │ a ┆ b │ @@ -2732,7 +2901,8 @@ def quantile( ╞══════╪══════╡ │ 24.5 ┆ 74.5 │ └──────┴──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_quantile(df_pa) pyarrow.Table a: double b: double @@ -2754,11 +2924,12 @@ def head(self, n: int = 10) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": list(range(10))} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2766,18 +2937,20 @@ def head(self, n: int = 10) -> Self: Let's define a dataframe-agnostic function that returns the first 3 rows: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_head(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").head(3)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_head`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_head(df_pd) a 0 0 1 1 2 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_head(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2788,7 +2961,8 @@ def head(self, n: int = 10) -> Self: │ 1 │ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_head(df_pa) pyarrow.Table a: int64 ---- @@ -2806,11 +2980,12 @@ def tail(self, n: int = 10) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": list(range(10))} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2818,18 +2993,20 @@ def tail(self, n: int = 10) -> Self: Let's define a dataframe-agnostic function that returns the last 3 rows: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_tail(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").tail(3)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_tail`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_tail(df_pd) a 7 7 8 8 9 9 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_tail(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2840,7 +3017,8 @@ def tail(self, n: int = 10) -> Self: │ 8 │ │ 9 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_tail(df_pa) pyarrow.Table a: int64 ---- @@ -2867,11 +3045,12 @@ def round(self, decimals: int = 0) -> Self: Polars and Arrow round away from 0 (e.g. -0.5 to -1.0, 0.5 to 1.0, 1.5 to 2.0, 2.5 to 3.0, etc..). 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.12345, 2.56789, 3.901234]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2879,18 +3058,20 @@ def round(self, decimals: int = 0) -> Self: Let's define a dataframe-agnostic function that rounds to the first decimal: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_round(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").round(1)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_round`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_round(df_pd) a 0 1.1 1 2.6 2 3.9 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_round(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2901,7 +3082,8 @@ def round(self, decimals: int = 0) -> Self: │ 2.6 │ │ 3.9 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_round(df_pa) pyarrow.Table a: double ---- @@ -2918,31 +3100,35 @@ def len(self) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": ["x", "y", "z"], "b": [1, 2, 1]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function that computes the len over different values of "b" column: + Let's define a dataframe-agnostic function that computes the len over + different values of "b" column: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").filter(nw.col("b") == 1).len().alias("a1"), ... nw.col("a").filter(nw.col("b") == 2).len().alias("a2"), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_len`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_len(df_pd) a1 a2 0 2 1 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_len(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a1 ┆ a2 │ @@ -2951,7 +3137,8 @@ def len(self) -> Self: ╞═════╪═════╡ │ 2 ┆ 1 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_len(df_pa) pyarrow.Table a1: int64 a2: int64 @@ -2972,11 +3159,12 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2985,17 +3173,19 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: Let's define a dataframe-agnostic function in which gather every 2 rows, starting from a offset of 1: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_gather_every(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").gather_every(n=2, offset=1)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_gather_every`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_gather_every(df_pd) a 1 2 3 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_gather_every(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -3005,7 +3195,8 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: │ 2 │ │ 4 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_gather_every(df_pa) pyarrow.Table a: int64 ---- @@ -3037,29 +3228,31 @@ def clip( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - - >>> s = [1, 2, 3] - >>> df_pd = pd.DataFrame({"s": s}) - >>> df_pl = pl.DataFrame({"s": s}) - >>> df_pa = pa.table({"s": s}) + >>> + >>> data = {"a": [1, 2, 3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def func_lower(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_clip_lower(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... 
return df.select(nw.col("s").clip(2)).to_native() + ... return df.select(nw.col("a").clip(2)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func_lower`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_clip_lower`: - >>> func_lower(df_pd) - s + >>> agnostic_clip_lower(df_pd) + a 0 2 1 2 2 3 - >>> func_lower(df_pl) + + >>> agnostic_clip_lower(df_pl) shape: (3, 1) ┌─────┐ - │ s │ + │ a │ │ --- │ │ i64 │ ╞═════╡ @@ -3067,29 +3260,32 @@ def clip( │ 2 │ │ 3 │ └─────┘ - >>> func_lower(df_pa) + + >>> agnostic_clip_lower(df_pa) pyarrow.Table - s: int64 + a: int64 ---- - s: [[2,2,3]] + a: [[2,2,3]] We define another library agnostic function: - >>> def func_upper(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_clip_upper(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(nw.col("s").clip(upper_bound=2)).to_native() + ... return df.select(nw.col("a").clip(upper_bound=2)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func_upper`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_clip_upper`: - >>> func_upper(df_pd) - s + >>> agnostic_clip_upper(df_pd) + a 0 1 1 2 2 2 - >>> func_upper(df_pl) + + >>> agnostic_clip_upper(df_pl) shape: (3, 1) ┌─────┐ - │ s │ + │ a │ │ --- │ │ i64 │ ╞═════╡ @@ -3097,39 +3293,42 @@ def clip( │ 2 │ │ 2 │ └─────┘ - >>> func_upper(df_pa) + + >>> agnostic_clip_upper(df_pa) pyarrow.Table - s: int64 + a: int64 ---- - s: [[1,2,2]] + a: [[1,2,2]] We can have both at the same time - >>> s = [-1, 1, -3, 3, -5, 5] - >>> df_pd = pd.DataFrame({"s": s}) - >>> df_pl = pl.DataFrame({"s": s}) - >>> df_pa = pa.table({"s": s}) + >>> data = {"a": [-1, 1, -3, 3, -5, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def
my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_clip(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(nw.col("s").clip(-1, 3)).to_native() + ... return df.select(nw.col("a").clip(-1, 3)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_clip`: - >>> my_library_agnostic_function(df_pd) - s + >>> agnostic_clip(df_pd) + a 0 -1 1 1 2 -1 3 3 4 -1 5 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_clip(df_pl) shape: (6, 1) ┌─────┐ - │ s │ + │ a │ │ --- │ │ i64 │ ╞═════╡ @@ -3140,11 +3339,12 @@ def clip( │ -1 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_clip(df_pa) pyarrow.Table - s: int64 + a: int64 ---- - s: [[-1,1,-1,3,-1,3]] + a: [[-1,1,-1,3,-1,3]] """ return self.__class__( lambda plx: self._to_compliant_expr(plx).clip( @@ -3167,7 +3367,7 @@ def mode(self: Self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - + >>> >>> data = { ... "a": [1, 1, 2, 3], ... "b": [1, 1, 2, 2], @@ -3178,17 +3378,18 @@ def mode(self: Self) -> Self: We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mode(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ...
return df.select(nw.col("a").mode()).sort("a").to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_mode`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mode(df_pd) a 0 1 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_mode(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -3198,7 +3399,7 @@ def mode(self: Self) -> Self: │ 1 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_mode(df_pa) pyarrow.Table a: int64 ---- @@ -3218,28 +3419,34 @@ def is_finite(self: Self) -> Self: Expression of `Boolean` data type. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [float("nan"), float("inf"), 2.0, None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_finite(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a").is_finite()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_finite`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_is_finite(df_pd) a 0 False 1 False 2 True 3 False - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_is_finite(df_pl) shape: (4, 1) ┌───────┐ │ a │ @@ -3252,7 +3459,7 @@ def is_finite(self: Self) -> Self: │ null │ └───────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_is_finite(df_pa) pyarrow.Table a: bool ---- @@ -3270,32 +3477,37 @@ def cum_count(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": ["x", "k", None, "d"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_count(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_count().alias("cum_count"), ... nw.col("a").cum_count(reverse=True).alias("cum_count_reverse"), ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_count`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_count(df_pd) a cum_count cum_count_reverse 0 x 1 3 1 k 2 2 2 None 2 1 3 d 3 1 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_count(df_pl) shape: (4, 3) ┌──────┬───────────┬───────────────────┐ │ a ┆ cum_count ┆ cum_count_reverse │ @@ -3308,7 +3520,7 @@ def cum_count(self: Self, *, reverse: bool = False) -> Self: │ d ┆ 3 ┆ 1 │ └──────┴───────────┴───────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_count(df_pa) pyarrow.Table a: string cum_count: uint32 @@ -3332,32 +3544,37 @@ def cum_min(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [3, 1, None, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_min().alias("cum_min"), ... nw.col("a").cum_min(reverse=True).alias("cum_min_reverse"), ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_min`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_min(df_pd) a cum_min cum_min_reverse 0 3.0 3.0 1.0 1 1.0 1.0 1.0 2 NaN NaN NaN 3 2.0 1.0 2.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_min(df_pl) shape: (4, 3) ┌──────┬─────────┬─────────────────┐ │ a ┆ cum_min ┆ cum_min_reverse │ @@ -3370,7 +3587,7 @@ def cum_min(self: Self, *, reverse: bool = False) -> Self: │ 2 ┆ 1 ┆ 2 │ └──────┴─────────┴─────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_min(df_pa) pyarrow.Table a: int64 cum_min: int64 @@ -3394,32 +3611,37 @@ def cum_max(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 3, None, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_max().alias("cum_max"), ... nw.col("a").cum_max(reverse=True).alias("cum_max_reverse"), ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_max`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_max(df_pd) a cum_max cum_max_reverse 0 1.0 1.0 3.0 1 3.0 3.0 3.0 2 NaN NaN NaN 3 2.0 3.0 2.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_max(df_pl) shape: (4, 3) ┌──────┬─────────┬─────────────────┐ │ a ┆ cum_max ┆ cum_max_reverse │ @@ -3432,7 +3654,7 @@ def cum_max(self: Self, *, reverse: bool = False) -> Self: │ 2 ┆ 3 ┆ 2 │ └──────┴─────────┴─────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_max(df_pa) pyarrow.Table a: int64 cum_max: int64 @@ -3456,32 +3678,37 @@ def cum_prod(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 3, None, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_prod(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_prod().alias("cum_prod"), ... nw.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), ...
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_prod`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_prod(df_pd) a cum_prod cum_prod_reverse 0 1.0 1.0 6.0 1 3.0 3.0 6.0 2 NaN NaN NaN 3 2.0 6.0 2.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_prod(df_pl) shape: (4, 3) ┌──────┬──────────┬──────────────────┐ │ a ┆ cum_prod ┆ cum_prod_reverse │ @@ -3494,7 +3721,7 @@ def cum_prod(self: Self, *, reverse: bool = False) -> Self: │ 2 ┆ 6 ┆ 2 │ └──────┴──────────┴──────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_prod(df_pa) pyarrow.Table a: int64 cum_prod: int64 @@ -3540,11 +3767,12 @@ def rolling_sum( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3558,7 +3786,8 @@ def rolling_sum( ... b=nw.col("a").rolling_sum(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_sum`: >>> agnostic_rolling_sum(df_pd) a b @@ -3580,7 +3809,7 @@ def rolling_sum( │ 4.0 ┆ 6.0 │ └──────┴─────┘ - >>> agnostic_rolling_sum(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_sum(df_pa) pyarrow.Table a: double b: double @@ -3632,11 +3861,12 @@ def rolling_mean( A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3650,7 +3880,8 @@ def rolling_mean( ... b=nw.col("a").rolling_mean(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_mean`: >>> agnostic_rolling_mean(df_pd) a b @@ -3672,7 +3903,7 @@ def rolling_mean( │ 4.0 ┆ 3.0 │ └──────┴─────┘ - >>> agnostic_rolling_mean(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_mean(df_pa) pyarrow.Table a: double b: double @@ -3726,11 +3957,12 @@ def rolling_var( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3744,7 +3976,8 @@ def rolling_var( ... b=nw.col("a").rolling_var(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_var`: >>> agnostic_rolling_var(df_pd) a b @@ -3766,7 +3999,7 @@ def rolling_var( │ 4.0 ┆ 2.0 │ └──────┴──────┘ - >>> agnostic_rolling_var(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_var(df_pa) pyarrow.Table a: double b: double @@ -3818,11 +4051,12 @@ def rolling_std( A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3836,7 +4070,8 @@ def rolling_std( ... b=nw.col("a").rolling_std(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_std`: >>> agnostic_rolling_std(df_pd) a b @@ -3858,7 +4093,7 @@ def rolling_std( │ 4.0 ┆ 1.414214 │ └──────┴──────────┘ - >>> agnostic_rolling_std(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_std(df_pa) pyarrow.Table a: double b: double @@ -3918,8 +4153,10 @@ def get_categories(self: Self) -> ExprT: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", "mango"]} >>> df_pd = pd.DataFrame(data, dtype="category") >>> df_pl = pl.DataFrame(data, schema={"fruits": pl.Categorical}) @@ -3927,17 +4164,19 @@ def get_categories(self: Self) -> ExprT: We define a dataframe-agnostic function to get unique categories from column 'fruits': - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cat_get_categories(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("fruits").cat.get_categories()).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas or Polars to + `agnostic_cat_get_categories`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_cat_get_categories(df_pd) fruits 0 apple 1 mango - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_cat_get_categories(df_pl) shape: (2, 1) ┌────────┐ │ fruits │ @@ -3966,23 +4205,27 @@ def len_chars(self: Self) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"words": ["foo", "Café", "345", "東京", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_len_chars(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... words_len=nw.col("words").str.len_chars() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_len_chars`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_len_chars(df_pd) words words_len 0 foo 3.0 1 Café 4.0 @@ -3990,7 +4233,7 @@ def len_chars(self: Self) -> ExprT: 3 東京 2.0 4 None NaN - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_len_chars(df_pl) shape: (5, 2) ┌───────┬───────────┐ │ words ┆ words_len │ @@ -4003,6 +4246,14 @@ def len_chars(self: Self) -> ExprT: │ 東京 ┆ 2 │ │ null ┆ null │ └───────┴───────────┘ + + >>> agnostic_str_len_chars(df_pa) + pyarrow.Table + words: string + words_len: int32 + ---- + words: [["foo","Café","345","東京",null]] + words_len: [[3,4,3,2,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.len_chars() @@ -4025,27 +4276,31 @@ def replace( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"foo": ["123abc", "abc abc123"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_replace(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... df = df.with_columns(replaced=nw.col("foo").str.replace("abc", "")) ... 
return df.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_replace`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_replace(df_pd) foo replaced 0 123abc 123 1 abc abc123 abc123 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_replace(df_pl) shape: (2, 2) ┌────────────┬──────────┐ │ foo ┆ replaced │ @@ -4055,6 +4310,14 @@ def replace( │ 123abc ┆ 123 │ │ abc abc123 ┆ abc123 │ └────────────┴──────────┘ + + >>> agnostic_str_replace(df_pa) + pyarrow.Table + foo: string + replaced: string + ---- + foo: [["123abc","abc abc123"]] + replaced: [["123"," abc123"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.replace( @@ -4078,27 +4341,31 @@ def replace_all( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"foo": ["123abc", "abc abc123"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_replace_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... df = df.with_columns(replaced=nw.col("foo").str.replace_all("abc", "")) ... 
return df.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_replace_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_replace_all(df_pd) foo replaced 0 123abc 123 1 abc abc123 123 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_replace_all(df_pl) shape: (2, 2) ┌────────────┬──────────┐ │ foo ┆ replaced │ @@ -4108,6 +4375,14 @@ def replace_all( │ 123abc ┆ 123 │ │ abc abc123 ┆ 123 │ └────────────┴──────────┘ + + >>> agnostic_str_replace_all(df_pa) + pyarrow.Table + foo: string + replaced: string + ---- + foo: [["123abc","abc abc123"]] + replaced: [["123"," 123"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.replace_all( @@ -4119,34 +4394,44 @@ def strip_chars(self: Self, characters: str | None = None) -> ExprT: r"""Remove leading and trailing characters. Arguments: - characters: The set of characters to be removed. All combinations of this set of characters will be stripped from the start and end of the string. If set to None (default), all leading and trailing whitespace is removed instead. + characters: The set of characters to be removed. All combinations of this + set of characters will be stripped from the start and end of the string. + If set to None (default), all leading and trailing whitespace is removed + instead. Returns: A new expression. 
Examples: + >>> from typing import Any >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrame - >>> from typing import Any + >>> >>> data = {"fruits": ["apple", "\nmango"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrame) -> dict[str, Any]: + >>> def agnostic_str_strip_chars(df_native: IntoFrame) -> dict[str, Any]: ... df = nw.from_native(df_native) ... df = df.with_columns(stripped=nw.col("fruits").str.strip_chars()) ... return df.to_dict(as_series=False) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_strip_chars`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_strip_chars(df_pd) {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_strip_chars(df_pl) + {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} + + >>> agnostic_str_strip_chars(df_pa) {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} """ return self._expr.__class__( @@ -4165,29 +4450,33 @@ def starts_with(self: Self, prefix: str) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_starts_with(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... has_prefix=nw.col("fruits").str.starts_with("app") ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_starts_with`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_starts_with(df_pd) fruits has_prefix 0 apple True 1 mango False 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_starts_with(df_pl) shape: (3, 2) ┌────────┬────────────┐ │ fruits ┆ has_prefix │ @@ -4198,6 +4487,14 @@ def starts_with(self: Self, prefix: str) -> ExprT: │ mango ┆ false │ │ null ┆ null │ └────────┴────────────┘ + + >>> agnostic_str_starts_with(df_pa) + pyarrow.Table + fruits: string + has_prefix: bool + ---- + fruits: [["apple","mango",null]] + has_prefix: [[true,false,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.starts_with(prefix) @@ -4215,29 +4512,33 @@ def ends_with(self: Self, suffix: str) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_ends_with(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... has_suffix=nw.col("fruits").str.ends_with("ngo") ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_ends_with`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_ends_with(df_pd) fruits has_suffix 0 apple False 1 mango True 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_ends_with(df_pl) shape: (3, 2) ┌────────┬────────────┐ │ fruits ┆ has_suffix │ @@ -4248,6 +4549,14 @@ def ends_with(self: Self, suffix: str) -> ExprT: │ mango ┆ true │ │ null ┆ null │ └────────┴────────────┘ + + >>> agnostic_str_ends_with(df_pa) + pyarrow.Table + fruits: string + has_suffix: bool + ---- + fruits: [["apple","mango",null]] + has_suffix: [[false,true,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.ends_with(suffix) @@ -4270,6 +4579,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"pets": ["cat", "dog", "rabbit and parrot", "dove", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -4277,7 +4587,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: We define a dataframe-agnostic function: - >>> def agnostic_contains(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_contains(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... default_match=nw.col("pets").str.contains("parrot|Dove"), @@ -4287,9 +4597,10 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: ... ), ... 
).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_contains`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_contains`: - >>> agnostic_contains(df_pd) + >>> agnostic_str_contains(df_pd) pets default_match case_insensitive_match literal_match 0 cat False False False 1 dog False False False @@ -4297,7 +4608,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: 3 dove False True False 4 None None None None - >>> agnostic_contains(df_pl) + >>> agnostic_str_contains(df_pl) shape: (5, 4) ┌───────────────────┬───────────────┬────────────────────────┬───────────────┐ │ pets ┆ default_match ┆ case_insensitive_match ┆ literal_match │ @@ -4311,7 +4622,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: │ null ┆ null ┆ null ┆ null │ └───────────────────┴───────────────┴────────────────────────┴───────────────┘ - >>> agnostic_contains(df_pa) + >>> agnostic_str_contains(df_pa) pyarrow.Table pets: string default_match: bool @@ -4343,30 +4654,34 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"s": ["pear", None, "papaya", "dragonfruit"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_slice(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... s_sliced=nw.col("s").str.slice(4, length=3) ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_slice`: - >>> my_library_agnostic_function(df_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_str_slice(df_pd) # doctest: +NORMALIZE_WHITESPACE s s_sliced 0 pear 1 None None 2 papaya ya 3 dragonfruit onf - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_slice(df_pl) shape: (4, 2) ┌─────────────┬──────────┐ │ s ┆ s_sliced │ @@ -4379,20 +4694,28 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT: │ dragonfruit ┆ onf │ └─────────────┴──────────┘ + >>> agnostic_str_slice(df_pa) + pyarrow.Table + s: string + s_sliced: string + ---- + s: [["pear",null,"papaya","dragonfruit"]] + s_sliced: [["",null,"ya","onf"]] + Using negative indexes: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_slice_negative(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.with_columns(s_sliced=nw.col("s").str.slice(-3)).to_native() - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_slice_negative(df_pd) s s_sliced 0 pear ear 1 None None 2 papaya aya 3 dragonfruit uit - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_slice_negative(df_pl) shape: (4, 2) ┌─────────────┬──────────┐ │ s ┆ s_sliced │ @@ -4404,6 +4727,14 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT: │ papaya ┆ aya │ │ dragonfruit ┆ uit │ └─────────────┴──────────┘ + + >>> agnostic_str_slice_negative(df_pa) + pyarrow.Table + s: string + s_sliced: string + ---- + s: [["pear",null,"papaya","dragonfruit"]] + s_sliced: [["ear",null,"aya","uit"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.slice( @@ -4426,30 +4757,34 @@ def head(self: Self, n: int = 5) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_head(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... lyrics_head=nw.col("lyrics").str.head() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_head`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_head(df_pd) lyrics lyrics_head 0 Atatata Atata 1 taata taata 2 taatatata taata 3 zukkyun zukky - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_head(df_pl) shape: (4, 2) ┌───────────┬─────────────┐ │ lyrics ┆ lyrics_head │ @@ -4461,6 +4796,14 @@ def head(self: Self, n: int = 5) -> ExprT: │ taatatata ┆ taata │ │ zukkyun ┆ zukky │ └───────────┴─────────────┘ + + >>> agnostic_str_head(df_pa) + pyarrow.Table + lyrics: string + lyrics_head: string + ---- + lyrics: [["Atatata","taata","taatatata","zukkyun"]] + lyrics_head: [["Atata","taata","taata","zukky"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.slice(0, n) @@ -4481,30 +4824,34 @@ def tail(self: Self, n: int = 5) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_tail(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... lyrics_tail=nw.col("lyrics").str.tail() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_tail`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_tail(df_pd) lyrics lyrics_tail 0 Atatata atata 1 taata taata 2 taatatata atata 3 zukkyun kkyun - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_tail(df_pl) shape: (4, 2) ┌───────────┬─────────────┐ │ lyrics ┆ lyrics_tail │ @@ -4516,6 +4863,14 @@ def tail(self: Self, n: int = 5) -> ExprT: │ taatatata ┆ atata │ │ zukkyun ┆ kkyun │ └───────────┴─────────────┘ + + >>> agnostic_str_tail(df_pa) + pyarrow.Table + lyrics: string + lyrics_tail: string + ---- + lyrics: [["Atatata","taata","taatatata","zukkyun"]] + lyrics_tail: [["atata","taata","atata","kkyun"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.slice( @@ -4549,6 +4904,7 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = ["2020-01-01", "2020-01-02"] >>> df_pd = pd.DataFrame({"a": data}) >>> df_pl = pl.DataFrame({"a": data}) @@ -4556,19 +4912,21 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_to_datetime(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").str.to_datetime(format="%Y-%m-%d") ... 
).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_datetime`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_to_datetime(df_pd) a 0 2020-01-01 1 2020-01-02 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_str_to_datetime(df_pl) shape: (2, 1) ┌─────────────────────┐ │ a │ @@ -4578,7 +4936,8 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 │ 2020-01-01 00:00:00 │ │ 2020-01-02 00:00:00 │ └─────────────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_str_to_datetime(df_pa) pyarrow.Table a: timestamp[us] ---- @@ -4602,29 +4961,33 @@ def to_uppercase(self: Self) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_to_uppercase(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... upper_col=nw.col("fruits").str.to_uppercase() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_uppercase`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_to_uppercase(df_pd) fruits upper_col 0 apple APPLE 1 mango MANGO 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_to_uppercase(df_pl) shape: (3, 2) ┌────────┬───────────┐ │ fruits ┆ upper_col │ @@ -4636,6 +4999,13 @@ def to_uppercase(self: Self) -> ExprT: │ null ┆ null │ └────────┴───────────┘ + >>> agnostic_str_to_uppercase(df_pa) + pyarrow.Table + fruits: string + upper_col: string + ---- + fruits: [["apple","mango",null]] + upper_col: [["APPLE","MANGO",null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.to_uppercase() @@ -4650,29 +5020,33 @@ def to_lowercase(self: Self) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["APPLE", "MANGO", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_to_lowercase(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... lower_col=nw.col("fruits").str.to_lowercase() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_lowercase`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_to_lowercase(df_pd) fruits lower_col 0 APPLE apple 1 MANGO mango 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_to_lowercase(df_pl) shape: (3, 2) ┌────────┬───────────┐ │ fruits ┆ lower_col │ @@ -4683,6 +5057,14 @@ def to_lowercase(self: Self) -> ExprT: │ MANGO ┆ mango │ │ null ┆ null │ └────────┴───────────┘ + + >>> agnostic_str_to_lowercase(df_pa) + pyarrow.Table + fruits: string + lower_col: string + ---- + fruits: [["APPLE","MANGO",null]] + lower_col: [["apple","mango",null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase() @@ -4703,29 +5085,33 @@ def date(self: Self) -> ExprT: NotImplementedError: If pandas default backend is being used. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]} >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_date(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a").dt.date()).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_date`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_date(df_pd) a 0 2012-01-07 1 2023-03-10 - >>> my_library_agnostic_function(df_pl) # docetst + >>> agnostic_dt_date(df_pl) shape: (2, 1) ┌────────────┐ │ a │ @@ -4735,6 +5121,12 @@ def date(self: Self) -> ExprT: │ 2012-01-07 │ │ 2023-03-10 │ └────────────┘ + + >>> agnostic_dt_date(df_pa) + pyarrow.Table + a: date32[day] + ---- + a: [[2012-01-07,2023-03-10]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.date() @@ -4749,11 +5141,13 @@ def year(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 6, 1), @@ -4763,23 +5157,26 @@ def year(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_year(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("datetime").dt.year().alias("year") ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_year`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_year(df_pd) datetime year 0 1978-06-01 1978 1 2024-12-13 2024 2 2065-01-01 2065 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_year(df_pl) shape: (3, 2) ┌─────────────────────┬──────┐ │ datetime ┆ year │ @@ -4790,6 +5187,14 @@ def year(self: Self) -> ExprT: │ 2024-12-13 00:00:00 ┆ 2024 │ │ 2065-01-01 00:00:00 ┆ 2065 │ └─────────────────────┴──────┘ + + >>> agnostic_dt_year(df_pa) + pyarrow.Table + datetime: timestamp[us] + year: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + year: [[1978,2024,2065]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.year() @@ -4804,11 +5209,13 @@ def month(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 6, 1), @@ -4818,34 +5225,44 @@ def month(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_month(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.year().alias("year"), ... nw.col("datetime").dt.month().alias("month"), ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_month`: - >>> my_library_agnostic_function(df_pd) - datetime year month - 0 1978-06-01 1978 6 - 1 2024-12-13 2024 12 - 2 2065-01-01 2065 1 - >>> my_library_agnostic_function(df_pl) - shape: (3, 3) - ┌─────────────────────┬──────┬───────┐ - │ datetime ┆ year ┆ month │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i32 ┆ i8 │ - ╞═════════════════════╪══════╪═══════╡ - │ 1978-06-01 00:00:00 ┆ 1978 ┆ 6 │ - │ 2024-12-13 00:00:00 ┆ 2024 ┆ 12 │ - │ 2065-01-01 00:00:00 ┆ 2065 ┆ 1 │ - └─────────────────────┴──────┴───────┘ + >>> agnostic_dt_month(df_pd) + datetime month + 0 1978-06-01 6 + 1 2024-12-13 12 + 2 2065-01-01 1 + + >>> agnostic_dt_month(df_pl) + shape: (3, 2) + ┌─────────────────────┬───────┐ + │ datetime ┆ month │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪═══════╡ + │ 1978-06-01 00:00:00 ┆ 6 │ + │ 2024-12-13 00:00:00 ┆ 12 │ + │ 2065-01-01 00:00:00 ┆ 1 │ + └─────────────────────┴───────┘ + + >>> agnostic_dt_month(df_pa) + pyarrow.Table + datetime: timestamp[us] + month: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + month: [[6,12,1]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.month() @@ -4860,11 +5277,13 @@ def day(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 6, 1), @@ -4874,35 +5293,44 @@ def day(self: Self) -> ExprT: ... 
} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_day(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.year().alias("year"), - ... nw.col("datetime").dt.month().alias("month"), ... nw.col("datetime").dt.day().alias("day"), ... ).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime year month day - 0 1978-06-01 1978 6 1 - 1 2024-12-13 2024 12 13 - 2 2065-01-01 2065 1 1 - >>> my_library_agnostic_function(df_pl) - shape: (3, 4) - ┌─────────────────────┬──────┬───────┬─────┐ - │ datetime ┆ year ┆ month ┆ day │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i32 ┆ i8 ┆ i8 │ - ╞═════════════════════╪══════╪═══════╪═════╡ - │ 1978-06-01 00:00:00 ┆ 1978 ┆ 6 ┆ 1 │ - │ 2024-12-13 00:00:00 ┆ 2024 ┆ 12 ┆ 13 │ - │ 2065-01-01 00:00:00 ┆ 2065 ┆ 1 ┆ 1 │ - └─────────────────────┴──────┴───────┴─────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_day`: + + >>> agnostic_dt_day(df_pd) + datetime day + 0 1978-06-01 1 + 1 2024-12-13 13 + 2 2065-01-01 1 + + >>> agnostic_dt_day(df_pl) + shape: (3, 2) + ┌─────────────────────┬─────┐ + │ datetime ┆ day │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪═════╡ + │ 1978-06-01 00:00:00 ┆ 1 │ + │ 2024-12-13 00:00:00 ┆ 13 │ + │ 2065-01-01 00:00:00 ┆ 1 │ + └─────────────────────┴─────┘ + + >>> agnostic_dt_day(df_pa) + pyarrow.Table + datetime: timestamp[us] + day: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + day: [[1,13,1]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.day() @@ -4917,11 +5345,13 @@ def hour(self: Self) -> ExprT: A new expression. 
Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1), @@ -4931,23 +5361,26 @@ def hour(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_hour(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("datetime").dt.hour().alias("hour") ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_hour`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_hour(df_pd) datetime hour 0 1978-01-01 01:00:00 1 1 2024-10-13 05:00:00 5 2 2065-01-01 10:00:00 10 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_hour(df_pl) shape: (3, 2) ┌─────────────────────┬──────┐ │ datetime ┆ hour │ @@ -4958,6 +5391,14 @@ def hour(self: Self) -> ExprT: │ 2024-10-13 05:00:00 ┆ 5 │ │ 2065-01-01 10:00:00 ┆ 10 │ └─────────────────────┴──────┘ + + >>> agnostic_dt_hour(df_pa) + pyarrow.Table + datetime: timestamp[us] + hour: int64 + ---- + datetime: [[1978-01-01 01:00:00.000000,2024-10-13 05:00:00.000000,2065-01-01 10:00:00.000000]] + hour: [[1,5,10]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.hour() @@ -4972,11 +5413,13 @@ def minute(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... 
datetime(1978, 1, 1, 1, 1), @@ -4986,34 +5429,44 @@ def minute(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_minute(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), ... nw.col("datetime").dt.minute().alias("minute"), ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_minute`: - >>> my_library_agnostic_function(df_pd) - datetime hour minute - 0 1978-01-01 01:01:00 1 1 - 1 2024-10-13 05:30:00 5 30 - 2 2065-01-01 10:20:00 10 20 - >>> my_library_agnostic_function(df_pl) - shape: (3, 3) - ┌─────────────────────┬──────┬────────┐ - │ datetime ┆ hour ┆ minute │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 │ - ╞═════════════════════╪══════╪════════╡ - │ 1978-01-01 01:01:00 ┆ 1 ┆ 1 │ - │ 2024-10-13 05:30:00 ┆ 5 ┆ 30 │ - │ 2065-01-01 10:20:00 ┆ 10 ┆ 20 │ - └─────────────────────┴──────┴────────┘ + >>> agnostic_dt_minute(df_pd) + datetime minute + 0 1978-01-01 01:01:00 1 + 1 2024-10-13 05:30:00 30 + 2 2065-01-01 10:20:00 20 + + >>> agnostic_dt_minute(df_pl) + shape: (3, 2) + ┌─────────────────────┬────────┐ + │ datetime ┆ minute │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪════════╡ + │ 1978-01-01 01:01:00 ┆ 1 │ + │ 2024-10-13 05:30:00 ┆ 30 │ + │ 2065-01-01 10:20:00 ┆ 20 │ + └─────────────────────┴────────┘ + + >>> agnostic_dt_minute(df_pa) + pyarrow.Table + datetime: timestamp[us] + minute: int64 + ---- + datetime: [[1978-01-01 01:01:00.000000,2024-10-13 05:30:00.000000,2065-01-01 10:20:00.000000]] + minute: [[1,30,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.minute() @@ -5026,11 
+5479,13 @@ def second(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1), @@ -5040,35 +5495,44 @@ def second(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_second(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), ... nw.col("datetime").dt.second().alias("second"), ... ).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second - 0 1978-01-01 01:01:01 1 1 1 - 1 2024-10-13 05:30:14 5 30 14 - 2 2065-01-01 10:20:30 10 20 30 - >>> my_library_agnostic_function(df_pl) - shape: (3, 4) - ┌─────────────────────┬──────┬────────┬────────┐ - │ datetime ┆ hour ┆ minute ┆ second │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 │ - ╞═════════════════════╪══════╪════════╪════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 │ - │ 2024-10-13 05:30:14 ┆ 5 ┆ 30 ┆ 14 │ - │ 2065-01-01 10:20:30 ┆ 10 ┆ 20 ┆ 30 │ - └─────────────────────┴──────┴────────┴────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_second`: + + >>> agnostic_dt_second(df_pd) + datetime second + 0 1978-01-01 01:01:01 1 + 1 2024-10-13 05:30:14 14 + 2 2065-01-01 10:20:30 30 + + >>> agnostic_dt_second(df_pl) + shape: (3, 2) + ┌─────────────────────┬────────┐ + │ datetime ┆ second │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + 
╞═════════════════════╪════════╡ + │ 1978-01-01 01:01:01 ┆ 1 │ + │ 2024-10-13 05:30:14 ┆ 14 │ + │ 2065-01-01 10:20:30 ┆ 30 │ + └─────────────────────┴────────┘ + + >>> agnostic_dt_second(df_pa) + pyarrow.Table + datetime: timestamp[us] + second: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.000000,2065-01-01 10:20:30.000000]] + second: [[1,14,30]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.second() @@ -5081,11 +5545,13 @@ def millisecond(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1, 0), @@ -5095,36 +5561,44 @@ def millisecond(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_millisecond(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), - ... nw.col("datetime").dt.second().alias("second"), ... nw.col("datetime").dt.millisecond().alias("millisecond"), ... 
).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second millisecond - 0 1978-01-01 01:01:01.000 1 1 1 0 - 1 2024-10-13 05:30:14.505 5 30 14 505 - 2 2065-01-01 10:20:30.067 10 20 30 67 - >>> my_library_agnostic_function(df_pl) - shape: (3, 5) - ┌─────────────────────────┬──────┬────────┬────────┬─────────────┐ - │ datetime ┆ hour ┆ minute ┆ second ┆ millisecond │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ - ╞═════════════════════════╪══════╪════════╪════════╪═════════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2024-10-13 05:30:14.505 ┆ 5 ┆ 30 ┆ 14 ┆ 505 │ - │ 2065-01-01 10:20:30.067 ┆ 10 ┆ 20 ┆ 30 ┆ 67 │ - └─────────────────────────┴──────┴────────┴────────┴─────────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_millisecond`: + + >>> agnostic_dt_millisecond(df_pd) + datetime millisecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.505 505 + 2 2065-01-01 10:20:30.067 67 + + >>> agnostic_dt_millisecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬─────────────┐ + │ datetime ┆ millisecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 505 │ + │ 2065-01-01 10:20:30.067 ┆ 67 │ + └─────────────────────────┴─────────────┘ + + >>> agnostic_dt_millisecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + millisecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] + millisecond: [[0,505,67]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.millisecond() @@ -5137,11 +5611,13 @@ def microsecond(self: Self) -> ExprT: A new expression. 
Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1, 0), @@ -5151,36 +5627,44 @@ def microsecond(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_microsecond(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), - ... nw.col("datetime").dt.second().alias("second"), ... nw.col("datetime").dt.microsecond().alias("microsecond"), ... ).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second microsecond - 0 1978-01-01 01:01:01.000 1 1 1 0 - 1 2024-10-13 05:30:14.505 5 30 14 505000 - 2 2065-01-01 10:20:30.067 10 20 30 67000 - >>> my_library_agnostic_function(df_pl) - shape: (3, 5) - ┌─────────────────────────┬──────┬────────┬────────┬─────────────┐ - │ datetime ┆ hour ┆ minute ┆ second ┆ microsecond │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ - ╞═════════════════════════╪══════╪════════╪════════╪═════════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2024-10-13 05:30:14.505 ┆ 5 ┆ 30 ┆ 14 ┆ 505000 │ - │ 2065-01-01 10:20:30.067 ┆ 10 ┆ 20 ┆ 30 ┆ 67000 │ - └─────────────────────────┴──────┴────────┴────────┴─────────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_microsecond`: + + >>> agnostic_dt_microsecond(df_pd) + datetime microsecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.505 505000 + 2 
2065-01-01 10:20:30.067 67000 + + >>> agnostic_dt_microsecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬─────────────┐ + │ datetime ┆ microsecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 505000 │ + │ 2065-01-01 10:20:30.067 ┆ 67000 │ + └─────────────────────────┴─────────────┘ + + >>> agnostic_dt_microsecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + microsecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] + microsecond: [[0,505000,67000]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.microsecond() @@ -5193,11 +5677,13 @@ def nanosecond(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1, 0), @@ -5207,36 +5693,44 @@ def nanosecond(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_nanosecond(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), - ... nw.col("datetime").dt.second().alias("second"), ... nw.col("datetime").dt.nanosecond().alias("nanosecond"), ... 
).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second nanosecond - 0 1978-01-01 01:01:01.000 1 1 1 0 - 1 2024-10-13 05:30:14.500 5 30 14 500000000 - 2 2065-01-01 10:20:30.060 10 20 30 60000000 - >>> my_library_agnostic_function(df_pl) - shape: (3, 5) - ┌─────────────────────────┬──────┬────────┬────────┬────────────┐ - │ datetime ┆ hour ┆ minute ┆ second ┆ nanosecond │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ - ╞═════════════════════════╪══════╪════════╪════════╪════════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2024-10-13 05:30:14.500 ┆ 5 ┆ 30 ┆ 14 ┆ 500000000 │ - │ 2065-01-01 10:20:30.060 ┆ 10 ┆ 20 ┆ 30 ┆ 60000000 │ - └─────────────────────────┴──────┴────────┴────────┴────────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_nanosecond`: + + >>> agnostic_dt_nanosecond(df_pd) + datetime nanosecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.500 500000000 + 2 2065-01-01 10:20:30.060 60000000 + + >>> agnostic_dt_nanosecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬────────────┐ + │ datetime ┆ nanosecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.500 ┆ 500000000 │ + │ 2065-01-01 10:20:30.060 ┆ 60000000 │ + └─────────────────────────┴────────────┘ + + >>> agnostic_dt_nanosecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + nanosecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.500000,2065-01-01 10:20:30.060000]] + nanosecond: [[0,500000000,60000000]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.nanosecond() @@ -5249,30 +5743,35 @@ def ordinal_day(self: Self) -> ExprT: A new expression. 
Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_ordinal_day(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_ordinal_day=nw.col("a").dt.ordinal_day() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_ordinal_day`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_ordinal_day(df_pd) a a_ordinal_day 0 2020-01-01 1 1 2020-08-03 216 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_ordinal_day(df_pl) shape: (2, 2) ┌─────────────────────┬───────────────┐ │ a ┆ a_ordinal_day │ @@ -5282,6 +5781,14 @@ def ordinal_day(self: Self) -> ExprT: │ 2020-01-01 00:00:00 ┆ 1 │ │ 2020-08-03 00:00:00 ┆ 216 │ └─────────────────────┴───────────────┘ + + >>> agnostic_dt_ordinal_day(df_pa) + pyarrow.Table + a: timestamp[us] + a_ordinal_day: int64 + ---- + a: [[2020-01-01 00:00:00.000000,2020-08-03 00:00:00.000000]] + a_ordinal_day: [[1,216]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.ordinal_day() @@ -5293,7 +5800,6 @@ def weekday(self: Self) -> ExprT: Returns: Returns the ISO weekday number where monday = 1 and sunday = 7 - Examples: >>> from datetime import datetime >>> import pandas as pd @@ -5301,6 +5807,7 @@ def weekday(self: Self) -> ExprT: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [datetime(2020, 1, 1), 
datetime(2020, 8, 3)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -5308,17 +5815,19 @@ def weekday(self: Self) -> ExprT: We define a dataframe-agnostic function: - >>> def agnostic_weekday(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_weekday(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(a_weekday=nw.col("a").dt.weekday()).to_native() - We can then pass either pandas, Polars, PyArrow, and other supported libraries to `agnostic_weekday`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_weekday`: - >>> agnostic_weekday(df_pd) + >>> agnostic_dt_weekday(df_pd) a a_weekday 0 2020-01-01 3 1 2020-08-03 1 - >>> agnostic_weekday(df_pl) + + >>> agnostic_dt_weekday(df_pl) shape: (2, 2) ┌─────────────────────┬───────────┐ │ a ┆ a_weekday │ @@ -5328,7 +5837,8 @@ def weekday(self: Self) -> ExprT: │ 2020-01-01 00:00:00 ┆ 3 │ │ 2020-08-03 00:00:00 ┆ 1 │ └─────────────────────┴───────────┘ - >>> agnostic_weekday(df_pa) + + >>> agnostic_dt_weekday(df_pa) pyarrow.Table a: timestamp[us] a_weekday: int64 @@ -5352,30 +5862,35 @@ def total_minutes(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [timedelta(minutes=10), timedelta(minutes=20, seconds=40)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_minutes(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_minutes=nw.col("a").dt.total_minutes() ...
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_minutes`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_minutes(df_pd) a a_total_minutes 0 0 days 00:10:00 10 1 0 days 00:20:40 20 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_minutes(df_pl) shape: (2, 2) ┌──────────────┬─────────────────┐ │ a ┆ a_total_minutes │ @@ -5385,6 +5900,14 @@ def total_minutes(self: Self) -> ExprT: │ 10m ┆ 10 │ │ 20m 40s ┆ 20 │ └──────────────┴─────────────────┘ + + >>> agnostic_dt_total_minutes(df_pa) + pyarrow.Table + a: duration[us] + a_total_minutes: int64 + ---- + a: [[600000000,1240000000]] + a_total_minutes: [[10,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_minutes() @@ -5402,30 +5925,35 @@ def total_seconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_seconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_seconds=nw.col("a").dt.total_seconds() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_seconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_seconds(df_pd) a a_total_seconds 0 0 days 00:00:10 10 1 0 days 00:00:20.040000 20 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_seconds(df_pl) shape: (2, 2) ┌──────────────┬─────────────────┐ │ a ┆ a_total_seconds │ @@ -5435,6 +5963,14 @@ def total_seconds(self: Self) -> ExprT: │ 10s ┆ 10 │ │ 20s 40ms ┆ 20 │ └──────────────┴─────────────────┘ + + >>> agnostic_dt_total_seconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_seconds: int64 + ---- + a: [[10000000,20040000]] + a_total_seconds: [[10,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_seconds() @@ -5452,11 +5988,13 @@ def total_milliseconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... timedelta(milliseconds=10), @@ -5465,22 +6003,25 @@ def total_milliseconds(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_milliseconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_milliseconds=nw.col("a").dt.total_milliseconds() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_milliseconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_milliseconds(df_pd) a a_total_milliseconds 0 0 days 00:00:00.010000 10 1 0 days 00:00:00.020040 20 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_milliseconds(df_pl) shape: (2, 2) ┌──────────────┬──────────────────────┐ │ a ┆ a_total_milliseconds │ @@ -5490,6 +6031,14 @@ def total_milliseconds(self: Self) -> ExprT: │ 10ms ┆ 10 │ │ 20040µs ┆ 20 │ └──────────────┴──────────────────────┘ + + >>> agnostic_dt_total_milliseconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_milliseconds: int64 + ---- + a: [[10000,20040]] + a_total_milliseconds: [[10,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_milliseconds() @@ -5507,11 +6056,13 @@ def total_microseconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... timedelta(microseconds=10), @@ -5520,22 +6071,25 @@ def total_microseconds(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_microseconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_microseconds=nw.col("a").dt.total_microseconds() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_microseconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_microseconds(df_pd) a a_total_microseconds 0 0 days 00:00:00.000010 10 1 0 days 00:00:00.001200 1200 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_microseconds(df_pl) shape: (2, 2) ┌──────────────┬──────────────────────┐ │ a ┆ a_total_microseconds │ @@ -5545,6 +6099,14 @@ def total_microseconds(self: Self) -> ExprT: │ 10µs ┆ 10 │ │ 1200µs ┆ 1200 │ └──────────────┴──────────────────────┘ + + >>> agnostic_dt_total_microseconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_microseconds: int64 + ---- + a: [[10,1200]] + a_total_microseconds: [[10,1200]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_microseconds() @@ -5562,11 +6124,12 @@ def total_nanoseconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"] >>> df_pd = pd.DataFrame({"a": pd.to_datetime(data)}) >>> df_pl = pl.DataFrame({"a": data}).with_columns( @@ -5575,19 +6138,21 @@ def total_nanoseconds(self: Self) -> ExprT: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_nanoseconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_diff_total_nanoseconds=nw.col("a").diff().dt.total_nanoseconds() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_nanoseconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_nanoseconds(df_pd) a a_diff_total_nanoseconds 0 2024-01-01 00:00:00.000000001 NaN 1 2024-01-01 00:00:00.000000002 1.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_nanoseconds(df_pl) shape: (2, 2) ┌───────────────────────────────┬──────────────────────────┐ │ a ┆ a_diff_total_nanoseconds │ @@ -5646,33 +6211,39 @@ def to_string(self: Self, format: str) -> ExprT: # noqa: A002 >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> data = [ - ... datetime(2020, 3, 1), - ... datetime(2020, 4, 1), - ... datetime(2020, 5, 1), - ... ] - >>> df_pd = pd.DataFrame({"a": data}) - >>> df_pl = pl.DataFrame({"a": data}) + >>> + >>> data = { + ... "a": [ + ... datetime(2020, 3, 1), + ... datetime(2020, 4, 1), + ... datetime(2020, 5, 1), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_to_string(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").dt.to_string("%Y/%m/%d %H:%M:%S") ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_to_string`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_to_string(df_pd) a 0 2020/03/01 00:00:00 1 2020/04/01 00:00:00 2 2020/05/01 00:00:00 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_dt_to_string(df_pl) shape: (3, 1) ┌─────────────────────┐ │ a │ @@ -5683,6 +6254,12 @@ def to_string(self: Self, format: str) -> ExprT: # noqa: A002 │ 2020/04/01 00:00:00 │ │ 2020/05/01 00:00:00 │ └─────────────────────┘ + + >>> agnostic_dt_to_string(df_pa) + pyarrow.Table + a: string + ---- + a: [["2020/03/01 00:00:00.000000","2020/04/01 00:00:00.000000","2020/05/01 00:00:00.000000"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.to_string(format) @@ -5699,11 +6276,12 @@ def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: Examples: >>> from datetime import datetime, timezone - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... datetime(2024, 1, 1, tzinfo=timezone.utc), @@ -5716,19 +6294,21 @@ def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_replace_time_zone(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").dt.replace_time_zone("Asia/Kathmandu") ... 
).to_native() - We can then pass pandas / PyArrow / Polars / any other supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_replace_time_zone`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_replace_time_zone(df_pd) a 0 2024-01-01 00:00:00+05:45 1 2024-01-02 00:00:00+05:45 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_replace_time_zone(df_pl) shape: (2, 1) ┌──────────────────────────────┐ │ a │ @@ -5738,7 +6318,8 @@ def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: │ 2024-01-01 00:00:00 +0545 │ │ 2024-01-02 00:00:00 +0545 │ └──────────────────────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_dt_replace_time_zone(df_pa) pyarrow.Table a: timestamp[us, tz=Asia/Kathmandu] ---- @@ -5762,11 +6343,12 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: Examples: >>> from datetime import datetime, timezone - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... datetime(2024, 1, 1, tzinfo=timezone.utc), @@ -5779,19 +6361,21 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_convert_time_zone(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").dt.convert_time_zone("Asia/Kathmandu") ... 
).to_native() - We can then pass pandas / PyArrow / Polars / any other supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_convert_time_zone`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_convert_time_zone(df_pd) a 0 2024-01-01 05:45:00+05:45 1 2024-01-02 05:45:00+05:45 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_convert_time_zone(df_pl) shape: (2, 1) ┌──────────────────────────────┐ │ a │ @@ -5801,7 +6385,8 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: │ 2024-01-01 05:45:00 +0545 │ │ 2024-01-02 05:45:00 +0545 │ └──────────────────────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_dt_convert_time_zone(df_pa) pyarrow.Table a: timestamp[us, tz=Asia/Kathmandu] ---- @@ -5826,11 +6411,12 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: Examples: >>> from datetime import date - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"date": [date(2001, 1, 1), None, date(2001, 1, 3)]} >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]") >>> df_pl = pl.DataFrame(data) @@ -5838,21 +6424,23 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_timestamp(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("date").dt.timestamp().alias("timestamp_us"), ... nw.col("date").dt.timestamp("ms").alias("timestamp_ms"), ... 
).to_native() - We can then pass pandas / PyArrow / Polars / any other supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_timestamp`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_timestamp(df_pd) date timestamp_us timestamp_ms 0 2001-01-01 9.783072e+14 9.783072e+11 1 NaT NaN NaN 2 2001-01-03 9.784800e+14 9.784800e+11 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_timestamp(df_pl) shape: (3, 3) ┌────────────┬─────────────────┬──────────────┐ │ date ┆ timestamp_us ┆ timestamp_ms │ @@ -5863,7 +6451,8 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: │ null ┆ null ┆ null │ │ 2001-01-03 ┆ 978480000000000 ┆ 978480000000 │ └────────────┴─────────────────┴──────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_dt_timestamp(df_pa) pyarrow.Table date: date32[day] timestamp_us: int64 @@ -5900,27 +6489,33 @@ def keep(self: Self) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_keep(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select( - ... nw.col("foo").alias("alias_for_foo").name.keep() - ... ).to_native() + ... 
return df.select(nw.col("foo").alias("alias_for_foo").name.keep()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_keep`: - We can then pass either pandas or Polars to `func`: + >>> agnostic_name_keep(df_pd) + ['foo'] + + >>> agnostic_name_keep(df_pl) + ['foo'] - >>> my_library_agnostic_function(df_pd).columns - Index(['foo'], dtype='object') - >>> my_library_agnostic_function(df_pl).columns + >>> agnostic_name_keep(df_pa) ['foo'] """ return self._expr.__class__( @@ -5942,26 +6537,34 @@ def map(self: Self, function: Callable[[str], str]) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: >>> renaming_func = lambda s: s[::-1] # reverse column name - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_map(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.map(renaming_func)).to_native() + ... 
return df.select(nw.col("foo", "BAR").name.map(renaming_func)).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_map`: - We can then pass either pandas or Polars to `func`: + >>> agnostic_name_map(df_pd) + ['oof', 'RAB'] + + >>> agnostic_name_map(df_pl) + ['oof', 'RAB'] - >>> my_library_agnostic_function(df_pd).columns - Index(['oof', 'RAB'], dtype='object') - >>> my_library_agnostic_function(df_pl).columns + >>> agnostic_name_map(df_pa) ['oof', 'RAB'] """ return self._expr.__class__( @@ -5983,26 +6586,33 @@ def prefix(self: Self, prefix: str) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def add_colname_prefix(df_native: IntoFrameT, prefix: str) -> IntoFrameT: + >>> def agnostic_name_prefix(df_native: IntoFrame, prefix: str) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.prefix(prefix)).to_native() + ... 
return df.select(nw.col("foo", "BAR").name.prefix(prefix)).columns - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_prefix`: + + >>> agnostic_name_prefix(df_pd, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] - >>> add_colname_prefix(df_pd, "with_prefix_").columns - Index(['with_prefix_foo', 'with_prefix_BAR'], dtype='object') + >>> agnostic_name_prefix(df_pl, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] - >>> add_colname_prefix(df_pl, "with_prefix_").columns + >>> agnostic_name_prefix(df_pa, "with_prefix_") ['with_prefix_foo', 'with_prefix_BAR'] """ return self._expr.__class__( @@ -6024,25 +6634,33 @@ def suffix(self: Self, suffix: str) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def add_colname_suffix(df_native: IntoFrameT, suffix: str) -> IntoFrameT: + >>> def agnostic_name_suffix(df_native: IntoFrame, suffix: str) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.suffix(suffix)).to_native() + ... 
return df.select(nw.col("foo", "BAR").name.suffix(suffix)).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_suffix`: - We can then pass either pandas or Polars to `func`: + >>> agnostic_name_suffix(df_pd, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] + + >>> agnostic_name_suffix(df_pl, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] - >>> add_colname_suffix(df_pd, "_with_suffix").columns - Index(['foo_with_suffix', 'BAR_with_suffix'], dtype='object') - >>> add_colname_suffix(df_pl, "_with_suffix").columns + >>> agnostic_name_suffix(df_pa, "_with_suffix") ['foo_with_suffix', 'BAR_with_suffix'] """ return self._expr.__class__( @@ -6061,25 +6679,33 @@ def to_lowercase(self: Self) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def to_lower(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_to_lowercase(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.to_lowercase()).to_native() + ... 
return df.select(nw.col("foo", "BAR").name.to_lowercase()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_to_lowercase`: - We can then pass either pandas or Polars to `func`: + >>> agnostic_name_to_lowercase(df_pd) + ['foo', 'bar'] + + >>> agnostic_name_to_lowercase(df_pl) + ['foo', 'bar'] - >>> to_lower(df_pd).columns - Index(['foo', 'bar'], dtype='object') - >>> to_lower(df_pl).columns + >>> agnostic_name_to_lowercase(df_pa) ['foo', 'bar'] """ return self._expr.__class__( @@ -6098,24 +6724,33 @@ def to_uppercase(self: Self) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def to_upper(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_to_uppercase(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.to_uppercase()).to_native() + ... return df.select(nw.col("foo", "BAR").name.to_uppercase()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_to_uppercase`: + + >>> agnostic_name_to_uppercase(df_pd) + ['FOO', 'BAR'] + + >>> agnostic_name_to_uppercase(df_pl) + ['FOO', 'BAR'] - We can then pass either pandas or Polars to `func`: - >>> to_upper(df_pd).columns - Index(['FOO', 'BAR'], dtype='object') - >>> to_upper(df_pl).columns + >>> agnostic_name_to_uppercase(df_pa) ['FOO', 'BAR'] """ return self._expr.__class__( @@ -6136,11 +6771,12 @@ def len(self: Self) -> ExprT: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [[1, 2], [3, 4, None], None, []]} Let's define a dataframe-agnostic function: @@ -6202,23 +6838,27 @@ def col(*names: str | Iterable[str]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [3, 4]}) + >>> + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_col(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a") * nw.col("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_col`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_col(df_pd) a 0 3 1 8 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_col(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -6228,7 +6868,8 @@ def col(*names: str | Iterable[str]) -> Expr: │ 3 │ │ 8 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_col(df_pa) pyarrow.Table a: int64 ---- @@ -6260,6 +6901,7 @@ def nth(*indices: int | Sequence[int]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -6267,17 +6909,18 @@ def nth(*indices: int | Sequence[int]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_nth(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.nth(0) * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_nth`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_nth(df_pd) a 0 2 1 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_nth(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -6287,7 +6930,8 @@ def nth(*indices: int | Sequence[int]) -> Expr: │ 2 │ │ 4 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_nth(df_pa) pyarrow.Table a: int64 ---- @@ -6313,24 +6957,28 @@ def all_() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.all() * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all(df_pd) a b 0 2 8 1 4 10 2 6 12 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_all(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -6341,7 +6989,8 @@ def all_() -> Expr: │ 4 ┆ 10 │ │ 6 ┆ 12 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_all(df_pa) pyarrow.Table a: int64 b: int64 @@ -6365,22 +7014,25 @@ def len_() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.len()).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_len`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_len(df_pd) len 0 2 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_len(df_pl) shape: (1, 1) ┌─────┐ │ len │ @@ -6389,7 +7041,7 @@ def len_() -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_len(df_pa) pyarrow.Table len: int64 ---- @@ -6420,22 +7072,26 @@ def sum(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.sum("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum(df_pd) a 0 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6444,7 +7100,8 @@ def sum(*columns: str) -> Expr: ╞═════╡ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum(df_pa) pyarrow.Table a: int64 ---- @@ -6471,22 +7128,26 @@ def mean(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 8, 3]}) - >>> df_pd = pd.DataFrame({"a": [1, 8, 3]}) - >>> df_pa = pa.table({"a": [1, 8, 3]}) + >>> + >>> data = {"a": [1, 8, 3]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.mean("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_mean(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6495,7 +7156,8 @@ def mean(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_mean(df_pa) pyarrow.Table a: double ---- @@ -6509,7 +7171,8 @@ def median(*columns: str) -> Expr: Notes: - Syntactic sugar for ``nw.col(columns).median()`` - - Results might slightly differ across backends due to differences in the underlying algorithms used to compute the median. + - Results might slightly differ across backends due to differences in the + underlying algorithms used to compute the median. Arguments: columns: Name(s) of the columns to use in the aggregation function @@ -6523,22 +7186,26 @@ def median(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [4, 5, 2]}) - >>> df_pl = pl.DataFrame({"a": [4, 5, 2]}) - >>> df_pa = pa.table({"a": [4, 5, 2]}) + >>> + >>> data = {"a": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.median("a")).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_median(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_median(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6547,7 +7214,8 @@ def median(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_median(df_pa) pyarrow.Table a: double ---- @@ -6574,22 +7242,26 @@ def min(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.min("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_min(df_pd) b 0 5 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_min(df_pl) shape: (1, 1) ┌─────┐ │ b │ @@ -6598,7 +7270,8 @@ def min(*columns: str) -> Expr: ╞═════╡ │ 5 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_min(df_pa) pyarrow.Table b: int64 ---- @@ -6625,22 +7298,26 @@ def max(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_max(df_pd) a 0 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_max(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6649,7 +7326,8 @@ def max(*columns: str) -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_max(df_pa) pyarrow.Table a: int64 ---- @@ -6677,6 +7355,7 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [5, 10, None]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -6684,18 +7363,19 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.sum_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_sum_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum_horizontal(df_pd) a 0 6.0 1 12.0 2 3.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -6706,7 +7386,8 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 12 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum_horizontal(df_pa) pyarrow.Table a: int64 ---- @@ -6736,11 +7417,12 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -6750,18 +7432,20 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal min of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.min_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_min_horizontal(pd.DataFrame(data)) a 0 1.0 1 5.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_min_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -6772,7 +7456,8 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 5 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_min_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -6802,11 +7487,12 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... 
"b": [4, 5, None], @@ -6816,18 +7502,20 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal max of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.max_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_max_horizontal(pd.DataFrame(data)) a 0 4.0 1 8.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_max_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -6838,7 +7526,8 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 8 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_max_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -6892,9 +7581,9 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: If not appended, and none of the conditions are `True`, `None` will be returned. Arguments: - predicates: Condition(s) that must be met in order to apply the subsequent statement. - Accepts one or more boolean expressions, which are implicitly combined with `&`. - String input is parsed as a column name. + predicates: Condition(s) that must be met in order to apply the subsequent + statement. Accepts one or more boolean expressions, which are implicitly + combined with `&`. String input is parsed as a column name. Returns: A "when" object, which `.then` can be called on. 
@@ -6905,26 +7594,30 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [5, 10, 15]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [5, 10, 15]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_when_then_otherwise(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when") ... ).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_when_then_otherwise`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_when_then_otherwise(df_pd) a b a_when 0 1 5 5 1 2 10 5 2 3 15 6 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_when_then_otherwise(df_pl) shape: (3, 3) ┌─────┬─────┬────────┐ │ a ┆ b ┆ a_when │ @@ -6935,7 +7628,8 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: │ 2 ┆ 10 ┆ 5 │ │ 3 ┆ 15 ┆ 6 │ └─────┴─────┴────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_when_then_otherwise(df_pa) pyarrow.Table a: int64 b: int64 @@ -6952,7 +7646,8 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise AND horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. 
@@ -6963,6 +7658,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], @@ -6973,13 +7669,14 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", all=nw.all_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all_horizontal(df_pd) a b all 0 False False False 1 False True False @@ -6988,7 +7685,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: 4 False False 5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_all_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ all │ @@ -7003,7 +7700,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_all_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -7028,7 +7725,8 @@ def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: Arguments: value: The value to use as literal. - dtype: The data type of the literal value. If not provided, the data type will be inferred. + dtype: The data type of the literal value. If not provided, the data type will + be inferred. Returns: A new expression. 
@@ -7039,23 +7737,27 @@ def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_lit(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(nw.lit(3)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_lit`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_lit(df_pd) a literal 0 1 3 1 2 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_lit(df_pl) shape: (2, 2) ┌─────┬─────────┐ │ a ┆ literal │ @@ -7065,7 +7767,8 @@ def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: │ 1 ┆ 3 │ │ 2 ┆ 3 │ └─────┴─────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_lit(df_pa) pyarrow.Table a: int64 literal: int64 @@ -7091,7 +7794,8 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise OR horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. @@ -7102,6 +7806,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... 
"b": [False, True, True, None, None, None], @@ -7112,13 +7817,14 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_any_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", any=nw.any_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_any_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_any_horizontal(df_pd) a b any 0 False False False 1 False True True @@ -7127,7 +7833,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: 4 False 5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_any_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ any │ @@ -7142,7 +7848,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_any_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -7178,6 +7884,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -7190,19 +7897,20 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal mean of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.mean_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean_horizontal(df_pd) a 0 2.5 1 6.5 2 3.0 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_mean_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -7214,7 +7922,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 3.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_mean_horizontal(df_pa) pyarrow.Table a: double ---- @@ -7253,11 +7961,12 @@ def concat_str( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 2, 3], ... "b": ["dogs", "cats", None], @@ -7267,7 +7976,7 @@ def concat_str( We define a dataframe-agnostic function that computes the horizontal string concatenation of different columns - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_concat_str(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.concat_str( @@ -7280,15 +7989,16 @@ def concat_str( ... ).alias("full_sentence") ... 
).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_concat_str`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_concat_str(pd.DataFrame(data)) full_sentence 0 2 dogs play 1 4 cats swim 2 None - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_concat_str(pl.DataFrame(data)) shape: (3, 1) ┌───────────────┐ │ full_sentence │ @@ -7300,7 +8010,7 @@ def concat_str( │ null │ └───────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_concat_str(pa.table(data)) pyarrow.Table full_sentence: string ---- diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 5ffc475e5..cb5d2006c 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -2353,24 +2353,28 @@ def all() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.all() * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all(df_pd) a b 0 2 8 1 4 10 2 6 12 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_all(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2381,7 +2385,8 @@ def all() -> Expr: │ 4 ┆ 10 │ │ 6 ┆ 12 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_all(df_pa) pyarrow.Table a: int64 b: int64 @@ -2407,23 +2412,27 @@ def col(*names: str | Iterable[str]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [3, 4]}) + >>> + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_col(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a") * nw.col("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_col`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_col(df_pd) a 0 3 1 8 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_col(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -2433,7 +2442,8 @@ def col(*names: str | Iterable[str]) -> Expr: │ 3 │ │ 8 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_col(df_pa) pyarrow.Table a: int64 ---- @@ -2461,6 +2471,7 @@ def nth(*indices: int | Sequence[int]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -2468,17 +2479,18 @@ def nth(*indices: int | Sequence[int]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_nth(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.nth(0) * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_nth`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_nth(df_pd) a 0 2 1 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_nth(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -2488,7 +2500,8 @@ def nth(*indices: int | Sequence[int]) -> Expr: │ 2 │ │ 4 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_nth(df_pa) pyarrow.Table a: int64 ---- @@ -2509,22 +2522,25 @@ def len() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.len()).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_len`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_len(df_pd) len 0 2 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_len(df_pl) shape: (1, 1) ┌─────┐ │ len │ @@ -2533,7 +2549,7 @@ def len() -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_len(df_pa) pyarrow.Table len: int64 ---- @@ -2547,7 +2563,8 @@ def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: Arguments: value: The value to use as literal. - dtype: The data type of the literal value. 
If not provided, the data type will be inferred. + dtype: The data type of the literal value. If not provided, the data type will + be inferred. Returns: A new expression. @@ -2558,23 +2575,27 @@ def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_lit(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(nw.lit(3)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_lit`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_lit(df_pd) a literal 0 1 3 1 2 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_lit(df_pl) shape: (2, 2) ┌─────┬─────────┐ │ a ┆ literal │ @@ -2584,7 +2605,8 @@ def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: │ 1 ┆ 3 │ │ 2 ┆ 3 │ └─────┴─────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_lit(df_pa) pyarrow.Table a: int64 literal: int64 @@ -2613,22 +2635,26 @@ def min(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - 
>>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.min("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_min(df_pd) b 0 5 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_min(df_pl) shape: (1, 1) ┌─────┐ │ b │ @@ -2637,7 +2663,8 @@ def min(*columns: str) -> Expr: ╞═════╡ │ 5 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_min(df_pa) pyarrow.Table b: int64 ---- @@ -2664,22 +2691,26 @@ def max(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_max(df_pd) a 0 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_max(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2688,7 +2719,8 @@ def max(*columns: str) -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_max(df_pa) pyarrow.Table a: int64 ---- @@ -2715,22 +2747,26 @@ def mean(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 8, 3]}) - >>> df_pd = pd.DataFrame({"a": [1, 8, 3]}) - >>> df_pa = pa.table({"a": [1, 8, 3]}) + >>> + >>> data = {"a": [1, 8, 3]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.mean("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_mean(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2739,7 +2775,8 @@ def mean(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_mean(df_pa) pyarrow.Table a: double ---- @@ -2753,7 +2790,8 @@ def median(*columns: str) -> Expr: Notes: - Syntactic sugar for ``nw.col(columns).median()`` - - Results might slightly differ across backends due to differences in the underlying algorithms used to compute the median. + - Results might slightly differ across backends due to differences in the + underlying algorithms used to compute the median. Arguments: columns: Name(s) of the columns to use in the aggregation function @@ -2767,22 +2805,26 @@ def median(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [4, 5, 2]}) - >>> df_pl = pl.DataFrame({"a": [4, 5, 2]}) - >>> df_pa = pa.table({"a": [4, 5, 2]}) + >>> + >>> data = {"a": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.median("a")).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_median(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_median(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2791,7 +2833,8 @@ def median(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_median(df_pa) pyarrow.Table a: double ---- @@ -2818,22 +2861,26 @@ def sum(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.sum("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum(df_pd) a 0 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2842,7 +2889,8 @@ def sum(*columns: str) -> Expr: ╞═════╡ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum(df_pa) pyarrow.Table a: int64 ---- @@ -2870,6 +2918,7 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [5, 10, None]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -2877,18 +2926,19 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.sum_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_sum_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum_horizontal(df_pd) a 0 6.0 1 12.0 2 3.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2899,7 +2949,8 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 12 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum_horizontal(df_pa) pyarrow.Table a: int64 ---- @@ -2912,7 +2963,8 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise AND horizontally across columns. 
Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. @@ -2923,6 +2975,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], @@ -2933,13 +2986,14 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", all=nw.all_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all_horizontal(df_pd) a b all 0 False False False 1 False True False @@ -2948,7 +3002,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: 4 False False 5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_all_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ all │ @@ -2963,7 +3017,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_all_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -2980,7 +3034,8 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise OR horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. 
+ exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. @@ -2991,6 +3046,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], @@ -3001,13 +3057,14 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_any_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", any=nw.any_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_any_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_any_horizontal(df_pd) a b any 0 False False False 1 False True True @@ -3016,7 +3073,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: 4 False 5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_any_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ any │ @@ -3031,7 +3088,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_any_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -3060,6 +3117,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... 
"b": [4, 5, None], @@ -3072,19 +3130,20 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal mean of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.mean_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean_horizontal(df_pd) a 0 2.5 1 6.5 2 3.0 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_mean_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -3096,7 +3155,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 3.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_mean_horizontal(df_pa) pyarrow.Table a: double ---- @@ -3119,11 +3178,12 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -3133,18 +3193,20 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal min of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.min_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_min_horizontal(pd.DataFrame(data)) a 0 1.0 1 5.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_min_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -3155,7 +3217,8 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 5 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_min_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -3178,11 +3241,12 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -3192,18 +3256,20 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal max of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_max_horizontal(pd.DataFrame(data)) a 0 4.0 1 8.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_max_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -3214,7 +3280,8 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 8 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_max_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -3405,11 +3472,12 @@ def concat_str( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 2, 3], ... "b": ["dogs", "cats", None], @@ -3419,7 +3487,7 @@ def concat_str( We define a dataframe-agnostic function that computes the horizontal string concatenation of different columns - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_concat_str(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.concat_str( @@ -3432,15 +3500,16 @@ def concat_str( ... ).alias("full_sentence") ... 
).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_concat_str`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_concat_str(pd.DataFrame(data)) full_sentence 0 2 dogs play 1 4 cats swim 2 None - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_concat_str(pl.DataFrame(data)) shape: (3, 1) ┌───────────────┐ │ full_sentence │ @@ -3452,7 +3521,7 @@ def concat_str( │ null │ └───────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_concat_str(pa.table(data)) pyarrow.Table full_sentence: string ---- @@ -3495,9 +3564,9 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: If not appended, and none of the conditions are `True`, `None` will be returned. Arguments: - predicates: Condition(s) that must be met in order to apply the subsequent statement. - Accepts one or more boolean expressions, which are implicitly combined with `&`. - String input is parsed as a column name. + predicates: Condition(s) that must be met in order to apply the subsequent + statement. Accepts one or more boolean expressions, which are implicitly + combined with `&`. String input is parsed as a column name. Returns: A "when" object, which `.then` can be called on. 
@@ -3508,26 +3577,30 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [5, 10, 15]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [5, 10, 15]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_when_then_otherwise(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when") ... ).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_when_then_otherwise`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_when_then_otherwise(df_pd) a b a_when 0 1 5 5 1 2 10 5 2 3 15 6 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_when_then_otherwise(df_pl) shape: (3, 3) ┌─────┬─────┬────────┐ │ a ┆ b ┆ a_when │ @@ -3538,7 +3611,8 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: │ 2 ┆ 10 ┆ 5 │ │ 3 ┆ 15 ┆ 6 │ └─────┴─────┴────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_when_then_otherwise(df_pa) pyarrow.Table a: int64 b: int64 From e56f91d761062d8a219b2d51c5d0e49b2cbedfab Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 6 Jan 2025 10:15:30 +0000 Subject: [PATCH 08/35] test: catch some cudf failures (#1735) --- pyproject.toml | 1 + tests/expr_and_series/clip_test.py | 3 +++ tests/expr_and_series/lit_test.py | 6 +++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 43a1dbc12..daa21c3ee 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -187,6 +187,7 @@ omit = [ exclude_also = [ "if sys.version_info() <", "if .*implementation is Implementation.CUDF", + "if .*implementation.is_cudf", 'request.applymarker\(pytest.mark.xfail', 'backend_version <', 'if "cudf" in str\(constructor' diff --git a/tests/expr_and_series/clip_test.py b/tests/expr_and_series/clip_test.py index 29ed6379b..2ae9e043d 100644 --- a/tests/expr_and_series/clip_test.py +++ b/tests/expr_and_series/clip_test.py @@ -28,6 +28,9 @@ def test_clip_expr_expressified( ) -> None: if "modin_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) + if "cudf" in str(constructor): + # https://github.com/rapidsai/cudf/issues/17682 + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3, -4, 5], "lb": [3, 2, 1, 1, 1], "ub": [4, 4, 2, 2, 2]} df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/lit_test.py b/tests/expr_and_series/lit_test.py index f5be7dfbe..501bfc4bd 100644 --- a/tests/expr_and_series/lit_test.py +++ b/tests/expr_and_series/lit_test.py @@ -108,4 +108,8 @@ def test_date_lit(constructor: Constructor, request: pytest.FixtureRequest) -> N request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1]})) result = df.with_columns(nw.lit(date(2020, 1, 1), dtype=nw.Date)).collect_schema() - assert result == {"a": nw.Int64, "literal": nw.Date} + if df.implementation.is_cudf(): + # cudf has no date dtype + assert result == {"a": nw.Int64, "literal": nw.Datetime} + else: + assert result == {"a": nw.Int64, "literal": nw.Date} From aa48faae5ebaf36d0efd39cc1b0267a4502ee51a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 6 Jan 2025 10:41:53 +0000 Subject: [PATCH 09/35] feat: Implement partial "lazy" support for DuckDB (even with this PR, DuckDB support is work-in-progress!) 
(#1725) --- README.md | 3 +- docs/backcompat.md | 4 + docs/basics/dataframe_conversion.md | 16 +- docs/extending.md | 7 +- narwhals/_arrow/dataframe.py | 4 + narwhals/_dask/dataframe.py | 4 + narwhals/_duckdb/dataframe.py | 329 +++++--- narwhals/_duckdb/expr.py | 767 ++++++++++++++++++ narwhals/_duckdb/group_by.py | 57 ++ narwhals/_duckdb/namespace.py | 205 +++++ narwhals/_duckdb/series.py | 2 +- narwhals/_duckdb/typing.py | 16 + narwhals/_duckdb/utils.py | 213 +++++ narwhals/_pandas_like/dataframe.py | 4 + narwhals/functions.py | 3 + narwhals/translate.py | 19 +- pyproject.toml | 1 + tests/conftest.py | 23 +- tests/expr_and_series/all_horizontal_test.py | 2 + tests/expr_and_series/arithmetic_test.py | 6 +- tests/expr_and_series/cast_test.py | 18 +- tests/expr_and_series/concat_str_test.py | 8 +- .../expr_and_series/convert_time_zone_test.py | 2 + tests/expr_and_series/cum_count_test.py | 2 + tests/expr_and_series/cum_max_test.py | 2 + tests/expr_and_series/cum_min_test.py | 2 + tests/expr_and_series/cum_prod_test.py | 2 + tests/expr_and_series/cum_sum_test.py | 2 + tests/expr_and_series/diff_test.py | 2 + .../dt/datetime_attributes_test.py | 3 + .../dt/datetime_duration_test.py | 2 + tests/expr_and_series/dt/timestamp_test.py | 8 + tests/expr_and_series/dt/to_string_test.py | 20 +- tests/expr_and_series/fill_null_test.py | 12 +- tests/expr_and_series/is_duplicated_test.py | 14 +- tests/expr_and_series/is_finite_test.py | 4 +- .../expr_and_series/is_first_distinct_test.py | 8 +- .../expr_and_series/is_last_distinct_test.py | 8 +- tests/expr_and_series/is_nan_test.py | 8 +- tests/expr_and_series/is_unique_test.py | 12 +- tests/expr_and_series/lit_test.py | 7 + tests/expr_and_series/mean_horizontal_test.py | 10 +- tests/expr_and_series/median_test.py | 11 +- tests/expr_and_series/n_unique_test.py | 6 +- .../expr_and_series/name/to_uppercase_test.py | 16 +- tests/expr_and_series/nth_test.py | 2 + tests/expr_and_series/null_count_test.py | 8 +- 
tests/expr_and_series/over_test.py | 24 +- tests/expr_and_series/quantile_test.py | 5 +- tests/expr_and_series/reduction_test.py | 22 +- tests/expr_and_series/replace_strict_test.py | 6 + .../expr_and_series/replace_time_zone_test.py | 3 + tests/expr_and_series/shift_test.py | 5 +- tests/expr_and_series/std_test.py | 21 +- tests/expr_and_series/str/len_chars_test.py | 6 +- tests/expr_and_series/str/replace_test.py | 8 +- tests/expr_and_series/str/to_datetime_test.py | 12 +- .../str/to_uppercase_to_lowercase_test.py | 2 + tests/expr_and_series/sum_horizontal_test.py | 14 +- tests/expr_and_series/unary_test.py | 16 +- tests/expr_and_series/var_test.py | 21 +- tests/expr_and_series/when_test.py | 44 +- tests/frame/add_test.py | 6 +- tests/frame/clone_test.py | 2 + tests/frame/concat_test.py | 12 +- tests/frame/drop_nulls_test.py | 11 +- tests/frame/explode_test.py | 8 +- tests/frame/filter_test.py | 6 +- tests/frame/gather_every_test.py | 6 +- tests/frame/join_test.py | 22 +- tests/frame/select_test.py | 12 +- tests/frame/unique_test.py | 17 +- tests/frame/unpivot_test.py | 4 +- tests/frame/with_columns_test.py | 2 + tests/frame/with_row_index_test.py | 6 +- tests/group_by_test.py | 24 +- tests/selectors_test.py | 31 +- tests/stable_api_test.py | 6 +- tests/utils.py | 8 +- tpch/execute.py | 3 +- utils/import_check.py | 2 + 81 files changed, 2064 insertions(+), 217 deletions(-) create mode 100644 narwhals/_duckdb/expr.py create mode 100644 narwhals/_duckdb/group_by.py create mode 100644 narwhals/_duckdb/namespace.py create mode 100644 narwhals/_duckdb/typing.py create mode 100644 narwhals/_duckdb/utils.py diff --git a/README.md b/README.md index bb024c6c2..eee90ebd9 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,7 @@ Extremely lightweight and extensible compatibility layer between dataframe libraries! 
- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow -- **Lazy-only support**: Dask -- **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol +- **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark. Seamlessly support all, without depending on any! diff --git a/docs/backcompat.md b/docs/backcompat.md index 55b927fd8..b2d312e0a 100644 --- a/docs/backcompat.md +++ b/docs/backcompat.md @@ -111,6 +111,10 @@ before making any change. ### After `stable.v1` + +- Since Narwhals 1.21, passing a `DuckDBPyRelation` to `from_native` returns a `LazyFrame`. In + `narwhals.stable.v1`, it returns a `DataFrame` with `level='interchange'`. + - Since Narwhals 1.15, `Series` is generic in the native Series, meaning that you can write: ```python diff --git a/docs/basics/dataframe_conversion.md b/docs/basics/dataframe_conversion.md index 690f5d093..a4753a033 100644 --- a/docs/basics/dataframe_conversion.md +++ b/docs/basics/dataframe_conversion.md @@ -14,6 +14,7 @@ To illustrate, we create dataframes in various formats: ```python exec="1" source="above" session="conversion" import narwhals as nw from narwhals.typing import IntoDataFrame +from typing import Any import duckdb import polars as pl @@ -45,11 +46,15 @@ print(df_to_pandas(df_polars)) ### Via PyCapsule Interface -Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals. 
+Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe +which implements `__arrow_c_stream__`: ```python exec="1" source="above" session="conversion" result="python" -def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: - return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native() +def df_to_polars(df_native: Any) -> pl.DataFrame: + if hasattr(df_native, "__arrow_c_stream__"): + return nw.from_arrow(df_native, native_namespace=pl).to_native() + msg = f"Expected object which implements '__arrow_c_stream__' got: {type(df)}" + raise TypeError(msg) print(df_to_polars(df_duckdb)) # You can only execute this line of code once. @@ -66,8 +71,9 @@ If you need to ingest the same dataframe multiple times, then you may want to go This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving: ```python exec="1" source="above" session="conversion" result="python" -def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: - return pl.DataFrame(nw.from_native(df).to_arrow()) +def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame: + df = nw.from_native(df_native).lazy().collect() + return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow()) df_duckdb = duckdb.sql("SELECT * FROM df_polars") diff --git a/docs/extending.md b/docs/extending.md index 2a8953987..588e234f4 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -15,17 +15,16 @@ Currently, Narwhals has **full API** support for the following libraries: It also has **lazy-only** support for [Dask](https://github.com/dask/dask), and **interchange** support for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis). +We are working towards full "lazy-only" support for DuckDB, Ibis, and PySpark. 
+ ### Levels of support Narwhals comes with three levels of support: - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow -- **Lazy-only support**: Dask +- **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark. - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol -The lazy-only layer is a major item on our 2025 roadmap, and hope to be able to bring libraries currently in -the "interchange" level into that one. - Libraries for which we have full support can benefit from the whole [Narwhals API](./api-reference/index.md). diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 9e5ce0621..f4ad2912e 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -16,6 +16,7 @@ from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs from narwhals.dependencies import is_numpy_array +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_temporary_column_name @@ -669,6 +670,9 @@ def unique( import pyarrow.compute as pc df = self._native_frame + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) subset = subset or self.columns if keep in {"any", "first", "last"}: diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 5e652a937..16053d69a 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -11,6 +11,7 @@ from narwhals._dask.utils import parse_exprs_and_named_exprs from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals._pandas_like.utils import select_columns_by_name +from narwhals.exceptions import ColumnNotFoundError from narwhals.typing import CompliantLazyFrame from narwhals.utils import 
Implementation from narwhals.utils import flatten @@ -197,6 +198,9 @@ def unique( *, keep: Literal["any", "none"] = "any", ) -> Self: + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) native_frame = self._native_frame if keep == "none": subset = subset or self.columns diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 73dd055ca..76ff68ae0 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -1,105 +1,74 @@ from __future__ import annotations -import re -from functools import lru_cache +from itertools import chain from typing import TYPE_CHECKING from typing import Any +from typing import Iterable +from typing import Literal +from typing import Sequence +from narwhals._duckdb.utils import native_to_narwhals_dtype +from narwhals._duckdb.utils import parse_exprs_and_named_exprs from narwhals.dependencies import get_duckdb +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation -from narwhals.utils import import_dtypes_module +from narwhals.utils import Version +from narwhals.utils import flatten +from narwhals.utils import generate_temporary_column_name +from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType + import duckdb import pandas as pd import pyarrow as pa from typing_extensions import Self + from narwhals._duckdb.expr import DuckDBExpr + from narwhals._duckdb.group_by import DuckDBGroupBy + from narwhals._duckdb.namespace import DuckDBNamespace from narwhals._duckdb.series import DuckDBInterchangeSeries from narwhals.dtypes import DType - from narwhals.utils import Version - - -@lru_cache(maxsize=16) -def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType: - dtypes = import_dtypes_module(version) - if 
duckdb_dtype == "HUGEINT": - return dtypes.Int128() - if duckdb_dtype == "BIGINT": - return dtypes.Int64() - if duckdb_dtype == "INTEGER": - return dtypes.Int32() - if duckdb_dtype == "SMALLINT": - return dtypes.Int16() - if duckdb_dtype == "TINYINT": - return dtypes.Int8() - if duckdb_dtype == "UHUGEINT": - return dtypes.UInt128() - if duckdb_dtype == "UBIGINT": - return dtypes.UInt64() - if duckdb_dtype == "UINTEGER": - return dtypes.UInt32() - if duckdb_dtype == "USMALLINT": - return dtypes.UInt16() - if duckdb_dtype == "UTINYINT": - return dtypes.UInt8() - if duckdb_dtype == "DOUBLE": - return dtypes.Float64() - if duckdb_dtype == "FLOAT": - return dtypes.Float32() - if duckdb_dtype == "VARCHAR": - return dtypes.String() - if duckdb_dtype == "DATE": - return dtypes.Date() - if duckdb_dtype == "TIMESTAMP": - return dtypes.Datetime() - if duckdb_dtype == "BOOLEAN": - return dtypes.Boolean() - if duckdb_dtype == "INTERVAL": - return dtypes.Duration() - if duckdb_dtype.startswith("STRUCT"): - matchstruc_ = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype) - return dtypes.Struct( - [ - dtypes.Field( - matchstruc_[i][0], - native_to_narwhals_dtype(matchstruc_[i][1], version), - ) - for i in range(len(matchstruc_)) - ] - ) - if match_ := re.match(r"(.*)\[\]$", duckdb_dtype): - return dtypes.List(native_to_narwhals_dtype(match_.group(1), version)) - if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype): - return dtypes.Array( - native_to_narwhals_dtype(match_.group(1), version), - int(match_.group(2)), - ) - if duckdb_dtype.startswith("DECIMAL("): - return dtypes.Decimal() - return dtypes.Unknown() # pragma: no cover -class DuckDBInterchangeFrame: +class DuckDBLazyFrame: _implementation = Implementation.DUCKDB def __init__( - self, df: Any, *, backend_version: tuple[int, ...], version: Version + self, + df: duckdb.DuckDBPyRelation, + *, + backend_version: tuple[int, ...], + version: Version, ) -> None: - self._native_frame = df + self._native_frame: duckdb.DuckDBPyRelation = 
df self._version = version self._backend_version = backend_version validate_backend_version(self._implementation, self._backend_version) - def __narwhals_dataframe__(self) -> Any: + def __narwhals_dataframe__(self) -> Any: # pragma: no cover + # Keep around for backcompat. + if self._version is not Version.V1: + msg = "__narwhals_dataframe__ is not implemented for DuckDBLazyFrame" + raise AttributeError(msg) + return self + + def __narwhals_lazyframe__(self) -> Any: return self def __native_namespace__(self: Self) -> ModuleType: return get_duckdb() # type: ignore[no-any-return] + def __narwhals_namespace__(self) -> DuckDBNamespace: + from narwhals._duckdb.namespace import DuckDBNamespace + + return DuckDBNamespace( + backend_version=self._backend_version, version=self._version + ) + def __getitem__(self, item: str) -> DuckDBInterchangeSeries: from narwhals._duckdb.series import DuckDBInterchangeSeries @@ -107,42 +76,101 @@ def __getitem__(self, item: str) -> DuckDBInterchangeSeries: self._native_frame.select(item), version=self._version ) + def collect(self) -> Any: + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError as exc: # pragma: no cover + msg = "PyArrow>=11.0.0 is required to collect `LazyFrame` backed by DuckDcollect `LazyFrame` backed by DuckDB" + raise ModuleNotFoundError(msg) from exc + + from narwhals._arrow.dataframe import ArrowDataFrame + + return ArrowDataFrame( + native_dataframe=self._native_frame.arrow(), + backend_version=parse_version(pa.__version__), + version=self._version, + ) + + def head(self, n: int) -> Self: + return self._from_native_frame(self._native_frame.limit(n)) + def select( self: Self, *exprs: Any, **named_exprs: Any, ) -> Self: - if named_exprs or not all(isinstance(x, str) for x in exprs): # pragma: no cover - msg = ( - "`select`-ing not by name is not supported for DuckDB backend.\n\n" - "If you would like to see this kind of object better supported in " - "Narwhals, please open a feature request 
" - "at https://github.com/narwhals-dev/narwhals/issues." + new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs) + if not new_columns_map: + # TODO(marco): return empty relation with 0 columns? + return self._from_native_frame(self._native_frame.limit(0)) + + if all(getattr(x, "_returns_scalar", False) for x in exprs) and all( + getattr(x, "_returns_scalar", False) for x in named_exprs.values() + ): + return self._from_native_frame( + self._native_frame.aggregate( + [val.alias(col) for col, val in new_columns_map.items()] + ) ) - raise NotImplementedError(msg) - return self._from_native_frame(self._native_frame.select(*exprs)) + return self._from_native_frame( + self._native_frame.select( + *(val.alias(col) for col, val in new_columns_map.items()) + ) + ) - def __getattr__(self, attr: str) -> Any: - if attr == "schema": - return { - column_name: native_to_narwhals_dtype(str(duckdb_dtype), self._version) - for column_name, duckdb_dtype in zip( - self._native_frame.columns, self._native_frame.types - ) - } - elif attr == "columns": - return self._native_frame.columns - - msg = ( # pragma: no cover - f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" - "If you would like to see this kind of object better supported in " - "Narwhals, please open a feature request " - "at https://github.com/narwhals-dev/narwhals/issues." 
+ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + columns_to_drop = parse_columns_to_drop( + compliant_frame=self, columns=columns, strict=strict + ) + selection = (col for col in self.columns if col not in columns_to_drop) + return self._from_native_frame(self._native_frame.select(*selection)) + + def lazy(self) -> Self: + return self + + def with_columns( + self: Self, + *exprs: Any, + **named_exprs: Any, + ) -> Self: + from duckdb import ColumnExpression + + new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs) + result = [] + for col in self._native_frame.columns: + if col in new_columns_map: + result.append(new_columns_map.pop(col).alias(col)) + else: + result.append(ColumnExpression(col)) + for col, value in new_columns_map.items(): + result.append(value.alias(col)) + return self._from_native_frame(self._native_frame.select(*result)) + + def filter(self, *predicates: DuckDBExpr, **constraints: Any) -> Self: + plx = self.__narwhals_namespace__() + expr = plx.all_horizontal( + *chain(predicates, (plx.col(name) == v for name, v in constraints.items())) ) - raise NotImplementedError(msg) # pragma: no cover + # `[0]` is safe as all_horizontal's expression only returns a single column + mask = expr._call(self)[0] + return self._from_native_frame(self._native_frame.filter(mask)) + + @property + def schema(self) -> dict[str, DType]: + return { + column_name: native_to_narwhals_dtype(str(duckdb_dtype), self._version) + for column_name, duckdb_dtype in zip( + self._native_frame.columns, self._native_frame.types + ) + } + + @property + def columns(self) -> list[str]: + return self._native_frame.columns # type: ignore[no-any-return] def to_pandas(self: Self) -> pd.DataFrame: + # only if version is v1, keep around for backcompat import pandas as pd # ignore-banned-import() if parse_version(pd.__version__) >= parse_version("1.0.0"): @@ -152,6 +180,7 @@ def to_pandas(self: Self) -> pd.DataFrame: raise 
NotImplementedError(msg) def to_arrow(self: Self) -> pa.Table: + # only if version is v1, keep around for backcompat return self._native_frame.arrow() def _change_version(self: Self, version: Version) -> Self: @@ -161,9 +190,68 @@ def _change_version(self: Self, version: Version) -> Self: def _from_native_frame(self: Self, df: Any) -> Self: return self.__class__( - df, version=self._version, backend_version=self._backend_version + df, backend_version=self._backend_version, version=self._version + ) + + def group_by(self: Self, *keys: str, drop_null_keys: bool) -> DuckDBGroupBy: + from narwhals._duckdb.group_by import DuckDBGroupBy + + if drop_null_keys: + msg = "todo" + raise NotImplementedError(msg) + + return DuckDBGroupBy( + compliant_frame=self, keys=list(keys), drop_null_keys=drop_null_keys + ) + + def rename(self: Self, mapping: dict[str, str]) -> Self: + df = self._native_frame + selection = [ + f"{col} as {mapping[col]}" if col in mapping else col for col in df.columns + ] + return self._from_native_frame(df.select(", ".join(selection))) + + def join( + self: Self, + other: Self, + *, + how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", + left_on: str | list[str] | None, + right_on: str | list[str] | None, + suffix: str, + ) -> Self: + if isinstance(left_on, str): + left_on = [left_on] + if isinstance(right_on, str): + right_on = [right_on] + + if how not in ("inner", "left"): + msg = "Only inner and left join is implemented for DuckDB" + raise NotImplementedError(msg) + + # help mypy + assert left_on is not None # noqa: S101 + assert right_on is not None # noqa: S101 + + conditions = [ + f"lhs.{left} = rhs.{right}" for left, right in zip(left_on, right_on) + ] + original_alias = self._native_frame.alias + condition = " and ".join(conditions) + rel = self._native_frame.set_alias("lhs").join( + other._native_frame.set_alias("rhs"), condition=condition, how=how ) + select = [f"lhs.{x}" for x in self._native_frame.columns] + for col in 
other._native_frame.columns: + if col in self._native_frame.columns and col not in right_on: + select.append(f"rhs.{col} as {col}{suffix}") + elif col not in right_on: + select.append(col) + + res = rel.select(", ".join(select)).set_alias(original_alias) + return self._from_native_frame(res) + def collect_schema(self) -> dict[str, DType]: return { column_name: native_to_narwhals_dtype(str(duckdb_dtype), self._version) @@ -171,3 +259,56 @@ def collect_schema(self) -> dict[str, DType]: self._native_frame.columns, self._native_frame.types ) } + + def unique(self, subset: Sequence[str] | None, keep: str) -> Self: + if subset is not None: + import duckdb + + rel = self._native_frame + # Sanitise input + if any(x not in rel.columns for x in subset): + msg = f"Columns {set(subset).difference(rel.columns)} not found in {rel.columns}." + raise ColumnNotFoundError(msg) + idx_name = f'"{generate_temporary_column_name(8, rel.columns)}"' + count_name = ( + f'"{generate_temporary_column_name(8, [*rel.columns, idx_name])}"' + ) + if keep == "none": + keep_condition = f"where {count_name}=1" + else: + keep_condition = f"where {idx_name}=1" + query = f""" + with cte as ( + select *, + row_number() over (partition by {",".join(subset)}) as {idx_name}, + count(*) over (partition by {",".join(subset)}) as {count_name} + from rel + ) + select * exclude ({idx_name}, {count_name}) from cte {keep_condition} + """ # noqa: S608 + return self._from_native_frame(duckdb.sql(query)) + return self._from_native_frame(self._native_frame.unique(", ".join(self.columns))) + + def sort( + self: Self, + by: str | Iterable[str], + *more_by: str, + descending: bool | Sequence[bool] = False, + nulls_last: bool = False, + ) -> Self: + flat_by = flatten([*flatten([by]), *more_by]) + if isinstance(descending, bool): + descending = [descending] * len(flat_by) + descending_str = ["desc" if x else "" for x in descending] + + result = self._native_frame.order( + ",".join( + ( + f"{col} {desc} nulls last" + if 
nulls_last + else f"{col} {desc} nulls first" + for col, desc in zip(flat_by, descending_str) + ) + ) + ) + return self._from_native_frame(result) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py new file mode 100644 index 000000000..3956e919d --- /dev/null +++ b/narwhals/_duckdb/expr.py @@ -0,0 +1,767 @@ +from __future__ import annotations + +import functools +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Literal +from typing import NoReturn +from typing import Sequence + +from narwhals._duckdb.utils import binary_operation_returns_scalar +from narwhals._duckdb.utils import get_column_name +from narwhals._duckdb.utils import maybe_evaluate +from narwhals._duckdb.utils import narwhals_to_native_dtype +from narwhals._expression_parsing import infer_new_root_output_names +from narwhals.typing import CompliantExpr +from narwhals.utils import Implementation + +if TYPE_CHECKING: + import duckdb + from typing_extensions import Self + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.namespace import DuckDBNamespace + from narwhals.dtypes import DType + from narwhals.utils import Version + + +class DuckDBExpr(CompliantExpr["duckdb.Expression"]): + _implementation = Implementation.DUCKDB + + def __init__( + self, + call: Callable[[DuckDBLazyFrame], list[duckdb.Expression]], + *, + depth: int, + function_name: str, + root_names: list[str] | None, + output_names: list[str] | None, + # Whether the expression is a length-1 Column resulting from + # a reduction, such as `nw.col('a').sum()` + returns_scalar: bool, + backend_version: tuple[int, ...], + version: Version, + kwargs: dict[str, Any], + ) -> None: + self._call = call + self._depth = depth + self._function_name = function_name + self._root_names = root_names + self._output_names = output_names + self._returns_scalar = returns_scalar + self._backend_version = backend_version + self._version = version + self._kwargs 
= kwargs + + def __call__(self, df: DuckDBLazyFrame) -> Sequence[duckdb.Expression]: + return self._call(df) + + def __narwhals_expr__(self) -> None: ... + + def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover + # Unused, just for compatibility with PandasLikeExpr + from narwhals._duckdb.namespace import DuckDBNamespace + + return DuckDBNamespace( + backend_version=self._backend_version, version=self._version + ) + + @classmethod + def from_column_names( + cls: type[Self], + *column_names: str, + backend_version: tuple[int, ...], + version: Version, + ) -> Self: + def func(_: DuckDBLazyFrame) -> list[duckdb.Expression]: + from duckdb import ColumnExpression + + return [ColumnExpression(col_name) for col_name in column_names] + + return cls( + func, + depth=0, + function_name="col", + root_names=list(column_names), + output_names=list(column_names), + returns_scalar=False, + backend_version=backend_version, + version=version, + kwargs={}, + ) + + def _from_call( + self, + call: Callable[..., duckdb.Expression], + expr_name: str, + *, + returns_scalar: bool, + **kwargs: Any, + ) -> Self: + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + results = [] + inputs = self._call(df) + _kwargs = {key: maybe_evaluate(df, value) for key, value in kwargs.items()} + for _input in inputs: + input_col_name = get_column_name( + df, _input, returns_scalar=self._returns_scalar + ) + if self._returns_scalar: + # TODO(marco): once WindowExpression is supported, then + # we may need to call it with `over(1)` here, + # depending on the context? + pass + + column_result = call(_input, **_kwargs) + column_result = column_result.alias(input_col_name) + if returns_scalar: + # TODO(marco): once WindowExpression is supported, then + # we may need to call it with `over(1)` here, + # depending on the context? 
+ pass + results.append(column_result) + return results + + root_names, output_names = infer_new_root_output_names(self, **kwargs) + + return self.__class__( + func, + depth=self._depth + 1, + function_name=f"{self._function_name}->{expr_name}", + root_names=root_names, + output_names=output_names, + returns_scalar=returns_scalar, + backend_version=self._backend_version, + version=self._version, + kwargs=kwargs, + ) + + def __and__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input & other, + "__and__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __or__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input | other, + "__or__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __add__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input + other, + "__add__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __truediv__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input / other, + "__truediv__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __floordiv__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__floordiv__(other), + "__floordiv__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __mod__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__mod__(other), + "__mod__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __sub__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input - other, + "__sub__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __mul__(self, other: DuckDBExpr) -> Self: + return 
self._from_call( + lambda _input, other: _input * other, + "__mul__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __pow__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input**other, + "__pow__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __lt__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input < other, + "__lt__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __gt__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input > other, + "__gt__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __le__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input <= other, + "__le__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __ge__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input >= other, + "__ge__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __eq__(self, other: DuckDBExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input == other, + "__eq__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __ne__(self, other: DuckDBExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input != other, + "__ne__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __invert__(self) -> Self: + return self._from_call( + lambda _input: ~_input, + "__invert__", + returns_scalar=self._returns_scalar, + ) + + def alias(self, name: str) -> Self: + def _alias(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + return [col.alias(name) for col in self._call(df)] + + # Define this one 
manually, so that we can + # override `output_names` and not increase depth + return self.__class__( + _alias, + depth=self._depth, + function_name=self._function_name, + root_names=self._root_names, + output_names=[name], + returns_scalar=self._returns_scalar, + backend_version=self._backend_version, + version=self._version, + kwargs={**self._kwargs, "name": name}, + ) + + def abs(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("abs", _input), + "abs", + returns_scalar=self._returns_scalar, + ) + + def mean(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("mean", _input), + "mean", + returns_scalar=True, + ) + + def skew(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("skewness", _input), + "skew", + returns_scalar=True, + ) + + def median(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("median", _input), + "median", + returns_scalar=True, + ) + + def all(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("bool_and", _input), + "all", + returns_scalar=True, + ) + + def any(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("bool_or", _input), + "any", + returns_scalar=True, + ) + + def quantile( + self, + quantile: float, + interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"], + ) -> Self: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + if interpolation == "linear": + return FunctionExpression( + "quantile_cont", _input, ConstantExpression(quantile) + ) + msg = "Only linear interpolation methods are supported for DuckDB quantile." 
+ raise NotImplementedError(msg) + + return self._from_call( + func, + "quantile", + returns_scalar=True, + ) + + def clip(self, lower_bound: Any, upper_bound: Any) -> Self: + from duckdb import FunctionExpression + + def func( + _input: duckdb.Expression, lower_bound: Any, upper_bound: Any + ) -> duckdb.Expression: + return FunctionExpression( + "greatest", + FunctionExpression("least", _input, upper_bound), + lower_bound, + ) + + return self._from_call( + func, + "clip", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def is_between( + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], + ) -> Self: + def func( + _input: duckdb.Expression, lower_bound: Any, upper_bound: Any + ) -> duckdb.Expression: + if closed == "left": + return (_input >= lower_bound) & (_input < upper_bound) + elif closed == "right": + return (_input > lower_bound) & (_input <= upper_bound) + elif closed == "none": + return (_input > lower_bound) & (_input < upper_bound) + return (_input >= lower_bound) & (_input <= upper_bound) + + return self._from_call( + func, + "is_between", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def sum(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("sum", _input), + "sum", + returns_scalar=True, + ) + + def count(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("count", _input), + "count", + returns_scalar=True, + ) + + def len(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("count"), + "len", + returns_scalar=True, + ) + + def std(self, ddof: int) -> Self: + from duckdb import FunctionExpression + + if ddof == 1: + func = "stddev_samp" + elif ddof == 0: + func = "stddev_pop" + else: + msg = f"std with 
ddof {ddof} is not supported in DuckDB" + raise NotImplementedError(msg) + return self._from_call( + lambda _input: FunctionExpression(func, _input), + "std", + returns_scalar=True, + ) + + def var(self, ddof: int) -> Self: + from duckdb import FunctionExpression + + if ddof == 1: + func = "var_samp" + elif ddof == 0: + func = "var_pop" + else: + msg = f"var with ddof {ddof} is not supported in DuckDB" + raise NotImplementedError(msg) + return self._from_call( + lambda _input: FunctionExpression(func, _input), + "var", + returns_scalar=True, + ) + + def max(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("max", _input), + "max", + returns_scalar=True, + ) + + def min(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("min", _input), + "min", + returns_scalar=True, + ) + + def is_null(self) -> Self: + return self._from_call( + lambda _input: _input.isnull(), + "is_null", + returns_scalar=self._returns_scalar, + ) + + def is_in(self, other: Sequence[Any]) -> Self: + from duckdb import ConstantExpression + + return self._from_call( + lambda _input: functools.reduce( + lambda x, y: x | _input.isin(ConstantExpression(y)), + other[1:], + _input.isin(ConstantExpression(other[0])), + ), + "is_in", + returns_scalar=self._returns_scalar, + ) + + def round(self, decimals: int) -> Self: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression( + "round", _input, ConstantExpression(decimals) + ), + "round", + returns_scalar=self._returns_scalar, + ) + + def fill_null(self, value: Any, strategy: Any, limit: int | None) -> Self: + from duckdb import CoalesceOperator + from duckdb import ConstantExpression + + if strategy is not None: + msg = "todo" + raise NotImplementedError(msg) + + return self._from_call( + lambda _input: CoalesceOperator(_input, 
ConstantExpression(value)), + "fill_null", + returns_scalar=self._returns_scalar, + ) + + def cast( + self: Self, + dtype: DType | type[DType], + ) -> Self: + def func(_input: Any, dtype: DType | type[DType]) -> Any: + native_dtype = narwhals_to_native_dtype(dtype, self._version) + return _input.cast(native_dtype) + + return self._from_call( + func, + "cast", + dtype=dtype, + returns_scalar=self._returns_scalar, + ) + + @property + def str(self: Self) -> DuckDBExprStringNamespace: + return DuckDBExprStringNamespace(self) + + @property + def dt(self: Self) -> DuckDBExprDateTimeNamespace: + return DuckDBExprDateTimeNamespace(self) + + +class DuckDBExprStringNamespace: + def __init__(self, expr: DuckDBExpr) -> None: + self._compliant_expr = expr + + def starts_with(self, prefix: str) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "starts_with", _input, ConstantExpression(prefix) + ), + "starts_with", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def ends_with(self, suffix: str) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "ends_with", _input, ConstantExpression(suffix) + ), + "ends_with", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + if literal: + return FunctionExpression("contains", _input, ConstantExpression(pattern)) + return FunctionExpression( + "regexp_matches", _input, ConstantExpression(pattern) + ) + + return self._compliant_expr._from_call( + func, + "contains", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def slice(self, offset: int, 
length: int) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + return FunctionExpression( + "array_slice", + _input, + ConstantExpression(offset + 1) + if offset >= 0 + else FunctionExpression("length", _input) + offset + 1, + FunctionExpression("length", _input) + if length is None + else ConstantExpression(length) + offset, + ) + + return self._compliant_expr._from_call( + func, + "slice", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_lowercase(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("lower", _input), + "to_lowercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_uppercase(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("upper", _input), + "to_uppercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def strip_chars(self, characters: str | None) -> DuckDBExpr: + import string + + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "trim", + _input, + ConstantExpression( + string.whitespace if characters is None else characters + ), + ), + "strip_chars", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace_all( + self, pattern: str, value: str, *, literal: bool = False + ) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + if literal is False: + msg = "`replace_all` for DuckDB currently only supports `literal=True`." 
+ raise NotImplementedError(msg) + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "replace", + _input, + ConstantExpression(pattern), + ConstantExpression(value), + ), + "replace_all", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace(self, pattern: str, value: str, *, literal: bool, n: int) -> NoReturn: + msg = "`replace` is currently not supported for DuckDB" + raise NotImplementedError(msg) + + +class DuckDBExprDateTimeNamespace: + def __init__(self, expr: DuckDBExpr) -> None: + self._compliant_expr = expr + + def year(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("year", _input), + "year", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def month(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("month", _input), + "month", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def day(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("day", _input), + "day", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def hour(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("hour", _input), + "hour", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def minute(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("minute", _input), + "minute", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def second(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("second", _input), + "second", + 
returns_scalar=self._compliant_expr._returns_scalar, + ) + + def millisecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("millisecond", _input) + - FunctionExpression("second", _input) * 1_000, + "millisecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def microsecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("microsecond", _input) + - FunctionExpression("second", _input) * 1_000_000, + "microsecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def nanosecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("nanosecond", _input) + - FunctionExpression("second", _input) * 1_000_000_000, + "nanosecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) diff --git a/narwhals/_duckdb/group_by.py b/narwhals/_duckdb/group_by.py new file mode 100644 index 000000000..0b312ff03 --- /dev/null +++ b/narwhals/_duckdb/group_by.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from copy import copy +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import parse_into_exprs + +if TYPE_CHECKING: + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.typing import IntoDuckDBExpr + + +class DuckDBGroupBy: + def __init__( + self, + compliant_frame: DuckDBLazyFrame, + keys: list[str], + drop_null_keys: bool, # noqa: FBT001 + ) -> None: + self._compliant_frame = compliant_frame + self._keys = keys + + def agg( + self, + *aggs: IntoDuckDBExpr, + **named_aggs: IntoDuckDBExpr, + ) -> DuckDBLazyFrame: + exprs = parse_into_exprs( + *aggs, + namespace=self._compliant_frame.__narwhals_namespace__(), + **named_aggs, + ) + output_names: list[str] = copy(self._keys) + for expr in exprs: + if expr._output_names 
is None: # pragma: no cover + msg = ( + "Anonymous expressions are not supported in group_by.agg.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names.extend(expr._output_names) + + agg_columns = [ + *self._keys, + *(x for expr in exprs for x in expr(self._compliant_frame)), + ] + try: + return self._compliant_frame._from_native_frame( + self._compliant_frame._native_frame.aggregate( + agg_columns, group_expr=",".join(self._keys) + ) + ) + except ValueError as exc: # pragma: no cover + msg = "Failed to aggregated - does your aggregation function return a scalar?" + raise RuntimeError(msg) from exc diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py new file mode 100644 index 000000000..bcd7eff6d --- /dev/null +++ b/narwhals/_duckdb/namespace.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import functools +import operator +from functools import reduce +from typing import TYPE_CHECKING +from typing import Any +from typing import Literal +from typing import Sequence + +from narwhals._duckdb.expr import DuckDBExpr +from narwhals._duckdb.utils import narwhals_to_native_dtype +from narwhals._expression_parsing import combine_root_names +from narwhals._expression_parsing import parse_into_exprs +from narwhals._expression_parsing import reduce_output_names +from narwhals.typing import CompliantNamespace + +if TYPE_CHECKING: + import duckdb + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.typing import IntoDuckDBExpr + from narwhals.dtypes import DType + from narwhals.utils import Version + + +def get_column_name(df: DuckDBLazyFrame, column: duckdb.Expression) -> str: + return str(df._native_frame.select(column).columns[0]) + + +class DuckDBNamespace(CompliantNamespace["duckdb.Expression"]): + def __init__(self, *, backend_version: tuple[int, ...], version: Version) -> None: + self._backend_version = backend_version + 
self._version = version + + def all(self) -> DuckDBExpr: + def _all(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + from duckdb import ColumnExpression + + return [ColumnExpression(col_name) for col_name in df.columns] + + return DuckDBExpr( + call=_all, + depth=0, + function_name="all", + root_names=None, + output_names=None, + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + + def concat( + self, + items: Sequence[DuckDBLazyFrame], + *, + how: Literal["horizontal", "vertical", "diagonal"], + ) -> DuckDBLazyFrame: + if how == "horizontal": + msg = "horizontal concat not supported for duckdb. Please join instead" + raise TypeError(msg) + if how == "diagonal": + msg = "Not implemented yet" + raise NotImplementedError(msg) + first = items[0] + schema = first.schema + if how == "vertical" and not all(x.schema == schema for x in items[1:]): + msg = "inputs should all have the same schema" + raise TypeError(msg) + res = functools.reduce( + lambda x, y: x.union(y), (item._native_frame for item in items) + ) + return first._from_native_frame(res) + + def all_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [reduce(operator.and_, cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="all_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def any_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c 
for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [reduce(operator.or_, cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="or_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def max_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + from duckdb import FunctionExpression + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [FunctionExpression("greatest", *cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="max_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def min_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + from duckdb import FunctionExpression + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [FunctionExpression("least", *cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="min_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def col(self, *column_names: str) -> DuckDBExpr: + 
return DuckDBExpr.from_column_names( + *column_names, backend_version=self._backend_version, version=self._version + ) + + def lit(self, value: Any, dtype: DType | None) -> DuckDBExpr: + from duckdb import ConstantExpression + + def func(_df: DuckDBLazyFrame) -> list[duckdb.Expression]: + if dtype is not None: + return [ + ConstantExpression(value) + .cast(narwhals_to_native_dtype(dtype, version=self._version)) + .alias("literal") + ] + return [ConstantExpression(value).alias("literal")] + + return DuckDBExpr( + func, + depth=0, + function_name="lit", + root_names=None, + output_names=["literal"], + returns_scalar=True, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + + def len(self) -> DuckDBExpr: + def func(_df: DuckDBLazyFrame) -> list[duckdb.Expression]: + from duckdb import FunctionExpression + + return [FunctionExpression("count").alias("len")] + + return DuckDBExpr( + call=func, + depth=0, + function_name="len", + root_names=None, + output_names=["len"], + returns_scalar=True, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) diff --git a/narwhals/_duckdb/series.py b/narwhals/_duckdb/series.py index dc7485e98..bec9e0e08 100644 --- a/narwhals/_duckdb/series.py +++ b/narwhals/_duckdb/series.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from typing import Any -from narwhals._duckdb.dataframe import native_to_narwhals_dtype +from narwhals._duckdb.utils import native_to_narwhals_dtype from narwhals.dependencies import get_duckdb if TYPE_CHECKING: diff --git a/narwhals/_duckdb/typing.py b/narwhals/_duckdb/typing.py new file mode 100644 index 000000000..65d1ba3a7 --- /dev/null +++ b/narwhals/_duckdb/typing.py @@ -0,0 +1,16 @@ +from __future__ import annotations # pragma: no cover + +from typing import TYPE_CHECKING # pragma: no cover +from typing import Union # pragma: no cover + +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from 
from __future__ import annotations

import re
from functools import lru_cache
from typing import TYPE_CHECKING
from typing import Any

from narwhals.dtypes import DType
from narwhals.exceptions import InvalidIntoExprError
from narwhals.utils import import_dtypes_module
from narwhals.utils import isinstance_or_issubclass

if TYPE_CHECKING:
    import duckdb

    from narwhals._duckdb.dataframe import DuckDBLazyFrame
    from narwhals._duckdb.expr import DuckDBExpr
    from narwhals._duckdb.typing import IntoDuckDBExpr
    from narwhals.utils import Version


def get_column_name(
    df: DuckDBLazyFrame, column: duckdb.Expression, *, returns_scalar: bool
) -> str:
    """Return the column name DuckDB assigns to ``column`` when applied to ``df``.

    The name is read from the first column of a relation derived from
    ``df._native_frame``: ``aggregate`` is used when ``returns_scalar`` is
    true, ``select`` otherwise.
    """
    if returns_scalar:
        return str(df._native_frame.aggregate([column]).columns[0])
    return str(df._native_frame.select(column).columns[0])


def maybe_evaluate(df: DuckDBLazyFrame, obj: Any) -> Any:
    """Turn ``obj`` into an operand for a DuckDB expression.

    * ``DuckDBExpr``: evaluated against ``df`` and its single native
      expression is returned.
    * Narwhals dtype (instance or class): returned unchanged.
    * Anything else: wrapped in ``duckdb.ConstantExpression``.

    Raises:
        NotImplementedError: If a ``DuckDBExpr`` yields more than one column,
            or is flagged as returning a scalar (a reduction).
    """
    import duckdb

    from narwhals._duckdb.expr import DuckDBExpr

    if isinstance(obj, DuckDBExpr):
        column_results = obj._call(df)
        if len(column_results) != 1:  # pragma: no cover
            msg = "Multi-output expressions (e.g. `nw.all()` or `nw.col('a', 'b')`) not supported in this context"
            raise NotImplementedError(msg)
        column_result = column_results[0]
        if obj._returns_scalar:
            msg = "Reductions are not yet supported for DuckDB, at least until they implement duckdb.WindowExpression"
            raise NotImplementedError(msg)
        return column_result
    if isinstance_or_issubclass(obj, DType):
        return obj
    return duckdb.ConstantExpression(obj)


def parse_exprs_and_named_exprs(
    df: DuckDBLazyFrame,
    *exprs: IntoDuckDBExpr,
    **named_exprs: IntoDuckDBExpr,
) -> dict[str, duckdb.Expression]:
    """Evaluate expressions against ``df`` and key the results by output name.

    Positional expressions take their names from the string itself, from
    ``expr._output_names`` when set, or otherwise from ``get_column_name``.
    Keyword expressions are stored under their keyword. Later entries
    overwrite earlier ones that share a name.

    Raises:
        AssertionError: If a keyword expression does not produce exactly one
            column.
    """
    # Maps output name -> single native expression (not a list of them).
    result_columns: dict[str, duckdb.Expression] = {}
    for expr in exprs:
        column_list = _columns_from_expr(df, expr)
        if isinstance(expr, str):  # pragma: no cover
            output_names = [expr]
        elif expr._output_names is None:
            output_names = [
                get_column_name(df, col, returns_scalar=expr._returns_scalar)
                for col in column_list
            ]
        else:
            output_names = expr._output_names
        result_columns.update(zip(output_names, column_list))
    for col_alias, expr in named_exprs.items():
        columns_list = _columns_from_expr(df, expr)
        if len(columns_list) != 1:  # pragma: no cover
            msg = "Named expressions must return a single column"
            raise AssertionError(msg)
        result_columns[col_alias] = columns_list[0]
    return result_columns


def _columns_from_expr(
    df: DuckDBLazyFrame, expr: IntoDuckDBExpr
) -> list[duckdb.Expression]:
    """Return the native expressions produced by ``expr`` for ``df``.

    A string is treated as a column name; an object exposing
    ``__narwhals_expr__`` is evaluated through its ``_call``.

    Raises:
        AssertionError: If the number of produced columns disagrees with
            ``expr._output_names``.
        InvalidIntoExprError: If ``expr`` is neither a string nor an
            expression.
    """
    if isinstance(expr, str):  # pragma: no cover
        from duckdb import ColumnExpression

        return [ColumnExpression(expr)]
    elif hasattr(expr, "__narwhals_expr__"):
        col_output_list = expr._call(df)
        if expr._output_names is not None and (
            len(col_output_list) != len(expr._output_names)
        ):  # pragma: no cover
            msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues"
            raise AssertionError(msg)
        return col_output_list
    else:
        raise InvalidIntoExprError.from_invalid_type(type(expr))


@lru_cache(maxsize=16)
def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType:
    """Translate a DuckDB type name into the matching Narwhals dtype.

    ``STRUCT``, list (``T[]``) and fixed-size array (``T[n]``) names are
    parsed with regular expressions and their inner types are translated
    recursively. Any ``DECIMAL(...)`` maps to ``Decimal``; names that are not
    recognised map to ``Unknown``. Results are memoised per
    ``(duckdb_dtype, version)`` pair.
    """
    dtypes = import_dtypes_module(version)
    if duckdb_dtype == "HUGEINT":
        return dtypes.Int128()
    if duckdb_dtype == "BIGINT":
        return dtypes.Int64()
    if duckdb_dtype == "INTEGER":
        return dtypes.Int32()
    if duckdb_dtype == "SMALLINT":
        return dtypes.Int16()
    if duckdb_dtype == "TINYINT":
        return dtypes.Int8()
    if duckdb_dtype == "UHUGEINT":
        return dtypes.UInt128()
    if duckdb_dtype == "UBIGINT":
        return dtypes.UInt64()
    if duckdb_dtype == "UINTEGER":
        return dtypes.UInt32()
    if duckdb_dtype == "USMALLINT":
        return dtypes.UInt16()
    if duckdb_dtype == "UTINYINT":
        return dtypes.UInt8()
    if duckdb_dtype == "DOUBLE":
        return dtypes.Float64()
    if duckdb_dtype == "FLOAT":
        return dtypes.Float32()
    if duckdb_dtype == "VARCHAR":
        return dtypes.String()
    if duckdb_dtype == "DATE":
        return dtypes.Date()
    if duckdb_dtype == "TIMESTAMP":
        return dtypes.Datetime()
    if duckdb_dtype == "BOOLEAN":
        return dtypes.Boolean()
    if duckdb_dtype == "INTERVAL":
        return dtypes.Duration()
    if duckdb_dtype.startswith("STRUCT"):
        # Each "<name> <type>" pair inside the STRUCT(...) text is one field.
        struct_fields = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype)
        return dtypes.Struct(
            [
                dtypes.Field(
                    field_name, native_to_narwhals_dtype(field_dtype, version)
                )
                for field_name, field_dtype in struct_fields
            ]
        )
    if match_ := re.match(r"(.*)\[\]$", duckdb_dtype):
        return dtypes.List(native_to_narwhals_dtype(match_.group(1), version))
    if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype):
        return dtypes.Array(
            native_to_narwhals_dtype(match_.group(1), version),
            int(match_.group(2)),
        )
    if duckdb_dtype.startswith("DECIMAL("):
        return dtypes.Decimal()
    return dtypes.Unknown()  # pragma: no cover


def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> str:
    """Translate a Narwhals dtype (instance or class) into a DuckDB type name.

    Raises:
        NotImplementedError: For ``Categorical``, ``Datetime``, ``Duration``,
            ``List``, ``Struct`` and ``Array``, which are not translated here.
        AssertionError: If ``dtype`` matches none of the known dtypes.
    """
    dtypes = import_dtypes_module(version)
    # DOUBLE is DuckDB's 8-byte float and FLOAT its 4-byte float. This must
    # stay the inverse of `native_to_narwhals_dtype` (DOUBLE -> Float64,
    # FLOAT -> Float32); the two were previously swapped here.
    if isinstance_or_issubclass(dtype, dtypes.Float64):
        return "DOUBLE"
    if isinstance_or_issubclass(dtype, dtypes.Float32):
        return "FLOAT"
    if isinstance_or_issubclass(dtype, dtypes.Int64):
        return "BIGINT"
    if isinstance_or_issubclass(dtype, dtypes.Int32):
        return "INT"
    if isinstance_or_issubclass(dtype, dtypes.Int16):
        return "SMALLINT"
    if isinstance_or_issubclass(dtype, dtypes.Int8):
        return "TINYINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt64):
        return "UBIGINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt32):
        return "UINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt16):  # pragma: no cover
        return "USMALLINT"
    if isinstance_or_issubclass(dtype, dtypes.UInt8):  # pragma: no cover
        return "UTINYINT"
    if isinstance_or_issubclass(dtype, dtypes.String):
        return "VARCHAR"
    if isinstance_or_issubclass(dtype, dtypes.Boolean):  # pragma: no cover
        return "BOOLEAN"
    if isinstance_or_issubclass(dtype, dtypes.Categorical):
        msg = "Categorical not supported by DuckDB"
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Datetime):
        # Not implemented yet: `time_unit` / `time_zone` of the dtype would
        # have to be reflected in the native type.
        msg = "todo"
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Duration):  # pragma: no cover
        # Not implemented yet: `time_unit` of the dtype would have to be
        # reflected in the native type.
        msg = "todo"
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Date):  # pragma: no cover
        return "DATE"
    if isinstance_or_issubclass(dtype, dtypes.List):
        msg = "todo"
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Struct):  # pragma: no cover
        msg = "todo"
        raise NotImplementedError(msg)
    if isinstance_or_issubclass(dtype, dtypes.Array):  # pragma: no cover
        msg = "todo"
        raise NotImplementedError(msg)
    msg = f"Unknown dtype: {dtype}"  # pragma: no cover
    raise AssertionError(msg)


def binary_operation_returns_scalar(lhs: DuckDBExpr, rhs: DuckDBExpr | Any) -> bool:
    """Tell whether combining ``lhs`` and ``rhs`` yields a scalar result.

    The result is scalar only when both operands are.
    """
    # If `rhs` is a DuckDBExpr, we look at `_returns_scalar`. If it isn't,
    # it means that it was a scalar (e.g. nw.col('a') + 1), and so we default
    # to `True`.
    return lhs._returns_scalar and getattr(rhs, "_returns_scalar", True)
Implementation.DUCKDB, ): native_frame = native_namespace.read_parquet(source, **kwargs) elif implementation is Implementation.PYARROW: diff --git a/narwhals/translate.py b/narwhals/translate.py index 77c83b548..8d0805a26 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -698,13 +698,13 @@ def _from_native_impl( # noqa: PLR0915 # DuckDB elif is_duckdb_relation(native_object): - from narwhals._duckdb.dataframe import DuckDBInterchangeFrame + from narwhals._duckdb.dataframe import DuckDBLazyFrame if eager_only or series_only: # pragma: no cover if not pass_through: msg = ( "Cannot only use `series_only=True` or `eager_only=False` " - "with DuckDB Relation" + "with DuckDBPyRelation" ) else: return native_object @@ -712,11 +712,18 @@ def _from_native_impl( # noqa: PLR0915 import duckdb # ignore-banned-import backend_version = parse_version(duckdb.__version__) - return DataFrame( - DuckDBInterchangeFrame( - native_object, version=version, backend_version=backend_version + if version is Version.V1: + return DataFrame( + DuckDBLazyFrame( + native_object, backend_version=backend_version, version=version + ), + level="interchange", + ) + return LazyFrame( + DuckDBLazyFrame( + native_object, backend_version=backend_version, version=version ), - level="interchange", + level="full", ) # Ibis diff --git a/pyproject.toml b/pyproject.toml index daa21c3ee..6c33c09bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,6 +116,7 @@ lint.ignore = [ "E501", "FIX", "ISC001", + "PD003", "PD010", "PD901", # This is a auxiliary library so dataframe variables have no concrete business meaning "PLR0911", diff --git a/tests/conftest.py b/tests/conftest.py index 28fbc7610..dee762705 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,10 +13,7 @@ import pytest if TYPE_CHECKING: - from narwhals.typing import IntoDataFrame - from narwhals.typing import IntoFrame - -if TYPE_CHECKING: + import duckdb from pyspark.sql import SparkSession from narwhals.typing import 
def duckdb_lazy_constructor(obj: Any) -> duckdb.DuckDBPyRelation:
    """Build a DuckDB relation from ``obj`` by way of a Polars ``LazyFrame``."""
    import duckdb

    # NOTE(review): `_df` looks unused, but the string handed to
    # `duckdb.table` below spells the same name. Presumably DuckDB resolves
    # that string to this local variable (replacement scan), so the variable
    # name and the string must be kept in sync -- confirm before renaming.
    _df = pl.LazyFrame(obj)
    return duckdb.table("_df")
a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -38,6 +38,8 @@ def test_arithmetic_expr( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor) and attr == "__floordiv__": + request.applymarker(pytest.mark.xfail) if attr == "__mod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): @@ -244,7 +246,9 @@ def test_arithmetic_expr_left_literal( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "dask" in str(constructor) and DASK_VERSION < (2024, 10): + if ("duckdb" in str(constructor) and attr == "__floordiv__") or ( + "dask" in str(constructor) and DASK_VERSION < (2024, 10) + ): request.applymarker(pytest.mark.xfail) if attr == "__mod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index e956dd455..b6ce43573 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -13,6 +13,7 @@ from tests.utils import PANDAS_VERSION from tests.utils import PYARROW_VERSION from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import assert_equal_data from tests.utils import is_windows @@ -59,6 +60,8 @@ def test_cast( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION <= ( 15, ): # pragma: no cover @@ -109,18 +112,18 @@ def test_cast( def test_cast_series( - constructor: Constructor, + constructor_eager: ConstructorEager, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION <= ( + if "pyarrow_table_constructor" in str(constructor_eager) and PYARROW_VERSION <= ( 15, ): # pragma: no cover request.applymarker(pytest.mark.xfail) - if 
"modin_constructor" in str(constructor): + if "modin_constructor" in str(constructor_eager): # TODO(unassigned): in modin, we end up with `' None: def test_cast_raises_for_unknown_dtype( constructor: Constructor, request: pytest.FixtureRequest ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,): # Unsupported cast from string to dictionary using function cast_dictionary request.applymarker(pytest.mark.xfail) @@ -196,6 +201,7 @@ def test_cast_datetime_tz_aware( ) -> None: if ( "dask" in str(constructor) + or "duckdb" in str(constructor) or "cudf" in str(constructor) # https://github.com/rapidsai/cudf/issues/16973 or ("pyarrow_table" in str(constructor) and is_windows()) ): @@ -222,7 +228,9 @@ def test_cast_datetime_tz_aware( def test_cast_struct(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if any(backend in str(constructor) for backend in ("dask", "modin", "cudf")): + if any( + backend in str(constructor) for backend in ("dask", "modin", "cudf", "duckdb") + ): request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): diff --git a/tests/expr_and_series/concat_str_test.py b/tests/expr_and_series/concat_str_test.py index 26366d2f2..7c9f259ba 100644 --- a/tests/expr_and_series/concat_str_test.py +++ b/tests/expr_and_series/concat_str_test.py @@ -21,8 +21,14 @@ ], ) def test_concat_str( - constructor: Constructor, *, ignore_nulls: bool, expected: list[str] + constructor: Constructor, + *, + ignore_nulls: bool, + expected: list[str], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = ( df.select( diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py index aa4235549..6b3cf5b41 100644 --- a/tests/expr_and_series/convert_time_zone_test.py 
+++ b/tests/expr_and_series/convert_time_zone_test.py @@ -28,6 +28,7 @@ def test_convert_time_zone( or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -84,6 +85,7 @@ def test_convert_time_zone_from_none( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) ): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 7): diff --git a/tests/expr_and_series/cum_count_test.py b/tests/expr_and_series/cum_count_test.py index 6ddf6c991..1a2377f34 100644 --- a/tests/expr_and_series/cum_count_test.py +++ b/tests/expr_and_series/cum_count_test.py @@ -21,6 +21,8 @@ def test_cum_count_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) name = "reverse_cum_count" if reverse else "cum_count" df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/cum_max_test.py b/tests/expr_and_series/cum_max_test.py index 054537d34..22b7c73fa 100644 --- a/tests/expr_and_series/cum_max_test.py +++ b/tests/expr_and_series/cum_max_test.py @@ -23,6 +23,8 @@ def test_cum_max_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cum_min_test.py b/tests/expr_and_series/cum_min_test.py index bb92f5b9d..b34672219 100644 --- a/tests/expr_and_series/cum_min_test.py +++ 
b/tests/expr_and_series/cum_min_test.py @@ -23,6 +23,8 @@ def test_cum_min_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cum_prod_test.py b/tests/expr_and_series/cum_prod_test.py index 1d5816ff2..4dd5207dc 100644 --- a/tests/expr_and_series/cum_prod_test.py +++ b/tests/expr_and_series/cum_prod_test.py @@ -23,6 +23,8 @@ def test_cum_prod_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cum_sum_test.py b/tests/expr_and_series/cum_sum_test.py index 8df3396bc..5878222fb 100644 --- a/tests/expr_and_series/cum_sum_test.py +++ b/tests/expr_and_series/cum_sum_test.py @@ -18,6 +18,8 @@ def test_cum_sum_expr( request: pytest.FixtureRequest, constructor: Constructor, *, reverse: bool ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/diff_test.py b/tests/expr_and_series/diff_test.py index da433f7ad..f7730a2d4 100644 --- a/tests/expr_and_series/diff_test.py +++ b/tests/expr_and_series/diff_test.py @@ -22,6 +22,8 @@ def test_diff( if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION < (13,): # pc.pairwisediff is available since pyarrow 13.0.0 request.applymarker(pytest.mark.xfail) + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = 
df.with_columns(c_diff=nw.col("c").diff()).filter(nw.col("i") > 0) expected = { diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index ad5f8dc3f..e1af276e4 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -49,6 +49,8 @@ def test_datetime_attributes( request.applymarker(pytest.mark.xfail) if attribute == "date" and "cudf" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor) and attribute in ("date", "weekday", "ordinal_day"): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(getattr(nw.col("a").dt, attribute)()) @@ -118,6 +120,7 @@ def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> No "pandas_nullable_constructor", "cudf", "modin_constructor", + "duckdb", ) ): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/dt/datetime_duration_test.py b/tests/expr_and_series/dt/datetime_duration_test.py index 09f227c79..bda3e4703 100644 --- a/tests/expr_and_series/dt/datetime_duration_test.py +++ b/tests/expr_and_series/dt/datetime_duration_test.py @@ -46,6 +46,8 @@ def test_duration_attributes( ) -> None: if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index e205d8179..b7e20519f 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -50,6 +50,8 @@ def test_timestamp_datetimes( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if 
original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < ( @@ -90,6 +92,8 @@ def test_timestamp_datetimes_tz_aware( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if ( (any(x in str(constructor) for x in ("pyarrow",)) and is_windows()) or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) @@ -136,6 +140,8 @@ def test_timestamp_dates( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if any( x in str(constructor) for x in ( @@ -161,6 +167,8 @@ def test_timestamp_dates( def test_timestamp_invalid_date( request: pytest.FixtureRequest, constructor: Constructor ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor): request.applymarker(pytest.mark.xfail) data_str = {"a": ["x", "y", None]} diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py index 629b39806..6fa500024 100644 --- a/tests/expr_and_series/dt/to_string_test.py +++ b/tests/expr_and_series/dt/to_string_test.py @@ -59,7 +59,11 @@ def test_dt_to_string_series(constructor_eager: ConstructorEager, fmt: str) -> N ], ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") -def test_dt_to_string_expr(constructor: Constructor, fmt: str) -> None: +def test_dt_to_string_expr( + constructor: Constructor, fmt: str, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) input_frame = nw.from_native(constructor(data)) expected_col = [datetime.strftime(d, fmt) for d in data["a"]] @@ -132,8 +136,13 @@ def 
test_dt_to_string_iso_local_datetime_series( ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_dt_to_string_iso_local_datetime_expr( - constructor: Constructor, data: datetime, expected: str + constructor: Constructor, + data: datetime, + expected: str, + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) result = nw.from_native(df).with_columns( @@ -166,8 +175,13 @@ def test_dt_to_string_iso_local_date_series( ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_dt_to_string_iso_local_date_expr( - constructor: Constructor, data: datetime, expected: str + constructor: Constructor, + data: datetime, + expected: str, + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) result = nw.from_native(df).with_columns( nw.col("a").dt.to_string("%Y-%m-%d").alias("b") diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 57f767d4d..58ef5c890 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -47,7 +47,11 @@ def test_fill_null_exceptions(constructor: Constructor) -> None: df.with_columns(nw.col("a").fill_null(strategy="invalid")) # type: ignore # noqa: PGH003 -def test_fill_null_strategies_with_limit_as_none(constructor: Constructor) -> None: +def test_fill_null_strategies_with_limit_as_none( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data_limits = { "a": [1, None, None, None, 5, 6, None, None, None, 10], "b": ["a", None, None, None, "b", "c", None, None, None, "d"], @@ -113,7 +117,11 @@ def test_fill_null_strategies_with_limit_as_none(constructor: Constructor) -> No assert_equal_data(result_backward, expected_backward) 
-def test_fill_null_limits(constructor: Constructor) -> None: +def test_fill_null_limits( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) context: Any = ( pytest.raises(NotImplementedError, match="The limit keyword is not supported") if "cudf" in str(constructor) diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py index d4ce3461f..fe8b45bf1 100644 --- a/tests/expr_and_series/is_duplicated_test.py +++ b/tests/expr_and_series/is_duplicated_test.py @@ -1,12 +1,18 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_is_duplicated_expr(constructor: Constructor) -> None: +def test_is_duplicated_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) result = df.select(nw.col("a", "b").is_duplicated(), "index").sort("index") @@ -14,7 +20,11 @@ def test_is_duplicated_expr(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_is_duplicated_w_nulls_expr(constructor: Constructor) -> None: +def test_is_duplicated_w_nulls_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, None], "b": [1, None, None], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) result = df.select(nw.col("a", "b").is_duplicated(), "index").sort("index") diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index 270ba7d52..7718ed1a7 100644 --- a/tests/expr_and_series/is_finite_test.py +++ 
b/tests/expr_and_series/is_finite_test.py @@ -11,7 +11,9 @@ @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -def test_is_finite_expr(constructor: Constructor) -> None: +def test_is_finite_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) or "pyarrow_table" in str(constructor): expected = {"a": [False, False, True, None]} elif ( diff --git a/tests/expr_and_series/is_first_distinct_test.py b/tests/expr_and_series/is_first_distinct_test.py index 7084fb3fb..786f2ade7 100644 --- a/tests/expr_and_series/is_first_distinct_test.py +++ b/tests/expr_and_series/is_first_distinct_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,11 @@ } -def test_is_first_distinct_expr(constructor: Constructor) -> None: +def test_is_first_distinct_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().is_first_distinct()) expected = { diff --git a/tests/expr_and_series/is_last_distinct_test.py b/tests/expr_and_series/is_last_distinct_test.py index b91c171d3..c5d73c8d7 100644 --- a/tests/expr_and_series/is_last_distinct_test.py +++ b/tests/expr_and_series/is_last_distinct_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,11 @@ } -def test_is_last_distinct_expr(constructor: Constructor) -> None: +def test_is_last_distinct_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = 
nw.from_native(constructor(data)) result = df.select(nw.all().is_last_distinct()) expected = { diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index 806dc7535..7bae35a52 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -24,7 +24,9 @@ ] -def test_nan(constructor: Constructor) -> None: +def test_nan(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data_na = {"int": [0, 1, None]} df = nw.from_native(constructor(data_na)).with_columns( float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") / nw.col("int") @@ -93,7 +95,9 @@ def test_nan_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_nan_non_float(constructor: Constructor) -> None: +def test_nan_non_float(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) from polars.exceptions import InvalidOperationError as PlInvalidOperationError from pyarrow.lib import ArrowNotImplementedError diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py index b44878886..3e9259c03 100644 --- a/tests/expr_and_series/is_unique_test.py +++ b/tests/expr_and_series/is_unique_test.py @@ -1,12 +1,16 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_is_unique_expr(constructor: Constructor) -> None: +def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [1, 1, 2], "b": [1, 2, 3], @@ -22,7 +26,11 @@ def test_is_unique_expr(constructor: Constructor) -> None: assert_equal_data(result, expected) -def 
test_is_unique_w_nulls_expr(constructor: Constructor) -> None: +def test_is_unique_w_nulls_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [None, 1, 2], "b": [None, 2, None], diff --git a/tests/expr_and_series/lit_test.py b/tests/expr_and_series/lit_test.py index 501bfc4bd..505d99bf8 100644 --- a/tests/expr_and_series/lit_test.py +++ b/tests/expr_and_series/lit_test.py @@ -87,6 +87,13 @@ def test_lit_operation( expected_result: list[int], request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor) and col_name in ( + "left_scalar_with_agg", + "left_lit_with_agg", + "right_lit", + "right_lit_with_agg", + ): + request.applymarker(pytest.mark.xfail) if ( "dask" in str(constructor) and col_name in ("left_lit", "left_scalar") diff --git a/tests/expr_and_series/mean_horizontal_test.py b/tests/expr_and_series/mean_horizontal_test.py index 485bf1750..c1652c837 100644 --- a/tests/expr_and_series/mean_horizontal_test.py +++ b/tests/expr_and_series/mean_horizontal_test.py @@ -10,7 +10,11 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) -def test_meanh(constructor: Constructor, col_expr: Any) -> None: +def test_meanh( + constructor: Constructor, col_expr: Any, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, None, None], "b": [4, None, 6, None]} df = nw.from_native(constructor(data)) result = df.select(horizontal_mean=nw.mean_horizontal(col_expr, nw.col("b"))) @@ -18,7 +22,9 @@ def test_meanh(constructor: Constructor, col_expr: Any) -> None: assert_equal_data(result, expected) -def test_meanh_all(constructor: Constructor) -> None: +def test_meanh_all(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [2, 4, 6], "b": [10, 20, 30]} df = 
nw.from_native(constructor(data)) result = df.select(nw.mean_horizontal(nw.all())) diff --git a/tests/expr_and_series/median_test.py b/tests/expr_and_series/median_test.py index 7c50988dc..b0b6edcba 100644 --- a/tests/expr_and_series/median_test.py +++ b/tests/expr_and_series/median_test.py @@ -41,16 +41,17 @@ def test_median_series( @pytest.mark.parametrize("expr", [nw.col("s").median(), nw.median("s")]) def test_median_expr_raises_on_str( - constructor: Constructor, - expr: nw.Expr, + constructor: Constructor, expr: nw.Expr, request: pytest.FixtureRequest ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) from polars.exceptions import InvalidOperationError as PlInvalidOperationError df = nw.from_native(constructor(data)) - if "polars_lazy" in str(constructor): + if isinstance(df, nw.LazyFrame): with pytest.raises( - PlInvalidOperationError, - match="`median` operation not supported for dtype `str`", + (InvalidOperationError, PlInvalidOperationError), + match="`median` operation not supported", ): df.select(expr).lazy().collect() else: diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py index 90bffb04b..d8e4d9b77 100644 --- a/tests/expr_and_series/n_unique_test.py +++ b/tests/expr_and_series/n_unique_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,9 @@ } -def test_n_unique(constructor: Constructor) -> None: +def test_n_unique(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().n_unique()) expected = {"a": [3], "b": [4]} diff --git a/tests/expr_and_series/name/to_uppercase_test.py b/tests/expr_and_series/name/to_uppercase_test.py index 785da4957..e6703212d 100644 --- 
a/tests/expr_and_series/name/to_uppercase_test.py +++ b/tests/expr_and_series/name/to_uppercase_test.py @@ -12,21 +12,31 @@ data = {"foo": [1, 2, 3], "BAR": [4, 5, 6]} -def test_to_uppercase(constructor: Constructor) -> None: +def test_to_uppercase(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.to_uppercase()) expected = {k.upper(): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_to_uppercase_after_alias(constructor: Constructor) -> None: +def test_to_uppercase_after_alias( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.to_uppercase()) expected = {"FOO": data["foo"]} assert_equal_data(result, expected) -def test_to_uppercase_raise_anonymous(constructor: Constructor) -> None: +def test_to_uppercase_raise_anonymous( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/nth_test.py b/tests/expr_and_series/nth_test.py index 8179fb261..4dd453528 100644 --- a/tests/expr_and_series/nth_test.py +++ b/tests/expr_and_series/nth_test.py @@ -25,6 +25,8 @@ def test_nth( expected: dict[str, list[int]], request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (1, 0, 0): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git 
a/tests/expr_and_series/null_count_test.py b/tests/expr_and_series/null_count_test.py index 0f2250713..d10258901 100644 --- a/tests/expr_and_series/null_count_test.py +++ b/tests/expr_and_series/null_count_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,11 @@ } -def test_null_count_expr(constructor: Constructor) -> None: +def test_null_count_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().null_count()) expected = { diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index a67c7973b..f42bdca54 100644 --- a/tests/expr_and_series/over_test.py +++ b/tests/expr_and_series/over_test.py @@ -24,6 +24,8 @@ def test_over_single(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = { @@ -40,6 +42,8 @@ def test_over_single(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_multiple(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = { @@ -56,6 +60,8 @@ def test_over_multiple(request: pytest.FixtureRequest, constructor: Constructor) def test_over_invalid(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "polars" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = 
nw.from_native(constructor(data)) with pytest.raises(ValueError, match="Anonymous expressions"): @@ -67,6 +73,8 @@ def test_over_cumsum(request: pytest.FixtureRequest, constructor: Constructor) - request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -84,6 +92,8 @@ def test_over_cumsum(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cumcount(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -101,10 +111,12 @@ def test_over_cumcount(request: pytest.FixtureRequest, constructor: Constructor) def test_over_cummax(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2", "duckdb")): request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { "a": ["a", "a", "b", "b", "b"], @@ -120,9 +132,10 @@ def test_over_cummax(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cummin(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + 
if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -138,11 +151,12 @@ def test_over_cummin(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cumprod(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2")): + if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2", "duckdb")): request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -170,6 +184,8 @@ def test_over_shift(request: pytest.FixtureRequest, constructor: Constructor) -> constructor ) or "dask_lazy_p2_constructor" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = { diff --git a/tests/expr_and_series/quantile_test.py b/tests/expr_and_series/quantile_test.py index ae707e739..d52fae16c 100644 --- a/tests/expr_and_series/quantile_test.py +++ b/tests/expr_and_series/quantile_test.py @@ -28,7 +28,10 @@ def test_quantile_expr( expected: dict[str, list[float]], request: pytest.FixtureRequest, ) -> None: - if "dask" in str(constructor) and interpolation != "linear": + if ( + any(x in str(constructor) for x in ("dask", "duckdb")) + and interpolation != "linear" + ): request.applymarker(pytest.mark.xfail) q = 0.3 diff --git a/tests/expr_and_series/reduction_test.py b/tests/expr_and_series/reduction_test.py index 3b579d9f3..4f2faa0ce 100644 --- a/tests/expr_and_series/reduction_test.py +++ b/tests/expr_and_series/reduction_test.py @@ -30,6 +30,9 @@ def test_scalar_reduction_select( constructor: Constructor, expr: list[Any], expected: dict[str, list[Any]] ) -> 
None: + if "duckdb" in str(constructor): + # First one passes, the others fail. + return data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = nw.from_native(constructor(data)) result = df.select(*expr) @@ -54,15 +57,24 @@ def test_scalar_reduction_select( ids=range(5), ) def test_scalar_reduction_with_columns( - constructor: Constructor, expr: list[Any], expected: dict[str, list[Any]] + constructor: Constructor, + expr: list[Any], + expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = nw.from_native(constructor(data)) result = df.with_columns(*expr).select(*expected.keys()) assert_equal_data(result, expected) -def test_empty_scalar_reduction_select(constructor: Constructor) -> None: +def test_empty_scalar_reduction_select( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "str": [*"abcde"], "int": [0, 1, 2, 3, 4], @@ -91,7 +103,11 @@ def test_empty_scalar_reduction_select(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_empty_scalar_reduction_with_columns(constructor: Constructor) -> None: +def test_empty_scalar_reduction_with_columns( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) from itertools import chain data = { diff --git a/tests/expr_and_series/replace_strict_test.py b/tests/expr_and_series/replace_strict_test.py index b1449af24..07e349bc6 100644 --- a/tests/expr_and_series/replace_strict_test.py +++ b/tests/expr_and_series/replace_strict_test.py @@ -23,6 +23,8 @@ def test_replace_strict( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 
3]})) result = df.select( nw.col("a").replace_strict( @@ -58,6 +60,8 @@ def test_replace_non_full( if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) if isinstance(df, nw.LazyFrame): with pytest.raises((ValueError, PolarsError)): @@ -77,6 +81,8 @@ def test_replace_strict_mapping( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) result = df.select( diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py index 94367d1e1..eed90feb1 100644 --- a/tests/expr_and_series/replace_time_zone_test.py +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -26,6 +26,7 @@ def test_replace_time_zone( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -52,6 +53,8 @@ def test_replace_time_zone_none( or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) + or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { diff --git a/tests/expr_and_series/shift_test.py b/tests/expr_and_series/shift_test.py index 379f40986..07f5d2b58 100644 --- a/tests/expr_and_series/shift_test.py +++ b/tests/expr_and_series/shift_test.py @@ -1,6 +1,7 @@ from __future__ import annotations import pyarrow as pa +import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor @@ -15,7 +16,9 @@ } -def test_shift(constructor: 
Constructor) -> None: +def test_shift(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.with_columns(nw.col("a", "b", "c").shift(2)).filter(nw.col("i") > 1) expected = { diff --git a/tests/expr_and_series/std_test.py b/tests/expr_and_series/std_test.py index b83100801..f2eabf4f2 100644 --- a/tests/expr_and_series/std_test.py +++ b/tests/expr_and_series/std_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise + import pytest import narwhals.stable.v1 as nw @@ -24,10 +26,27 @@ def test_std(constructor: Constructor, input_data: dict[str, list[float | None]] result = df.select( nw.col("a").std(ddof=1).alias("a_ddof_1"), nw.col("a").std(ddof=0).alias("a_ddof_0"), - nw.col("b").std(ddof=2).alias("b_ddof_2"), nw.col("z").std(ddof=0).alias("z_ddof_0"), ) + expected_results = { + "a_ddof_1": [1.0], + "a_ddof_0": [0.816497], + "z_ddof_0": [0.816497], + } assert_equal_data(result, expected_results) + context = ( + pytest.raises(NotImplementedError) + if "duckdb" in str(constructor) + else does_not_raise() + ) + with context: + result = df.select( + nw.col("b").std(ddof=2).alias("b_ddof_2"), + ) + expected_results = { + "b_ddof_2": [1.632993], + } + assert_equal_data(result, expected_results) @pytest.mark.parametrize("input_data", [data, data_with_nulls]) diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py index f9c63e01c..1a318801a 100644 --- a/tests/expr_and_series/str/len_chars_test.py +++ b/tests/expr_and_series/str/len_chars_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -8,7 +10,9 @@ data = {"a": ["foo", "foobar", "Café", "345", "東京"]} -def test_str_len_chars(constructor: 
Constructor) -> None: +def test_str_len_chars(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.len_chars()) expected = { diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py index ffd8fce2e..7d57eeb7d 100644 --- a/tests/expr_and_series/str/replace_test.py +++ b/tests/expr_and_series/str/replace_test.py @@ -93,6 +93,7 @@ def test_str_replace_all_series( ) def test_str_replace_expr( constructor: Constructor, + request: pytest.FixtureRequest, data: dict[str, list[str]], pattern: str, value: str, @@ -100,8 +101,9 @@ def test_str_replace_expr( literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result_df = df.select( nw.col("a").str.replace(pattern=pattern, value=value, n=n, literal=literal) ) @@ -114,14 +116,16 @@ def test_str_replace_expr( ) def test_str_replace_all_expr( constructor: Constructor, + request: pytest.FixtureRequest, data: dict[str, list[str]], pattern: str, value: str, literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: + if "duckdb" in str(constructor) and literal is False: + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select( nw.col("a").str.replace_all(pattern=pattern, value=value, literal=literal) ) diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 388ef23db..3f8df65a7 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -17,7 +17,9 @@ data = {"a": ["2020-01-01T12:34:56"]} -def test_to_datetime(constructor: Constructor) -> None: +def test_to_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> 
None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "cudf" in str(constructor): expected = "2020-01-01T12:34:56.000000000" else: @@ -78,6 +80,8 @@ def test_to_datetime_infer_fmt( request.applymarker(pytest.mark.xfail) if "cudf" in str(constructor): expected = expected_cudf + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) result = ( nw.from_native(constructor(data)) .lazy() @@ -126,7 +130,11 @@ def test_to_datetime_series_infer_fmt( assert str(result) == expected -def test_to_datetime_infer_fmt_from_date(constructor: Constructor) -> None: +def test_to_datetime_infer_fmt_from_date( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"z": ["2020-01-01", "2020-01-02", None]} expected = [datetime(2020, 1, 1), datetime(2020, 1, 2), None] result = ( diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index 1d0eb8834..1057b33de 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -39,6 +39,7 @@ def test_str_to_uppercase( "pandas_pyarrow_constructor", "pyarrow_table_constructor", "modin_pyarrow_constructor", + "duckdb_lazy_constructor", ) or ("dask" in str(constructor) and PYARROW_VERSION >= (12,)) ): @@ -80,6 +81,7 @@ def test_str_to_uppercase_series( "pandas_nullable_constructor", "polars_eager_constructor", "cudf_constructor", + "duckdb_lazy_constructor", "modin_constructor", ) ): diff --git a/tests/expr_and_series/sum_horizontal_test.py b/tests/expr_and_series/sum_horizontal_test.py index 21bd138c2..decb65c02 100644 --- a/tests/expr_and_series/sum_horizontal_test.py +++ b/tests/expr_and_series/sum_horizontal_test.py @@ -10,7 +10,11 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) -def test_sumh(constructor: Constructor, col_expr: 
Any) -> None: +def test_sumh( + constructor: Constructor, col_expr: Any, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) result = df.with_columns(horizontal_sum=nw.sum_horizontal(col_expr, nw.col("b"))) @@ -23,7 +27,9 @@ def test_sumh(constructor: Constructor, col_expr: Any) -> None: assert_equal_data(result, expected) -def test_sumh_nullable(constructor: Constructor) -> None: +def test_sumh_nullable(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 8, 3], "b": [4, 5, None]} expected = {"hsum": [5, 13, 3]} @@ -32,7 +38,9 @@ def test_sumh_nullable(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_sumh_all(constructor: Constructor) -> None: +def test_sumh_all(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3], "b": [10, 20, 30]} df = nw.from_native(constructor(data)) result = df.select(nw.sum_horizontal(nw.all())) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index f2f9c33ff..9ee38a230 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -10,7 +10,9 @@ from tests.utils import assert_equal_data -def test_unary(constructor: Constructor) -> None: +def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [1, 3, 2], "b": [4, 4, 6], @@ -77,7 +79,11 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_unary_two_elements(constructor: Constructor) -> None: +def test_unary_two_elements( + constructor: 
Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} result = nw.from_native(constructor(data)).select( a_nunique=nw.col("a").n_unique(), @@ -120,7 +126,11 @@ def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_unary_one_element(constructor: Constructor) -> None: +def test_unary_one_element( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1], "b": [2], "c": [None]} # Dask runs into a divide by zero RuntimeWarning for 1 element skew. context = ( diff --git a/tests/expr_and_series/var_test.py b/tests/expr_and_series/var_test.py index bab97d383..2053dfe69 100644 --- a/tests/expr_and_series/var_test.py +++ b/tests/expr_and_series/var_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise + import pytest import narwhals.stable.v1 as nw @@ -24,10 +26,27 @@ def test_var(constructor: Constructor, input_data: dict[str, list[float | None]] result = df.select( nw.col("a").var(ddof=1).alias("a_ddof_1"), nw.col("a").var(ddof=0).alias("a_ddof_0"), - nw.col("b").var(ddof=2).alias("b_ddof_2"), nw.col("z").var(ddof=0).alias("z_ddof_0"), ) + expected_results = { + "a_ddof_1": [1.0], + "a_ddof_0": [0.6666666666666666], + "z_ddof_0": [0.6666666666666666], + } assert_equal_data(result, expected_results) + context = ( + pytest.raises(NotImplementedError) + if "duckdb" in str(constructor) + else does_not_raise() + ) + with context: + result = df.select( + nw.col("b").var(ddof=2).alias("b_ddof_2"), + ) + expected_results = { + "b_ddof_2": [2.666666666666667], + } + assert_equal_data(result, expected_results) @pytest.mark.parametrize("input_data", [data, data_with_nulls]) diff --git a/tests/expr_and_series/when_test.py 
b/tests/expr_and_series/when_test.py index 5c60febb4..b59dda488 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -17,7 +17,9 @@ } -def test_when(constructor: Constructor) -> None: +def test_when(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(value=3).alias("a_when")) expected = { @@ -26,7 +28,9 @@ def test_when(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_when_otherwise(constructor: Constructor) -> None: +def test_when_otherwise(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(3).otherwise(6).alias("a_when")) expected = { @@ -35,7 +39,11 @@ def test_when_otherwise(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_multiple_conditions(constructor: Constructor) -> None: +def test_multiple_conditions( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") < 3, nw.col("c") < 5.0).then(3).alias("a_when") @@ -46,7 +54,11 @@ def test_multiple_conditions(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_no_arg_when_fail(constructor: Constructor) -> None: +def test_no_arg_when_fail( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) with pytest.raises((TypeError, ValueError)): df.select(nw.when().then(value=3).alias("a_when")) @@ -77,7 +89,11 @@ def 
test_value_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_value_expression(constructor: Constructor) -> None: +def test_value_expression( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(nw.col("a") + 9).alias("a_when")) expected = { @@ -110,7 +126,11 @@ def test_otherwise_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_otherwise_expression(constructor: Constructor) -> None: +def test_otherwise_expression( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") == 1).then(-1).otherwise(nw.col("a") + 7).alias("a_when") @@ -121,14 +141,22 @@ def test_otherwise_expression(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_when_then_otherwise_into_expr(constructor: Constructor) -> None: +def test_when_then_otherwise_into_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") > 1).then("c").otherwise("e")) expected = {"c": [7, 5, 6]} assert_equal_data(result, expected) -def test_when_then_otherwise_lit_str(constructor: Constructor) -> None: +def test_when_then_otherwise_lit_str( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") > 1).then(nw.col("b")).otherwise(nw.lit("z"))) expected = {"b": ["z", "b", "c"]} diff --git a/tests/frame/add_test.py 
b/tests/frame/add_test.py index 27a332ed0..e04561895 100644 --- a/tests/frame/add_test.py +++ b/tests/frame/add_test.py @@ -1,11 +1,15 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import assert_equal_data -def test_add(constructor: Constructor) -> None: +def test_add(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) result = df.with_columns( diff --git a/tests/frame/clone_test.py b/tests/frame/clone_test.py index 1a02910c8..e142ed0a7 100644 --- a/tests/frame/clone_test.py +++ b/tests/frame/clone_test.py @@ -10,6 +10,8 @@ def test_clone(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index 26bbd2e62..4d5f3ebc9 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -7,7 +7,11 @@ from tests.utils import assert_equal_data -def test_concat_horizontal(constructor: Constructor) -> None: +def test_concat_horizontal( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_left = nw.from_native(constructor(data)).lazy() @@ -56,7 +60,11 @@ def test_concat_vertical(constructor: Constructor) -> None: nw.concat([df_left, df_left.select("d")], how="vertical").collect() -def test_concat_diagonal(constructor: Constructor) -> None: +def test_concat_diagonal( + constructor: Constructor, request: pytest.FixtureRequest +) -> 
None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data_1 = {"a": [1, 3], "b": [4, 6]} data_2 = {"a": [100, 200], "z": ["x", "y"]} expected = { diff --git a/tests/frame/drop_nulls_test.py b/tests/frame/drop_nulls_test.py index bb55439eb..368ad6ba0 100644 --- a/tests/frame/drop_nulls_test.py +++ b/tests/frame/drop_nulls_test.py @@ -12,7 +12,9 @@ } -def test_drop_nulls(constructor: Constructor) -> None: +def test_drop_nulls(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) result = nw.from_native(constructor(data)).drop_nulls() expected = { "a": [2.0, 4.0], @@ -30,7 +32,12 @@ def test_drop_nulls(constructor: Constructor) -> None: ], ) def test_drop_nulls_subset( - constructor: Constructor, subset: str | list[str], expected: dict[str, float] + constructor: Constructor, + subset: str | list[str], + expected: dict[str, float], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) result = nw.from_native(constructor(data)).drop_nulls(subset=subset) assert_equal_data(result, expected) diff --git a/tests/frame/explode_test.py b/tests/frame/explode_test.py index 631da0255..b79215a18 100644 --- a/tests/frame/explode_test.py +++ b/tests/frame/explode_test.py @@ -40,7 +40,7 @@ def test_explode_single_col( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb") ): request.applymarker(pytest.mark.xfail) @@ -89,7 +89,7 @@ def test_explode_multiple_cols( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb") ): request.applymarker(pytest.mark.xfail) @@ -110,7 +110,7 @@ def test_explode_shape_error( ) -> None: if any( backend in str(constructor) - for 
backend in ("dask", "modin", "cudf", "pyarrow_table") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb") ): request.applymarker(pytest.mark.xfail) @@ -133,7 +133,7 @@ def test_explode_shape_error( def test_explode_invalid_operation_error( request: pytest.FixtureRequest, constructor: Constructor ) -> None: - if "dask" in str(constructor) or "pyarrow_table" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "dask", "duckdb")): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 6): diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py index b55ab7767..759d175ca 100644 --- a/tests/frame/filter_test.py +++ b/tests/frame/filter_test.py @@ -17,7 +17,11 @@ def test_filter(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_filter_with_boolean_list(constructor: Constructor) -> None: +def test_filter_with_boolean_list( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) context = ( diff --git a/tests/frame/gather_every_test.py b/tests/frame/gather_every_test.py index 671737ad1..40e9291de 100644 --- a/tests/frame/gather_every_test.py +++ b/tests/frame/gather_every_test.py @@ -11,7 +11,11 @@ @pytest.mark.parametrize("n", [1, 2, 3]) @pytest.mark.parametrize("offset", [1, 2, 3]) -def test_gather_every(constructor: Constructor, n: int, offset: int) -> None: +def test_gather_every( + constructor: Constructor, n: int, offset: int, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.gather_every(n=n, offset=offset) expected = {"a": data["a"][offset::n]} diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 
faeac5b2f..4aa68e571 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -74,7 +74,9 @@ def test_inner_join_single_key(constructor: Constructor) -> None: assert_equal_data(result_on, expected) -def test_cross_join(constructor: Constructor) -> None: +def test_cross_join(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) result = df.join(df, how="cross").sort("antananarivo", "antananarivo_right") # type: ignore[arg-type] @@ -112,7 +114,11 @@ def test_suffix(constructor: Constructor, how: str, suffix: str) -> None: @pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) -def test_cross_join_suffix(constructor: Constructor, suffix: str) -> None: +def test_cross_join_suffix( + constructor: Constructor, suffix: str, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) result = df.join(df, how="cross", suffix=suffix).sort( # type: ignore[arg-type] @@ -159,7 +165,10 @@ def test_anti_join( join_key: list[str], filter_expr: nw.Expr, expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) @@ -197,7 +206,10 @@ def test_semi_join( join_key: list[str], filter_expr: nw.Expr, expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) @@ -355,7 +367,7 @@ def test_joinasof_numeric( 
constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table" in str(constructor) or "cudf" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) @@ -414,7 +426,7 @@ def test_joinasof_time( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table" in str(constructor) or "cudf" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ("pandas_pyarrow" in str(constructor)): request.applymarker(pytest.mark.xfail) @@ -495,7 +507,7 @@ def test_joinasof_by( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table" in str(constructor) or "cudf" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index d85697249..9d601e468 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -27,7 +27,9 @@ def test_select(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_empty_select(constructor: Constructor) -> None: +def test_empty_select(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) result = nw.from_native(constructor({"a": [1, 2, 3]})).lazy().select() assert result.collect().shape == (0, 0) @@ -75,7 +77,11 @@ def test_comparison_with_list_error_message() -> None: nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3] # noqa: B015 -def 
test_missing_columns(constructor: Constructor) -> None: +def test_missing_columns( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) selected_columns = ["a", "e", "f"] @@ -120,6 +126,8 @@ def test_left_to_right_broadcasting( ) -> None: if "dask" in str(constructor) and DASK_VERSION < (2024, 10): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 1, 2], "b": [4, 5, 6]})) result = df.select(nw.col("a") + nw.col("b").sum()) expected = {"a": [16, 16, 17]} diff --git a/tests/frame/unique_test.py b/tests/frame/unique_test.py index 96d5a8c2d..ca34d29b4 100644 --- a/tests/frame/unique_test.py +++ b/tests/frame/unique_test.py @@ -5,7 +5,10 @@ import pytest -import narwhals.stable.v1 as nw +# We use nw instead of nw.stable.v1 to ensure that DuckDBPyRelation +# becomes LazyFrame instead of DataFrame +import narwhals as nw +from narwhals.exceptions import ColumnNotFoundError from tests.utils import Constructor from tests.utils import assert_equal_data @@ -31,7 +34,10 @@ def test_unique( ) -> None: df_raw = constructor(data) df = nw.from_native(df_raw) - if isinstance(df, nw.LazyFrame) and keep in {"first", "last"}: + if isinstance(df, nw.LazyFrame) and keep in { + "first", + "last", + }: context: Any = pytest.raises(ValueError, match="row order") elif keep == "foo": context = pytest.raises(ValueError, match=": foo") @@ -43,6 +49,13 @@ def test_unique( assert_equal_data(result, expected) +def test_unique_invalid_subset(constructor: Constructor) -> None: + df_raw = constructor(data) + df = nw.from_native(df_raw) + with pytest.raises(ColumnNotFoundError): + df.lazy().unique(["fdssfad"]).collect() + + @pytest.mark.filterwarnings("ignore:.*backwards-compatibility:UserWarning") def 
test_unique_none(constructor: Constructor) -> None: df_raw = constructor(data) diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index ad7eefe5b..2867720a7 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -37,9 +37,7 @@ [("b", expected_b_only), (["b", "c"], expected_b_c), (None, expected_b_c)], ) def test_unpivot_on( - constructor: Constructor, - on: str | list[str] | None, - expected: dict[str, list[float]], + constructor: Constructor, on: str | list[str] | None, expected: dict[str, list[float]] ) -> None: df = nw.from_native(constructor(data)) result = df.unpivot(on=on, index=["a"]).sort("variable", "a") diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index c05a41646..335c53896 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -52,6 +52,8 @@ def test_with_columns_dtypes_single_row( ) -> None: if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": ["foo"]} df = nw.from_native(constructor(data)).with_columns(nw.col("a").cast(nw.Categorical)) result = df.with_columns(nw.col("a")) diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py index e19d3c994..bc514fa70 100644 --- a/tests/frame/with_row_index_test.py +++ b/tests/frame/with_row_index_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import assert_equal_data @@ -10,7 +12,9 @@ } -def test_with_row_index(constructor: Constructor) -> None: +def test_with_row_index(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) result = nw.from_native(constructor(data)).with_row_index() expected = {"index": [0, 1], "a": ["foo", "bars"], "ab": 
["foo", "bars"]} assert_equal_data(result, expected) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 22c3b6f19..0dd6d8a10 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -115,6 +115,8 @@ def test_group_by_depth_1_agg( expected: dict[str, list[int | float]], request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor) and attr == "n_unique": + request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and attr == "var" and PANDAS_VERSION < (2, 1): # Known issue with variance calculation in pandas 2.0.x with pyarrow backend in groupby operations" request.applymarker(pytest.mark.xfail) @@ -134,10 +136,10 @@ def test_group_by_depth_1_agg( ], ) def test_group_by_depth_1_std_var( - constructor: Constructor, - attr: str, - ddof: int, + constructor: Constructor, attr: str, ddof: int, request: pytest.FixtureRequest ) -> None: + if "duckdb" in str(constructor) and ddof == 2: + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 1, 2, 2, 2], "b": [4, 5, 6, 0, 5, 5]} _pow = 0.5 if attr == "std" else 1 expected = { @@ -164,7 +166,11 @@ def test_group_by_median(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_group_by_n_unique_w_missing(constructor: Constructor) -> None: +def test_group_by_n_unique_w_missing( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [4, None, 5], "c": [None, None, 7], "d": [1, 1, 3]} result = ( nw.from_native(constructor(data)) @@ -288,8 +294,10 @@ def test_key_with_nulls( def test_key_with_nulls_ignored( - constructor: Constructor, + constructor: Constructor, request: pytest.FixtureRequest ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"b": [4, 5, None], "a": [1, 2, 3]} result = ( nw.from_native(constructor(data)) @@ -341,6 +349,8 @@ def test_group_by_categorical( 
constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < ( 15, 0, @@ -366,6 +376,8 @@ def test_group_by_categorical( def test_group_by_shift_raises( constructor: Constructor, request: pytest.FixtureRequest ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor): # Polars supports all kinds of crazy group-by aggregations, so # we don't check that it errors here. @@ -406,6 +418,8 @@ def test_all_kind_of_aggs( # and modin lol https://github.com/modin-project/modin/issues/7414 # and cudf https://github.com/rapidsai/cudf/issues/17649 request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (1, 4): # Bug in old pandas, can't do DataFrameGroupBy[['b', 'b']] request.applymarker(pytest.mark.xfail) diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 86bdbac53..103ea666d 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -23,28 +23,36 @@ } -def test_selectors(constructor: Constructor) -> None: +def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) -def test_numeric(constructor: Constructor) -> None: +def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(numeric() + 1) expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]} assert_equal_data(result, expected) -def 
test_boolean(constructor: Constructor) -> None: +def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(boolean()) expected = {"d": [True, False, True]} assert_equal_data(result, expected) -def test_string(constructor: Constructor) -> None: +def test_string(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(string()) expected = {"b": ["a", "b", "c"]} @@ -59,6 +67,8 @@ def test_categorical( 15, ): # pragma: no cover request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) expected = {"b": ["a", "b", "c"]} df = nw.from_native(constructor(data)).with_columns(nw.col("b").cast(nw.Categorical)) @@ -81,15 +91,24 @@ def test_categorical( ], ) def test_set_ops( - constructor: Constructor, selector: nw.selectors.Selector, expected: list[str] + constructor: Constructor, + selector: nw.selectors.Selector, + expected: list[str], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(selector).collect_schema().names() assert sorted(result) == expected @pytest.mark.parametrize("invalid_constructor", [pd.DataFrame, pa.table]) -def test_set_ops_invalid(invalid_constructor: Constructor) -> None: +def test_set_ops_invalid( + invalid_constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(invalid_constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(invalid_constructor(data)) with pytest.raises((NotImplementedError, ValueError)): df.select(1 - numeric()) diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py index fd08f575c..c3d028563 
100644 --- a/tests/stable_api_test.py +++ b/tests/stable_api_test.py @@ -13,7 +13,11 @@ from tests.utils import assert_equal_data -def test_renamed_taxicab_norm(constructor: Constructor) -> None: +def test_renamed_taxicab_norm( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) # Suppose we need to rename `_l1_norm` to `_taxicab_norm`. # We need `narwhals.stable.v1` to stay stable. So, we # make the change in `narwhals`, and then add the new method diff --git a/tests/utils.py b/tests/utils.py index 34f1bfa1e..005b4eee2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -11,6 +11,7 @@ import pandas as pd +from narwhals.translate import from_native from narwhals.typing import IntoDataFrame from narwhals.typing import IntoFrame from narwhals.utils import Implementation @@ -72,7 +73,12 @@ def assert_equal_data(result: Any, expected: dict[str, Any]) -> None: hasattr(result, "_compliant_frame") and result.implementation is Implementation.PYSPARK ) - + is_duckdb = ( + hasattr(result, "_compliant_frame") + and result._compliant_frame._implementation is Implementation.DUCKDB + ) + if is_duckdb: + result = from_native(result.to_native().arrow()) if hasattr(result, "collect"): if result.implementation is Implementation.POLARS and os.environ.get( "NARWHALS_POLARS_GPU", False diff --git a/tpch/execute.py b/tpch/execute.py index fb5982c10..e19b51dfb 100644 --- a/tpch/execute.py +++ b/tpch/execute.py @@ -13,6 +13,7 @@ pd.options.mode.copy_on_write = True pd.options.future.infer_string = True +pl.Config.set_fmt_float("full") DATA_DIR = Path("data") LINEITEM_PATH = DATA_DIR / "lineitem.parquet" @@ -92,7 +93,7 @@ def execute_query(query_id: str) -> None: print(f"\nRunning {query_id} with {backend=}") # noqa: T201 result = query_module.query( *( - nw.scan_parquet(path, native_namespace=native_namespace, **kwargs) + nw.scan_parquet(str(path), native_namespace=native_namespace, **kwargs) 
for path in data_paths ) ) diff --git a/utils/import_check.py b/utils/import_check.py index eee35dfc4..bac54aff7 100644 --- a/utils/import_check.py +++ b/utils/import_check.py @@ -23,6 +23,7 @@ "_arrow": {"pyarrow", "pyarrow.compute", "pyarrow.parquet"}, "_dask": {"dask.dataframe", "pandas", "dask_expr"}, "_polars": {"polars"}, + "_duckdb": {"duckdb"}, } @@ -63,6 +64,7 @@ def visit_ImportFrom(self, node: ast.ImportFrom) -> None: # noqa: N802 if ( node.module in BANNED_IMPORTS and "# ignore-banned-import" not in self.lines[node.lineno - 1] + and node.module not in self.allowed_imports ): print( # noqa: T201 f"{self.file_name}:{node.lineno}:{node.col_offset}: found {node.module} import" From 827718b2d83953d58bd16990b2168b11f920dfe6 Mon Sep 17 00:00:00 2001 From: Magdalena Kowalczuk <74981211+anopsy@users.noreply.github.com> Date: Mon, 6 Jan 2025 13:11:59 +0100 Subject: [PATCH 10/35] release: Bump version to 1.21.0 (#1738) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index a406b6295..a9d192515 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute: ```python >>> import narwhals >>> narwhals.__version__ -'1.20.1' +'1.21.0' ``` If you see the version number, then the installation was successful! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 646a6a769..7c622fca6 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -79,7 +79,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.20.1" +__version__ = "1.21.0" __all__ = [ "Array", diff --git a/pyproject.toml b/pyproject.toml index 6c33c09bb..c16407d80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.20.1" +version = "1.21.0" dependencies = [] requires-python = ">=3.8" authors = [ From 17546f28aefde385700ee2e99dd3da93d7824850 Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Mon, 6 Jan 2025 18:51:36 +0100 Subject: [PATCH 11/35] docs: update api-completeness with `duckdb` (#1740) --- docs/basics/dataframe_conversion.md | 4 +++- utils/generate_backend_completeness.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/basics/dataframe_conversion.md b/docs/basics/dataframe_conversion.md index a4753a033..bdf803a23 100644 --- a/docs/basics/dataframe_conversion.md +++ b/docs/basics/dataframe_conversion.md @@ -53,7 +53,9 @@ which implements `__arrow_c_stream__`: def df_to_polars(df_native: Any) -> pl.DataFrame: if hasattr(df_native, "__arrow_c_stream__"): return nw.from_arrow(df_native, native_namespace=pl).to_native() - msg = f"Expected object which implements '__arrow_c_stream__' got: {type(df)}" + msg = ( + f"Expected object which implements '__arrow_c_stream__' got: {type(df_native)}" + ) raise TypeError(msg) diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py index 2ab8d2187..397c8e4d6 100644 --- a/utils/generate_backend_completeness.py +++ b/utils/generate_backend_completeness.py @@ -31,9 +31,10 @@ class Backend(NamedTuple): MODULES = ["dataframe", "series", "expr"] BACKENDS = [ - Backend(name="pandas-like", 
module="_pandas_like", type_=BackendType.EAGER), Backend(name="arrow", module="_arrow", type_=BackendType.EAGER), Backend(name="dask", module="_dask", type_=BackendType.LAZY), + Backend(name="duckdb", module="_duckdb", type_=BackendType.LAZY), + Backend(name="pandas-like", module="_pandas_like", type_=BackendType.EAGER), Backend(name="spark-like", module="_spark_like", type_=BackendType.LAZY), ] @@ -55,6 +56,7 @@ def parse_module(module_name: str, backend: str, nw_class_name: str) -> list[str inspect.isclass(c) and c.__name__.endswith(nw_class_name) and not c.__name__.startswith("Compliant") # Exclude protocols + and not c.__name__.startswith("DuckDBInterchange") ), ) From 3e42edd51e4e3f28cc62a4282c4e69e5fdfcd8ea Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Tue, 7 Jan 2025 08:58:18 +0100 Subject: [PATCH 12/35] chore: increase PySpark min version to `3.5.0` (#1744) --- .github/workflows/extremes.yml | 8 ++++---- narwhals/_spark_like/expr.py | 14 ++------------ narwhals/_spark_like/group_by.py | 13 +++---------- narwhals/_spark_like/utils.py | 18 ++++-------------- pyproject.toml | 2 +- 5 files changed, 14 insertions(+), 41 deletions(-) diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 9e7e997b2..47ebc85ea 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -61,7 +61,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-pretty-old-versions - run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.3.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system + run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system - name: install-reqs run: uv pip install -e ".[dev]" --system - 
name: show-deps @@ -75,7 +75,7 @@ jobs: echo "$DEPS" | grep 'polars==0.20.3' echo "$DEPS" | grep 'numpy==1.17.5' echo "$DEPS" | grep 'pyarrow==11.0.0' - echo "$DEPS" | grep 'pyspark==3.3.0' + echo "$DEPS" | grep 'pyspark==3.5.0' echo "$DEPS" | grep 'scipy==1.5.0' echo "$DEPS" | grep 'scikit-learn==1.1.0' - name: Run pytest @@ -99,7 +99,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-not-so-old-versions - run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system + run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system - name: install-reqs run: uv pip install -e ".[dev]" --system - name: show-deps @@ -111,7 +111,7 @@ jobs: echo "$DEPS" | grep 'polars==0.20.8' echo "$DEPS" | grep 'numpy==1.24.4' echo "$DEPS" | grep 'pyarrow==15.0.0' - echo "$DEPS" | grep 'pyspark==3.4.0' + echo "$DEPS" | grep 'pyspark==3.5.0' echo "$DEPS" | grep 'scipy==1.8.0' echo "$DEPS" | grep 'scikit-learn==1.3.0' echo "$DEPS" | grep 'dask==2024.10' diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index b74aea678..d190b5667 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -225,12 +225,7 @@ def std(self: Self, ddof: int) -> Self: from narwhals._spark_like.utils import _std - func = partial( - _std, - ddof=ddof, - backend_version=self._backend_version, - np_version=parse_version(np.__version__), - ) + func = partial(_std, ddof=ddof, np_version=parse_version(np.__version__)) return self._from_call(func, "std", returns_scalar=True, ddof=ddof) @@ -241,11 +236,6 @@ def var(self: Self, ddof: int) -> Self: from narwhals._spark_like.utils import _var - func = partial( - _var, - 
ddof=ddof, - backend_version=self._backend_version, - np_version=parse_version(np.__version__), - ) + func = partial(_var, ddof=ddof, np_version=parse_version(np.__version__)) return self._from_call(func, "var", returns_scalar=True, ddof=ddof) diff --git a/narwhals/_spark_like/group_by.py b/narwhals/_spark_like/group_by.py index c7cc52bf1..7f3dc077d 100644 --- a/narwhals/_spark_like/group_by.py +++ b/narwhals/_spark_like/group_by.py @@ -79,16 +79,13 @@ def _from_native_frame(self, df: SparkLikeLazyFrame) -> SparkLikeLazyFrame: ) -def get_spark_function( - function_name: str, backend_version: tuple[int, ...], **kwargs: Any -) -> Column: +def get_spark_function(function_name: str, **kwargs: Any) -> Column: if function_name in {"std", "var"}: import numpy as np # ignore-banned-import return partial( _std if function_name == "std" else _var, ddof=kwargs.get("ddof", 1), - backend_version=backend_version, np_version=parse_version(np.__version__), ) from pyspark.sql import functions as F # noqa: N812 @@ -127,9 +124,7 @@ def agg_pyspark( function_name = POLARS_TO_PYSPARK_AGGREGATIONS.get( expr._function_name, expr._function_name ) - agg_func = get_spark_function( - function_name, backend_version=expr._backend_version, **expr._kwargs - ) + agg_func = get_spark_function(function_name, **expr._kwargs) simple_aggregations.update( {output_name: agg_func(keys[0]) for output_name in expr._output_names} ) @@ -146,9 +141,7 @@ def agg_pyspark( pyspark_function = POLARS_TO_PYSPARK_AGGREGATIONS.get( function_name, function_name ) - agg_func = get_spark_function( - pyspark_function, backend_version=expr._backend_version, **expr._kwargs - ) + agg_func = get_spark_function(pyspark_function, **expr._kwargs) simple_aggregations.update( { diff --git a/narwhals/_spark_like/utils.py b/narwhals/_spark_like/utils.py index a3c77033c..fb3a3f3c4 100644 --- a/narwhals/_spark_like/utils.py +++ b/narwhals/_spark_like/utils.py @@ -120,13 +120,8 @@ def maybe_evaluate(df: SparkLikeLazyFrame, obj: Any) 
-> Any: return obj -def _std( - _input: Column | str, - ddof: int, - backend_version: tuple[int, ...], - np_version: tuple[int, ...], -) -> Column: - if backend_version < (3, 5) or np_version > (2, 0): +def _std(_input: Column | str, ddof: int, np_version: tuple[int, ...]) -> Column: + if np_version > (2, 0): from pyspark.sql import functions as F # noqa: N812 if ddof == 1: @@ -142,13 +137,8 @@ def _std( return stddev(input_col, ddof=ddof) -def _var( - _input: Column | str, - ddof: int, - backend_version: tuple[int, ...], - np_version: tuple[int, ...], -) -> Column: - if backend_version < (3, 5) or np_version > (2, 0): +def _var(_input: Column | str, ddof: int, np_version: tuple[int, ...]) -> Column: + if np_version > (2, 0): from pyspark.sql import functions as F # noqa: N812 if ddof == 1: diff --git a/pyproject.toml b/pyproject.toml index c16407d80..bb89564b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ pandas = ["pandas>=0.25.3"] modin = ["modin"] cudf = ["cudf>=24.10.0"] pyarrow = ["pyarrow>=11.0.0"] -pyspark = ["pyspark>=3.3.0"] +pyspark = ["pyspark>=3.5.0"] polars = ["polars>=0.20.3"] dask = ["dask[dataframe]>=2024.8"] duckdb = ["duckdb>=1.0"] From 74dd9db65c886616703dce4865b9549e1f14dc56 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 07:59:10 +0000 Subject: [PATCH 13/35] [pre-commit.ci] pre-commit autoupdate (#1741) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.8.1 → v0.8.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.8.1...v0.8.6) - [github.com/pre-commit/mirrors-mypy: v1.13.0 → v1.14.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.13.0...v1.14.1) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix mypy --------- Co-authored-by: 
pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- narwhals/_arrow/utils.py | 2 +- narwhals/_pandas_like/series.py | 4 ++-- narwhals/_pandas_like/utils.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb149f30b..5e6929f16 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.8.1' + rev: 'v0.8.6' hooks: # Run the formatter. - id: ruff-format @@ -14,7 +14,7 @@ repos: alias: check-docstrings entry: python utils/check_docstrings.py - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: 'v1.14.1' hooks: - id: mypy additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2'] diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 4dbc17a91..ca4852655 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -184,7 +184,7 @@ def broadcast_and_extract_native( rhs = rhs[0] if isinstance(rhs, ArrowDataFrame): - return NotImplemented + return NotImplemented # type: ignore[no-any-return] if isinstance(rhs, ArrowSeries): if len(rhs) == 1: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 8a6779828..60918fd2c 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -300,13 +300,13 @@ def arg_true(self) -> PandasLikeSeries: def arg_min(self) -> int: ser = self._native_series if self._implementation is Implementation.PANDAS and self._backend_version < (1,): - return ser.values.argmin() # type: ignore[no-any-return] # noqa: PD011 + return ser.values.argmin() # type: ignore[no-any-return] return ser.argmin() # type: ignore[no-any-return] def arg_max(self) -> int: ser = self._native_series if self._implementation is Implementation.PANDAS and 
self._backend_version < (1,): - return ser.values.argmax() # type: ignore[no-any-return] # noqa: PD011 + return ser.values.argmax() # type: ignore[no-any-return] return ser.argmax() # type: ignore[no-any-return] # Binary comparisons diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 03f025f78..08d490581 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -118,7 +118,7 @@ def broadcast_align_and_extract_native( lhs_index = lhs._native_series.index if isinstance(rhs, PandasLikeDataFrame): - return NotImplemented + return NotImplemented # type: ignore[no-any-return] if isinstance(rhs, PandasLikeSeries): rhs_index = rhs._native_series.index From 9a62d90557c6ad23700e0effc5b60411a17c7a87 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 09:14:39 +0000 Subject: [PATCH 14/35] chore: filter old pyarrow/pandas warnings we cant do anything about (#1746) --- tests/frame/to_arrow_test.py | 1 + tests/frame/write_parquet_test.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/frame/to_arrow_test.py b/tests/frame/to_arrow_test.py index 3e8c704ea..70913ed15 100644 --- a/tests/frame/to_arrow_test.py +++ b/tests/frame/to_arrow_test.py @@ -12,6 +12,7 @@ from tests.utils import ConstructorEager +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") def test_to_arrow( request: pytest.FixtureRequest, constructor_eager: ConstructorEager, diff --git a/tests/frame/write_parquet_test.py b/tests/frame/write_parquet_test.py index e4b826cfb..670e8c7c9 100644 --- a/tests/frame/write_parquet_test.py +++ b/tests/frame/write_parquet_test.py @@ -14,6 +14,7 @@ @pytest.mark.skipif(PANDAS_VERSION < (2, 0, 0), reason="too old for pyarrow") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") def test_write_parquet( constructor_eager: ConstructorEager, tmpdir: pytest.TempdirFactory, From 5c0a33a0efca0f353be0e3001eef7dbc36a03622 Mon Sep 17 00:00:00 
2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Tue, 7 Jan 2025 10:19:44 +0100 Subject: [PATCH 15/35] feat: add `Series|Expr.rank` (#1342) * pandas int workaround * old pyarrow * fail pandas_pyarrow for pandas < (2,1) * xfail int only * fix options in over * merge main and better return docstring * float(nan) -> None * test eager only for rank --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/expr.py | 10 +++ narwhals/_arrow/series.py | 30 +++++++ narwhals/_pandas_like/expr.py | 24 +++++- narwhals/_pandas_like/series.py | 50 +++++++++++ narwhals/expr.py | 97 +++++++++++++++++++++ narwhals/series.py | 95 ++++++++++++++++++++ tests/expr_and_series/rank_test.py | 134 +++++++++++++++++++++++++++++ 9 files changed, 439 insertions(+), 3 deletions(-) create mode 100644 tests/expr_and_series/rank_test.py diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 299ab2d4a..e0f7b6578 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -47,6 +47,7 @@ - over - pipe - quantile + - rank - replace_strict - rolling_mean - rolling_std diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index c2e35a3c5..0aea494f7 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -54,6 +54,7 @@ - null_count - pipe - quantile + - rank - rename - replace_strict - rolling_mean diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index e511f405d..5ae6ce6b0 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -527,6 +527,16 @@ def rolling_std( ddof=ddof, ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) diff --git a/narwhals/_arrow/series.py 
b/narwhals/_arrow/series.py index 046e26e05..1e8d09827 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -1097,6 +1097,36 @@ def rolling_std( ** 0.5 ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + if method == "average": + msg = ( + "`rank` with `method='average' is not supported for pyarrow backend. " + "The available methods are {'min', 'max', 'dense', 'ordinal'}." + ) + raise ValueError(msg) + + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + sort_keys = "descending" if descending else "ascending" + tiebreaker = "first" if method == "ordinal" else method + + native_series = self._native_series + if self._backend_version < (14, 0, 0): # pragma: no cover + native_series = native_series.combine_chunks() + + null_mask = pc.is_null(native_series) + + rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + + result = pc.if_else(null_mask, pa.scalar(None), rank) + return self._from_native_series(result) + def __iter__(self: Self) -> Iterator[Any]: yield from ( maybe_extract_py_scalar(x, return_py_scalar=True) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index c681fc487..fac9a2ed6 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -34,6 +34,7 @@ # So, instead of using "cumcount" we use "cumsum" on notna() to get the same result "col->cum_count": "cumsum", "col->shift": "shift", + "col->rank": "rank", } @@ -383,7 +384,7 @@ def alias(self, name: str) -> Self: kwargs={**self._kwargs, "name": name}, ) - def over(self, keys: list[str]) -> Self: + def over(self: Self, keys: list[str]) -> Self: if self._function_name in MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: @@ -412,8 +413,15 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: if self._function_name == 
"col->shift": kwargs = {"periods": self._kwargs.get("n", 1)} - else: - # Cumulative operation + elif self._function_name == "col->rank": + _method = self._kwargs.get("method", "average") + kwargs = { + "method": "first" if _method == "ordinal" else _method, + "ascending": not self._kwargs.get("descending", False), + "na_option": "keep", + "pct": False, + } + else: # Cumulative operation kwargs = {"skipna": True} res_native = getattr( @@ -617,6 +625,16 @@ def rolling_std( ddof=ddof, ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def str(self: Self) -> PandasLikeExprStringNamespace: return PandasLikeExprStringNamespace(self) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 60918fd2c..e5c5e771e 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -1119,6 +1119,56 @@ def is_finite(self: Self) -> Self: s = self._native_series return self._from_native_series((s > float("-inf")) & (s < float("inf"))) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + pd_method = "first" if method == "ordinal" else method + native_series = self._native_series + dtypes = import_dtypes_module(self._version) + if ( + self._implementation is Implementation.PANDAS + and self._backend_version < (3,) + and self.dtype + in { + dtypes.Int64, + dtypes.Int32, + dtypes.Int16, + dtypes.Int8, + dtypes.UInt64, + dtypes.UInt32, + dtypes.UInt16, + dtypes.UInt8, + } + and (null_mask := native_series.isna()).any() + ): + # crazy workaround for the case of `na_option="keep"` and nullable + # integer dtypes. 
This should be supported in pandas > 3.0 + # https://github.com/pandas-dev/pandas/issues/56976 + ranked_series = ( + native_series.to_frame() + .assign(**{f"{native_series.name}_is_null": null_mask}) + .groupby(f"{native_series.name}_is_null") + .rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + )[native_series.name] + ) + + else: + ranked_series = native_series.rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + ) + + return self._from_native_series(ranked_series) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/expr.py b/narwhals/expr.py index 3e457989a..809f76e77 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -4114,6 +4114,103 @@ def rolling_std( ) ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """Assign ranks to data, dealing with ties appropriately. + + Notes: + The resulting dtype may differ between backends. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. 
+ + Returns: + A new expression with rank data. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [3, 6, 1, 1, 6]} + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> def agnostic_dense_rank(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... result = df.with_columns(rnk=nw.col("a").rank(method="dense")) + ... return result.to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dense_rank`: + + >>> agnostic_dense_rank(pd.DataFrame(data)) + a rnk + 0 3 2.0 + 1 6 3.0 + 2 1 1.0 + 3 1 1.0 + 4 6 3.0 + + >>> agnostic_dense_rank(pl.DataFrame(data)) + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ rnk │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + │ 6 ┆ 3 │ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 6 ┆ 3 │ + └─────┴─────┘ + + >>> agnostic_dense_rank(pa.table(data)) + pyarrow.Table + a: int64 + rnk: uint64 + ---- + a: [[3,6,1,1,6]] + rnk: [[2,3,1,1,3]] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} + if method not in supported_rank_methods: + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. 
" + f"Found '{method}'" + ) + raise ValueError(msg) + + return self.__class__( + lambda plx: self._to_compliant_expr(plx).rank( + method=method, descending=descending + ) + ) + @property def str(self: Self) -> ExprStringNamespace[Self]: return ExprStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index c3e6f181b..7b4cfbf6e 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4738,6 +4738,101 @@ def __iter__(self: Self) -> Iterator[Any]: def __contains__(self: Self, other: Any) -> bool: return self._compliant_series.__contains__(other) # type: ignore[no-any-return] + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """Assign ranks to data, dealing with ties appropriately. + + Notes: + The resulting dtype may differ between backends. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Returns: + A new series with rank data as values. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> + >>> data = [3, 6, 1, 1, 6] + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> def agnostic_dense_rank(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rank(method="dense").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dense_rank`: + + >>> agnostic_dense_rank(pd.Series(data)) + 0 2.0 + 1 3.0 + 2 1.0 + 3 1.0 + 4 3.0 + dtype: float64 + + >>> agnostic_dense_rank(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [u32] + [ + 2 + 3 + 1 + 1 + 3 + ] + + >>> agnostic_dense_rank(pa.chunked_array([data])) # doctest:+ELLIPSIS + + [ + [ + 2, + 3, + 1, + 1, + 3 + ] + ] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} + if method not in supported_rank_methods: + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. 
" + f"Found '{method}'" + ) + raise ValueError(msg) + + return self._from_compliant_series( + self._compliant_series.rank(method=method, descending=descending) + ) + @property def str(self: Self) -> SeriesStringNamespace[Self]: return SeriesStringNamespace(self) diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py new file mode 100644 index 000000000..99a64371e --- /dev/null +++ b/tests/expr_and_series/rank_test.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from contextlib import nullcontext as does_not_raise +from typing import Literal + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import PANDAS_VERSION +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +rank_methods = ["average", "min", "max", "dense", "ordinal"] + +data_int = {"a": [3, 6, 1, 1, None, 6], "b": [1, 1, 2, 1, 2, 2]} +data_float = {"a": [3.1, 6.1, 1.5, 1.5, None, 6.1], "b": [1, 1, 2, 1, 2, 2]} + +expected = { + "average": [3.0, 4.5, 1.5, 1.5, None, 4.5], + "min": [3, 4, 1, 1, None, 4], + "max": [3, 5, 2, 2, None, 5], + "dense": [2, 3, 1, 1, None, 3], + "ordinal": [3, 4, 1, 2, None, 5], +} + +expected_over = { + "average": [2.0, 3.0, 1.0, 1.0, None, 2.0], + "min": [2, 3, 1, 1, None, 2], + "max": [2, 3, 1, 1, None, 2], + "dense": [2, 3, 1, 1, None, 2], + "ordinal": [2, 3, 1, 1, None, 2], +} + + +@pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) +def test_rank_expr( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, list[float]], +) -> None: + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if 
"pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor_eager(data)) + + result = df.select(nw.col("a").rank(method=method)) + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) +def test_rank_series( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, list[float]], +) -> None: + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result = {"a": df["a"].rank(method=method)} + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +def test_rank_expr_in_over_context( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], +) -> None: + if any(x in str(constructor_eager) for x in ("pyarrow_table", "dask")): + # Pyarrow raises: + # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank + # We can handle that to provide a better error message. 
+ request.applymarker(pytest.mark.xfail) + + if "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1): + request.applymarker(pytest.mark.xfail) + + df = nw.from_native(constructor_eager(data_float)) + + result = df.select(nw.col("a").rank(method=method).over("b")) + expected_data = {"a": expected_over[method]} + assert_equal_data(result, expected_data) + + +def test_invalid_method_raise(constructor_eager: ConstructorEager) -> None: + method = "invalid_method_name" + df = nw.from_native(constructor_eager(data_float)) + + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + + with pytest.raises(ValueError, match=msg): + df.select(nw.col("a").rank(method=method)) # type: ignore[arg-type] + + with pytest.raises(ValueError, match=msg): + df.lazy().collect()["a"].rank(method=method) # type: ignore[arg-type] From 320d6bc72a000f2302f48f00ae4a44ac22101c28 Mon Sep 17 00:00:00 2001 From: Dhanunjaya Elluri Date: Tue, 7 Jan 2025 12:11:04 +0100 Subject: [PATCH 16/35] test: remove `cudf` from `tests/expr_and_series/replace_time_zone_test.py::test_replace_time_zone_none[cudf]` (#1748) test: remove `cudf` from `replace_time_zone_test.py::test_replace_time_zone_none` --- tests/expr_and_series/replace_time_zone_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py index eed90feb1..132c4efc5 100644 --- a/tests/expr_and_series/replace_time_zone_test.py +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -53,7 +53,6 @@ def test_replace_time_zone_none( or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) - or ("cudf" in str(constructor)) or ("duckdb" in str(constructor)) ): request.applymarker(pytest.mark.xfail) From 
44dd8d8b048bbaf4d0babc5cc72840399355697a Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 11:40:39 +0000 Subject: [PATCH 17/35] fix: fix license classifier (#1751) --- pyproject.toml | 2 +- tests/expr_and_series/rolling_mean_test.py | 1 + tests/expr_and_series/rolling_sum_test.py | 1 + tests/expr_and_series/rolling_var_test.py | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bb89564b7..f2800292e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,8 +14,8 @@ authors = [ description = "Extremely lightweight compatibility layer between dataframe libraries" readme = "README.md" keywords = ["dataframes", "interoperability", "pandas", "polars", "pyarrow", "dask", "modin", "cudf"] -license = {file = "LICENSE.md"} classifiers = [ + "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Operating System :: OS Independent", ] diff --git a/tests/expr_and_series/rolling_mean_test.py b/tests/expr_and_series/rolling_mean_test.py index 2fb6a47fb..b91ecb27e 100644 --- a/tests/expr_and_series/rolling_mean_test.py +++ b/tests/expr_and_series/rolling_mean_test.py @@ -76,6 +76,7 @@ def test_rolling_mean_series(constructor_eager: ConstructorEager) -> None: @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.slow @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") def test_rolling_mean_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) n_missing = random.randint(0, len(s) - 1) # noqa: S311 diff --git a/tests/expr_and_series/rolling_sum_test.py b/tests/expr_and_series/rolling_sum_test.py index 0e3951958..f63786051 100644 --- a/tests/expr_and_series/rolling_sum_test.py +++ b/tests/expr_and_series/rolling_sum_test.py @@ -194,6 +194,7 @@ def test_rolling_sum_series_invalid_params( ) 
@pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") @pytest.mark.slow def test_rolling_sum_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 4d4c596d9..3e5bb0c96 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -105,6 +105,7 @@ def test_rolling_var_series( ) @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.skipif(POLARS_VERSION < (1,), reason="different null behavior") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") def test_rolling_var_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) From 77e18c128ad328b80c21e799820db8d51ac32042 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 11:49:15 +0000 Subject: [PATCH 18/35] release: Bump version to 1.21.1 (#1752) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index a9d192515..8857b8029 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute: ```python >>> import narwhals >>> narwhals.__version__ -'1.21.0' +'1.21.1' ``` If you see the version number, then the installation was successful! 
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 7c622fca6..ac0a8cbe4 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -79,7 +79,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.21.0" +__version__ = "1.21.1" __all__ = [ "Array", diff --git a/pyproject.toml b/pyproject.toml index f2800292e..4d904d5b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.21.0" +version = "1.21.1" dependencies = [] requires-python = ">=3.8" authors = [ From 92e3b87a5f491348d24d4f64b1fe255831fd827d Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 11:59:43 +0000 Subject: [PATCH 19/35] chore: Filter left join warning (#1753) chore: ignore reverted warning --- pyproject.toml | 2 ++ tests/frame/join_test.py | 3 --- tests/hypothesis/join_test.py | 1 - tests/spark_like_test.py | 3 --- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4d904d5b2..bea188a59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,6 +162,8 @@ filterwarnings = [ 'ignore:.*You are using pyarrow version', # This warning was temporarily raised by pandas but then reverted. 'ignore:.*Passing a BlockManager to DataFrame:DeprecationWarning', + # This warning was temporarily raised by Polars but then reverted. 
+ 'ignore:.*The default coalesce behavior of left join will change:DeprecationWarning', ] xfail_strict = true markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"] diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 4aa68e571..7332cb254 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -233,7 +233,6 @@ def test_join_not_implemented(constructor: Constructor, how: str) -> None: df.join(df, left_on="antananarivo", right_on="antananarivo", how=how) # type: ignore[arg-type] -@pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join(constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], @@ -272,7 +271,6 @@ def test_left_join(constructor: Constructor) -> None: assert_equal_data(result_on_list, expected_on_list) -@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_multiple_column(constructor: Constructor) -> None: data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "index": [0, 1, 2]} @@ -290,7 +288,6 @@ def test_left_join_multiple_column(constructor: Constructor) -> None: assert_equal_data(result, expected) -@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_overlapping_column(constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], diff --git a/tests/hypothesis/join_test.py b/tests/hypothesis/join_test.py index 7f1cd8103..da4a61679 100644 --- a/tests/hypothesis/join_test.py +++ b/tests/hypothesis/join_test.py @@ -134,7 +134,6 @@ def test_cross_join( # pragma: no cover ), ) @pytest.mark.slow -@pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join( # pragma: no cover a_left_data: list[int], b_left_data: list[int], diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index 3d67eac53..f0c66ab04 100644 --- a/tests/spark_like_test.py +++ 
b/tests/spark_like_test.py @@ -830,7 +830,6 @@ def test_semi_join( assert_equal_data(result, expected) -@pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join(pyspark_constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], @@ -874,7 +873,6 @@ def test_left_join(pyspark_constructor: Constructor) -> None: assert_equal_data(result_on_list, expected_on_list) -@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_multiple_column(pyspark_constructor: Constructor) -> None: data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "idx": [0, 1, 2]} @@ -894,7 +892,6 @@ def test_left_join_multiple_column(pyspark_constructor: Constructor) -> None: assert_equal_data(result, expected) -@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_overlapping_column(pyspark_constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], From 46a030a9d9dc94fdad2866f6766d00c1491287c1 Mon Sep 17 00:00:00 2001 From: Dhanunjaya Elluri Date: Tue, 7 Jan 2025 13:21:28 +0100 Subject: [PATCH 20/35] feat: add `SparkLikeExpr` methods: `median`, `clip`, `is_between`, `is_duplicated`, `is_finite`, `is_in`, `is_unique`, `len`, `round` and `skew`(#1721) * feat(spark): add missing methods to SparkLikeExpr * feat(spark): add few missing methods * fix: add xfail to median when python<3.9 * fix: fixing reviewd requests & updated tests * fix: fix `PYSPARK_VERSION` for `median` calculation * fix: fix refactor issue * fix: remove `is_nan` method * fix: fixing `is_duplicated` & `is_unique` & remove `n_unique` --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- CONTRIBUTING.md | 4 + narwhals/_spark_like/expr.py | 179 ++++++++++++++++++++++++++++++----- narwhals/utils.py | 6 +- tests/spark_like_test.py | 160 ++++++++++++++++++++++++++++++- 4 files changed, 323 
insertions(+), 26 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f8a6eb0b..af0eb1cbc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -78,6 +78,10 @@ where `YOUR-GITHUB-USERNAME` will be your GitHub user name. Here's how you can set up your local development environment to contribute. +#### Prerequisites for PySpark tests + +If you want to run PySpark-related tests, you'll need to have Java installed. Refer to the [Spark documentation](https://spark.apache.org/docs/latest/#downloading) for more information. + #### Option 1: Use UV (recommended) 1. Make sure you have Python3.12 installed, create a virtual environment, diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index d190b5667..66826a6ab 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -160,6 +160,11 @@ def __gt__(self, other: SparkLikeExpr) -> Self: returns_scalar=False, ) + def abs(self) -> Self: + from pyspark.sql import functions as F # noqa: N812 + + return self._from_call(F.abs, "abs", returns_scalar=self._returns_scalar) + def alias(self, name: str) -> Self: def _alias(df: SparkLikeLazyFrame) -> list[Column]: return [col.alias(name) for col in self._call(df)] @@ -179,44 +184,42 @@ def _alias(df: SparkLikeLazyFrame) -> list[Column]: ) def count(self) -> Self: - def _count(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 + from pyspark.sql import functions as F # noqa: N812 - return F.count(_input) - - return self._from_call(_count, "count", returns_scalar=True) + return self._from_call(F.count, "count", returns_scalar=True) def max(self) -> Self: - def _max(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 + from pyspark.sql import functions as F # noqa: N812 - return F.max(_input) - - return self._from_call(_max, "max", returns_scalar=True) + return self._from_call(F.max, "max", returns_scalar=True) def mean(self) -> Self: - def _mean(_input: Column) -> Column: + 
from pyspark.sql import functions as F # noqa: N812 + + return self._from_call(F.mean, "mean", returns_scalar=True) + + def median(self) -> Self: + def _median(_input: Column) -> Column: + import pyspark # ignore-banned-import from pyspark.sql import functions as F # noqa: N812 - return F.mean(_input) + if parse_version(pyspark.__version__) < (3, 4): + # Use percentile_approx with default accuracy parameter (10000) + return F.percentile_approx(_input.cast("double"), 0.5) - return self._from_call(_mean, "mean", returns_scalar=True) + return F.median(_input) - def min(self) -> Self: - def _min(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 + return self._from_call(_median, "median", returns_scalar=True) - return F.min(_input) + def min(self) -> Self: + from pyspark.sql import functions as F # noqa: N812 - return self._from_call(_min, "min", returns_scalar=True) + return self._from_call(F.min, "min", returns_scalar=True) def sum(self) -> Self: - def _sum(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 - - return F.sum(_input) + from pyspark.sql import functions as F # noqa: N812 - return self._from_call(_sum, "sum", returns_scalar=True) + return self._from_call(F.sum, "sum", returns_scalar=True) def std(self: Self, ddof: int) -> Self: from functools import partial @@ -239,3 +242,133 @@ def var(self: Self, ddof: int) -> Self: func = partial(_var, ddof=ddof, np_version=parse_version(np.__version__)) return self._from_call(func, "var", returns_scalar=True, ddof=ddof) + + def clip( + self, + lower_bound: Any | None = None, + upper_bound: Any | None = None, + ) -> Self: + def _clip(_input: Column, lower_bound: Any, upper_bound: Any) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + result = _input + if lower_bound is not None: + # Convert lower_bound to a literal Column + result = F.when(result < lower_bound, F.lit(lower_bound)).otherwise( + result + ) + if upper_bound is not None: + # 
Convert upper_bound to a literal Column + result = F.when(result > upper_bound, F.lit(upper_bound)).otherwise( + result + ) + return result + + return self._from_call( + _clip, + "clip", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def is_between( + self, + lower_bound: Any, + upper_bound: Any, + closed: str, + ) -> Self: + def _is_between(_input: Column, lower_bound: Any, upper_bound: Any) -> Column: + if closed == "both": + return (_input >= lower_bound) & (_input <= upper_bound) + if closed == "none": + return (_input > lower_bound) & (_input < upper_bound) + if closed == "left": + return (_input >= lower_bound) & (_input < upper_bound) + return (_input > lower_bound) & (_input <= upper_bound) + + return self._from_call( + _is_between, + "is_between", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def is_duplicated(self) -> Self: + def _is_duplicated(_input: Column) -> Column: + from pyspark.sql import Window + from pyspark.sql import functions as F # noqa: N812 + + # Create a window spec that treats each value separately. 
+ return F.count("*").over(Window.partitionBy(_input)) > 1 + + return self._from_call( + _is_duplicated, "is_duplicated", returns_scalar=self._returns_scalar + ) + + def is_finite(self) -> Self: + def _is_finite(_input: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + # A value is finite if it's not NaN, not NULL, and not infinite + return ( + ~F.isnan(_input) + & ~F.isnull(_input) + & (_input != float("inf")) + & (_input != float("-inf")) + ) + + return self._from_call( + _is_finite, "is_finite", returns_scalar=self._returns_scalar + ) + + def is_in(self, values: Sequence[Any]) -> Self: + def _is_in(_input: Column, values: Sequence[Any]) -> Column: + return _input.isin(values) + + return self._from_call( + _is_in, + "is_in", + values=values, + returns_scalar=self._returns_scalar, + ) + + def is_unique(self) -> Self: + def _is_unique(_input: Column) -> Column: + from pyspark.sql import Window + from pyspark.sql import functions as F # noqa: N812 + + # Create a window spec that treats each value separately + return F.count("*").over(Window.partitionBy(_input)) == 1 + + return self._from_call( + _is_unique, "is_unique", returns_scalar=self._returns_scalar + ) + + def len(self) -> Self: + def _len(_input: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + # Use count(*) to count all rows including nulls + return F.count("*") + + return self._from_call(_len, "len", returns_scalar=True) + + def round(self, decimals: int) -> Self: + def _round(_input: Column, decimals: int) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + return F.round(_input, decimals) + + return self._from_call( + _round, + "round", + decimals=decimals, + returns_scalar=self._returns_scalar, + ) + + def skew(self) -> Self: + from pyspark.sql import functions as F # noqa: N812 + + return self._from_call(F.skewness, "skew", returns_scalar=True) diff --git a/narwhals/utils.py b/narwhals/utils.py index b8e9830e1..591cd53ae 100644 
--- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -155,7 +155,11 @@ def is_pandas_like(self) -> bool: >>> df.implementation.is_pandas_like() True """ - return self in {Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF} + return self in { + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + } def is_polars(self) -> bool: """Return whether implementation is Polars. diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index f0c66ab04..c929f4f85 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -271,6 +271,14 @@ def test_add(pyspark_constructor: Constructor) -> None: assert_equal_data(result, expected) +def test_abs(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 2, 3, -4, 5]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select(nw.col("a").abs()) + expected = {"a": [1, 2, 3, 4, 5]} + assert_equal_data(result, expected) + + # copied from tests/expr_and_series/all_horizontal_test.py @pytest.mark.parametrize("expr1", ["a", nw.col("a")]) @pytest.mark.parametrize("expr2", ["b", nw.col("b")]) @@ -569,7 +577,9 @@ def test_drop_nulls(pyspark_constructor: Constructor) -> None: ], ) def test_drop_nulls_subset( - pyspark_constructor: Constructor, subset: str | list[str], expected: dict[str, float] + pyspark_constructor: Constructor, + subset: str | list[str], + expected: dict[str, float], ) -> None: data = { "a": [1.0, 2.0, None, 4.0], @@ -720,7 +730,8 @@ def test_cross_join(pyspark_constructor: Constructor) -> None: assert_equal_data(result, expected) with pytest.raises( - ValueError, match="Can not pass `left_on`, `right_on` or `on` keys for cross join" + ValueError, + match="Can not pass `left_on`, `right_on` or `on` keys for cross join", ): df.join(other, how="cross", left_on="antananarivo") # type: ignore[arg-type] @@ -940,3 +951,148 @@ def test_left_join_overlapping_column(pyspark_constructor: Constructor) -> None: "c": [4.0, 6.0, None], } assert_equal_data(result, 
expected) + + +# Copied from tests/expr_and_series/median_test.py +def test_median(pyspark_constructor: Constructor) -> None: + data = {"a": [3, 8, 2, None], "b": [5, 5, None, 7], "z": [7.0, 8, 9, None]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select( + a=nw.col("a").median(), b=nw.col("b").median(), z=nw.col("z").median() + ) + expected = {"a": [3.0], "b": [5.0], "z": [8.0]} + assert_equal_data(result, expected) + + +# copied from tests/expr_and_series/clip_test.py +def test_clip(pyspark_constructor: Constructor) -> None: + df = nw.from_native(pyspark_constructor({"a": [1, 2, 3, -4, 5]})) + result = df.select( + lower_only=nw.col("a").clip(lower_bound=3), + upper_only=nw.col("a").clip(upper_bound=4), + both=nw.col("a").clip(3, 4), + ) + expected = { + "lower_only": [3, 3, 3, 3, 5], + "upper_only": [1, 2, 3, -4, 4], + "both": [3, 3, 3, 3, 4], + } + assert_equal_data(result, expected) + + +# copied from tests/expr_and_series/is_between_test.py +@pytest.mark.parametrize( + ("closed", "expected"), + [ + ("left", [True, True, True, False]), + ("right", [False, True, True, True]), + ("both", [True, True, True, True]), + ("none", [False, True, True, False]), + ], +) +def test_is_between( + pyspark_constructor: Constructor, closed: str, expected: list[bool] +) -> None: + data = {"a": [1, 4, 2, 5]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select(nw.col("a").is_between(1, 5, closed=closed)) + expected_dict = {"a": expected} + assert_equal_data(result, expected_dict) + + +# copied from tests/expr_and_series/is_duplicated_test.py +def test_is_duplicated(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 1, 2, None], "b": [1, 2, None, None], "level_0": [0, 1, 2, 3]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select( + a=nw.col("a").is_duplicated(), + b=nw.col("b").is_duplicated(), + level_0=nw.col("level_0"), + ).sort("level_0") + expected = { + "a": [True, True, False, False], + "b": [False, False, 
True, True], + "level_0": [0, 1, 2, 3], + } + assert_equal_data(result, expected) + + +# copied from tests/expr_and_series/is_finite_test.py +def test_is_finite(pyspark_constructor: Constructor) -> None: + data = {"a": [float("nan"), float("inf"), 2.0, None]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select(finite=nw.col("a").is_finite()) + expected = {"finite": [False, False, True, False]} + assert_equal_data(result, expected) + + +def test_is_in(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 2, 3, 4, 5]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select(in_list=nw.col("a").is_in([2, 4])) + expected = {"in_list": [False, True, False, True, False]} + assert_equal_data(result, expected) + + +# copied from tests/expr_and_series/is_unique_test.py +def test_is_unique(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 1, 2, None], "b": [1, 2, None, None], "level_0": [0, 1, 2, 3]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select( + a=nw.col("a").is_unique(), + b=nw.col("b").is_unique(), + level_0=nw.col("level_0"), + ).sort("level_0") + expected = { + "a": [False, False, True, True], + "b": [True, True, False, False], + "level_0": [0, 1, 2, 3], + } + assert_equal_data(result, expected) + + +def test_len(pyspark_constructor: Constructor) -> None: + data = {"a": [1, 2, float("nan"), 4, None], "b": [None, 3, None, 5, None]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select( + a=nw.col("a").len(), + b=nw.col("b").len(), + ) + expected = {"a": [5], "b": [5]} + assert_equal_data(result, expected) + + +# Copied from tests/expr_and_series/round_test.py +@pytest.mark.parametrize("decimals", [0, 1, 2]) +def test_round(pyspark_constructor: Constructor, decimals: int) -> None: + data = {"a": [2.12345, 2.56789, 3.901234]} + df = nw.from_native(pyspark_constructor(data)) + + expected_data = {k: [round(e, decimals) for e in v] for k, v in data.items()} + result_frame = 
df.select(nw.col("a").round(decimals)) + assert_equal_data(result_frame, expected_data) + + +# copied from tests/expr_and_series/skew_test.py +@pytest.mark.parametrize( + ("data", "expected"), + [ + pytest.param( + [], + None, + marks=pytest.mark.skip( + reason="PySpark cannot infer schema from empty datasets" + ), + ), + ([1], None), + ([1, 2], 0.0), + ([0.0, 0.0, 0.0], None), + ([1, 2, 3, 2, 1], 0.343622), + ], +) +def test_skew( + pyspark_constructor: Constructor, data: list[float], expected: float | None +) -> None: + df = nw.from_native(pyspark_constructor({"a": data})) + result = df.select(skew=nw.col("a").skew()) + assert_equal_data(result, {"skew": [expected]}) From a6d76e17c3ae3111af7f5a619c47767fa4790e25 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 14:58:03 +0000 Subject: [PATCH 21/35] chore: remove some expr._kwargs defaults (#1747) --- narwhals/_arrow/group_by.py | 2 +- narwhals/_dask/group_by.py | 4 +--- narwhals/_pandas_like/expr.py | 4 ++-- narwhals/_pandas_like/group_by.py | 5 ++--- narwhals/_spark_like/group_by.py | 2 +- 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index ffb16578f..11ed914fe 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -171,7 +171,7 @@ def agg_arrow( function_name = remove_prefix(expr._function_name, "col->") if function_name in {"std", "var"}: - option = pc.VarianceOptions(ddof=expr._kwargs.get("ddof", 1)) + option = pc.VarianceOptions(ddof=expr._kwargs["ddof"]) elif function_name in {"len", "n_unique"}: option = pc.CountOptions(mode="all") elif function_name == "count": diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 7bda88ee5..243b21b71 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -178,9 +178,7 @@ def agg_dask( function_name = remove_prefix(expr._function_name, "col->") kwargs = ( - {"ddof": expr._kwargs.get("ddof", 1)} - if 
function_name in {"std", "var"} - else {} + {"ddof": expr._kwargs["ddof"]} if function_name in {"std", "var"} else {} ) agg_function = POLARS_TO_DASK_AGGREGATIONS.get(function_name, function_name) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index fac9a2ed6..34d05b7eb 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -412,12 +412,12 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: df = df.with_columns(~plx.col(*self._root_names).is_null()) if self._function_name == "col->shift": - kwargs = {"periods": self._kwargs.get("n", 1)} + kwargs = {"periods": self._kwargs["n"]} elif self._function_name == "col->rank": _method = self._kwargs.get("method", "average") kwargs = { "method": "first" if _method == "ordinal" else _method, - "ascending": not self._kwargs.get("descending", False), + "ascending": not self._kwargs["descending"], "na_option": "keep", "pct": False, } diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 0f1000606..a1eca5b5d 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -209,14 +209,13 @@ def agg_pandas( # noqa: PLR0915 is_n_unique = function_name == "nunique" is_std = function_name == "std" is_var = function_name == "var" - ddof = expr._kwargs.get("ddof", 1) for root_name, output_name in zip(expr._root_names, expr._output_names): if is_n_unique: nunique_aggs[output_name] = root_name - elif is_std and ddof != 1: + elif is_std and (ddof := expr._kwargs["ddof"]) != 1: std_aggs[ddof][0].append(root_name) std_aggs[ddof][1].append(output_name) - elif is_var and ddof != 1: + elif is_var and (ddof := expr._kwargs["ddof"]) != 1: var_aggs[ddof][0].append(root_name) var_aggs[ddof][1].append(output_name) else: diff --git a/narwhals/_spark_like/group_by.py b/narwhals/_spark_like/group_by.py index 7f3dc077d..0100500ff 100644 --- a/narwhals/_spark_like/group_by.py +++ b/narwhals/_spark_like/group_by.py @@ -85,7 +85,7 
@@ def get_spark_function(function_name: str, **kwargs: Any) -> Column: return partial( _std if function_name == "std" else _var, - ddof=kwargs.get("ddof", 1), + ddof=kwargs["ddof"], np_version=parse_version(np.__version__), ) from pyspark.sql import functions as F # noqa: N812 From 3672e86f0a2356869637848ccd13c41852ad1c28 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 15:39:04 +0000 Subject: [PATCH 22/35] chore: validate predicates in `nw.when` one level higher (#1756) * chore: validate predicates in `nw.when` one level higher * sort out fail --- narwhals/_arrow/namespace.py | 7 +------ narwhals/_dask/namespace.py | 7 +------ narwhals/_pandas_like/namespace.py | 7 +------ narwhals/expr.py | 3 +++ tests/expr_and_series/when_test.py | 6 +----- 5 files changed, 7 insertions(+), 23 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 99f043ebd..b02ad32ee 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -359,12 +359,7 @@ def when( *predicates: IntoArrowExpr, ) -> ArrowWhen: plx = self.__class__(backend_version=self._backend_version, version=self._version) - if predicates: - condition = plx.all_horizontal(*predicates) - else: - msg = "at least one predicate needs to be provided" - raise TypeError(msg) - + condition = plx.all_horizontal(*predicates) return ArrowWhen(condition, self._backend_version, version=self._version) def concat_str( diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index d9a1a8ac6..9a16d7f13 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -310,12 +310,7 @@ def when( *predicates: IntoDaskExpr, ) -> DaskWhen: plx = self.__class__(backend_version=self._backend_version, version=self._version) - if predicates: - condition = plx.all_horizontal(*predicates) - else: - msg = "at least one predicate needs to be provided" - raise TypeError(msg) - + condition = plx.all_horizontal(*predicates) return DaskWhen( 
condition, self._backend_version, returns_scalar=False, version=self._version ) diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 7885d7de0..212c9c938 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -371,12 +371,7 @@ def when( plx = self.__class__( self._implementation, self._backend_version, version=self._version ) - if predicates: - condition = plx.all_horizontal(*predicates) - else: - msg = "at least one predicate needs to be provided" - raise TypeError(msg) - + condition = plx.all_horizontal(*predicates) return PandasWhen( condition, self._implementation, self._backend_version, version=self._version ) diff --git a/narwhals/expr.py b/narwhals/expr.py index 809f76e77..653300da8 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -7643,6 +7643,9 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: class When: def __init__(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> None: self._predicates = flatten([predicates]) + if not self._predicates: + msg = "At least one predicate needs to be provided to `narwhals.when`." 
+ raise TypeError(msg) def _extract_predicates(self, plx: Any) -> Any: return [extract_compliant(plx, v) for v in self._predicates] diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index b59dda488..739b00e2d 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -54,11 +54,7 @@ def test_multiple_conditions( assert_equal_data(result, expected) -def test_no_arg_when_fail( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_no_arg_when_fail(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) with pytest.raises((TypeError, ValueError)): df.select(nw.when().then(value=3).alias("a_when")) From 1bdf4dc88a73c393ae6a92a5d1c62fed4086801f Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 7 Jan 2025 17:34:21 +0000 Subject: [PATCH 23/35] chore: Remove some unnecessary trailing commas (#1757) --- .github/workflows/downstream_tests.yml | 2 +- narwhals/_arrow/dataframe.py | 50 +++-------- narwhals/_arrow/expr.py | 56 +++--------- narwhals/_dask/expr.py | 118 +++++-------------------- narwhals/_duckdb/expr.py | 45 +++------- 5 files changed, 56 insertions(+), 215 deletions(-) diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 548251ddc..5ad95b6d9 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -220,7 +220,7 @@ jobs: run: | cd tea-tasting pdm remove narwhals - pdm add ./.. 
+ pdm add ./..[dev] - name: show-deps run: | cd tea-tasting diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index f4ad2912e..e6bb6fa65 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -101,23 +101,14 @@ def row(self: Self, index: int) -> tuple[Any, ...]: return tuple(col[index] for col in self._native_frame) @overload - def rows( - self: Self, - *, - named: Literal[True], - ) -> list[dict[str, Any]]: ... + def rows(self: Self, *, named: Literal[True]) -> list[dict[str, Any]]: ... @overload - def rows( - self: Self, - *, - named: Literal[False], - ) -> list[tuple[Any, ...]]: ... + def rows(self: Self, *, named: Literal[False]) -> list[tuple[Any, ...]]: ... + @overload def rows( - self: Self, - *, - named: bool, + self: Self, *, named: bool ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... def rows(self: Self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: @@ -126,10 +117,7 @@ def rows(self: Self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, A return self._native_frame.to_pylist() # type: ignore[no-any-return] def iter_rows( - self: Self, - *, - named: bool, - buffer_size: int, + self: Self, *, named: bool, buffer_size: int ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: df = self._native_frame num_rows = df.num_rows @@ -263,9 +251,7 @@ def __getitem__( ) start = item.start or 0 stop = item.stop if item.stop is not None else len(self._native_frame) - return self._from_native_frame( - self._native_frame.slice(start, stop - start), - ) + return self._from_native_frame(self._native_frame.slice(start, stop - start)) elif isinstance(item, Sequence) or (is_numpy_array(item) and item.ndim == 1): if ( @@ -301,11 +287,7 @@ def estimated_size(self: Self, unit: SizeUnit) -> int | float: def columns(self: Self) -> list[str]: return self._native_frame.schema.names # type: ignore[no-any-return] - def select( - self: Self, - *exprs: IntoArrowExpr, - **named_exprs: IntoArrowExpr, - 
) -> Self: + def select(self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr) -> Self: import pyarrow as pa new_series = evaluate_into_exprs(self, *exprs, **named_exprs) @@ -313,16 +295,11 @@ def select( # return empty dataframe, like Polars does return self._from_native_frame(self._native_frame.__class__.from_arrays([])) names = [s.name for s in new_series] - df = pa.Table.from_arrays( - broadcast_series(new_series), - names=names, - ) + df = pa.Table.from_arrays(broadcast_series(new_series), names=names) return self._from_native_frame(df) def with_columns( - self: Self, - *exprs: IntoArrowExpr, - **named_exprs: IntoArrowExpr, + self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr ) -> Self: native_frame = self._native_frame new_columns = evaluate_into_exprs(self, *exprs, **named_exprs) @@ -334,9 +311,7 @@ def with_columns( col_name = col_value.name column = validate_dataframe_comparand( - length=length, - other=col_value, - backend_version=self._backend_version, + length=length, other=col_value, backend_version=self._backend_version ) native_frame = ( @@ -611,12 +586,9 @@ def is_duplicated(self: Self) -> ArrowSeries: columns = self.columns index_token = generate_temporary_column_name(n_bytes=8, columns=columns) col_token = generate_temporary_column_name( - n_bytes=8, - columns=[*columns, index_token], + n_bytes=8, columns=[*columns, index_token] ) - df = self.with_row_index(index_token)._native_frame - row_count = ( df.append_column(col_token, pa.repeat(pa.scalar(1), len(self))) .group_by(columns) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 5ae6ce6b0..1c0d0734e 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -87,8 +87,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: except KeyError as e: missing_columns = [x for x in column_names if x not in df.columns] raise ColumnNotFoundError.from_missing_and_available_column_names( - missing_columns=missing_columns, - available_columns=df.columns, + 
missing_columns=missing_columns, available_columns=df.columns ) from e return cls( @@ -564,9 +563,7 @@ def __init__(self: Self, expr: ArrowExpr) -> None: def get_categories(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "cat", - "get_categories", + self._compliant_expr, "cat", "get_categories" ) @@ -676,12 +673,7 @@ def len_chars(self: Self) -> ArrowExpr: ) def replace( - self: Self, - pattern: str, - value: str, - *, - literal: bool, - n: int, + self: Self, pattern: str, value: str, *, literal: bool, n: int ) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, @@ -693,13 +685,7 @@ def replace( n=n, ) - def replace_all( - self: Self, - pattern: str, - value: str, - *, - literal: bool, - ) -> ArrowExpr: + def replace_all(self: Self, pattern: str, value: str, *, literal: bool) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, "str", @@ -711,26 +697,17 @@ def replace_all( def strip_chars(self: Self, characters: str | None) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "strip_chars", - characters=characters, + self._compliant_expr, "str", "strip_chars", characters=characters ) def starts_with(self: Self, prefix: str) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "starts_with", - prefix=prefix, + self._compliant_expr, "str", "starts_with", prefix=prefix ) def ends_with(self: Self, suffix: str) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "ends_with", - suffix=suffix, + self._compliant_expr, "str", "ends_with", suffix=suffix ) def contains(self, pattern: str, *, literal: bool) -> ArrowExpr: @@ -745,24 +722,17 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowExpr: def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002 return reuse_series_namespace_implementation( - 
self._compliant_expr, - "str", - "to_datetime", - format=format, + self._compliant_expr, "str", "to_datetime", format=format ) def to_uppercase(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_uppercase", + self._compliant_expr, "str", "to_uppercase" ) def to_lowercase(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_lowercase", + self._compliant_expr, "str", "to_lowercase" ) @@ -931,8 +901,4 @@ def __init__(self: Self, expr: ArrowExpr) -> None: self._expr = expr def len(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._expr, - "list", - "len", - ) + return reuse_series_namespace_implementation(self._expr, "list", "len") diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index c76593404..cb20fa616 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -307,11 +307,7 @@ def __invert__(self: Self) -> Self: ) def mean(self) -> Self: - return self._from_call( - lambda _input: _input.mean(), - "mean", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.mean(), "mean", returns_scalar=True) def median(self) -> Self: from narwhals.exceptions import InvalidOperationError @@ -326,18 +322,10 @@ def func(s: dask_expr.Series) -> dask_expr.Series: return self._from_call(func, "median", returns_scalar=True) def min(self) -> Self: - return self._from_call( - lambda _input: _input.min(), - "min", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.min(), "min", returns_scalar=True) def max(self) -> Self: - return self._from_call( - lambda _input: _input.max(), - "max", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.max(), "max", returns_scalar=True) def std(self, ddof: int) -> Self: return self._from_call( @@ -356,11 +344,7 @@ def var(self, ddof: int) -> Self: ) def skew(self: Self) -> Self: - return self._from_call( - lambda _input: 
_input.skew(), - "skew", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.skew(), "skew", returns_scalar=True) def shift(self, n: int) -> Self: return self._from_call( @@ -435,9 +419,7 @@ def is_between( closed = "neither" return self._from_call( lambda _input, lower_bound, upper_bound, closed: _input.between( - lower_bound, - upper_bound, - closed, + lower_bound, upper_bound, closed ), "is_between", lower_bound=lower_bound, @@ -447,17 +429,11 @@ def is_between( ) def sum(self) -> Self: - return self._from_call( - lambda _input: _input.sum(), - "sum", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.sum(), "sum", returns_scalar=True) def count(self) -> Self: return self._from_call( - lambda _input: _input.count(), - "count", - returns_scalar=True, + lambda _input: _input.count(), "count", returns_scalar=True ) def round(self, decimals: int) -> Self: @@ -510,9 +486,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> NoRetur def abs(self) -> Self: return self._from_call( - lambda _input: _input.abs(), - "abs", - returns_scalar=self._returns_scalar, + lambda _input: _input.abs(), "abs", returns_scalar=self._returns_scalar ) def all(self) -> Self: @@ -579,23 +553,17 @@ def clip( def diff(self: Self) -> Self: return self._from_call( - lambda _input: _input.diff(), - "diff", - returns_scalar=self._returns_scalar, + lambda _input: _input.diff(), "diff", returns_scalar=self._returns_scalar ) def n_unique(self: Self) -> Self: return self._from_call( - lambda _input: _input.nunique(dropna=False), - "n_unique", - returns_scalar=True, + lambda _input: _input.nunique(dropna=False), "n_unique", returns_scalar=True ) def is_null(self: Self) -> Self: return self._from_call( - lambda _input: _input.isna(), - "is_null", - returns_scalar=self._returns_scalar, + lambda _input: _input.isna(), "is_null", returns_scalar=self._returns_scalar ) def is_nan(self: Self) -> Self: @@ -606,18 +574,10 @@ def func(_input: 
dask_expr.Series) -> dask_expr.Series: msg = f"`.is_nan` only supported for numeric dtypes and not {dtype}, did you mean `.is_null`?" raise InvalidOperationError(msg) - return self._from_call( - func, - "is_null", - returns_scalar=self._returns_scalar, - ) + return self._from_call(func, "is_null", returns_scalar=self._returns_scalar) def len(self: Self) -> Self: - return self._from_call( - lambda _input: _input.size, - "len", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.size, "len", returns_scalar=True) def quantile( self: Self, @@ -633,10 +593,7 @@ def func(_input: dask_expr.Series, quantile: float) -> dask_expr.Series: return _input.quantile(q=quantile, method="dask") # pragma: no cover return self._from_call( - func, - "quantile", - quantile=quantile, - returns_scalar=True, + func, "quantile", quantile=quantile, returns_scalar=True ) else: msg = "`higher`, `lower`, `midpoint`, `nearest` - interpolation methods are not supported by Dask. Please use `linear` instead." 
@@ -655,13 +612,10 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: first_distinct_index = _input.groupby(_name).agg({col_token: "min"})[ col_token ] - return _input[col_token].isin(first_distinct_index) return self._from_call( - func, - "is_first_distinct", - returns_scalar=self._returns_scalar, + func, "is_first_distinct", returns_scalar=self._returns_scalar ) def is_last_distinct(self: Self) -> Self: @@ -675,13 +629,10 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: implementation=self._implementation, ) last_distinct_index = _input.groupby(_name).agg({col_token: "max"})[col_token] - return _input[col_token].isin(last_distinct_index) return self._from_call( - func, - "is_last_distinct", - returns_scalar=self._returns_scalar, + func, "is_last_distinct", returns_scalar=self._returns_scalar ) def is_duplicated(self: Self) -> Self: @@ -694,11 +645,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: > 1 ) - return self._from_call( - func, - "is_duplicated", - returns_scalar=self._returns_scalar, - ) + return self._from_call(func, "is_duplicated", returns_scalar=self._returns_scalar) def is_unique(self: Self) -> Self: def func(_input: dask_expr.Series) -> dask_expr.Series: @@ -710,11 +657,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: == 1 ) - return self._from_call( - func, - "is_unique", - returns_scalar=self._returns_scalar, - ) + return self._from_call(func, "is_unique", returns_scalar=self._returns_scalar) def is_in(self: Self, other: Any) -> Self: return self._from_call( @@ -788,19 +731,13 @@ def dt(self: Self) -> DaskExprDateTimeNamespace: def name(self: Self) -> DaskExprNameNamespace: return DaskExprNameNamespace(self) - def cast( - self: Self, - dtype: DType | type[DType], - ) -> Self: + def cast(self: Self, dtype: DType | type[DType]) -> Self: def func(_input: Any, dtype: DType | type[DType]) -> Any: dtype = narwhals_to_native_dtype(dtype, self._version) return _input.astype(dtype) return self._from_call( - func, - 
"cast", - dtype=dtype, - returns_scalar=self._returns_scalar, + func, "cast", dtype=dtype, returns_scalar=self._returns_scalar ) def is_finite(self: Self) -> Self: @@ -825,12 +762,7 @@ def len_chars(self) -> DaskExpr: ) def replace( - self, - pattern: str, - value: str, - *, - literal: bool = False, - n: int = 1, + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> DaskExpr: return self._compliant_expr._from_call( lambda _input, pattern, value, literal, n: _input.str.replace( @@ -844,13 +776,7 @@ def replace( returns_scalar=self._compliant_expr._returns_scalar, ) - def replace_all( - self, - pattern: str, - value: str, - *, - literal: bool = False, - ) -> DaskExpr: + def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> DaskExpr: return self._compliant_expr._from_call( lambda _input, pattern, value, literal: _input.str.replace( pattern, value, n=-1, regex=not literal diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 3956e919d..0f33ff846 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -365,9 +365,7 @@ def func( _input: duckdb.Expression, lower_bound: Any, upper_bound: Any ) -> duckdb.Expression: return FunctionExpression( - "greatest", - FunctionExpression("least", _input, upper_bound), - lower_bound, + "greatest", FunctionExpression("least", _input, upper_bound), lower_bound ) return self._from_call( @@ -407,9 +405,7 @@ def sum(self) -> Self: from duckdb import FunctionExpression return self._from_call( - lambda _input: FunctionExpression("sum", _input), - "sum", - returns_scalar=True, + lambda _input: FunctionExpression("sum", _input), "sum", returns_scalar=True ) def count(self) -> Self: @@ -425,9 +421,7 @@ def len(self) -> Self: from duckdb import FunctionExpression return self._from_call( - lambda _input: FunctionExpression("count"), - "len", - returns_scalar=True, + lambda _input: FunctionExpression("count"), "len", returns_scalar=True ) def std(self, ddof: int) -> Self: 
@@ -441,9 +435,7 @@ def std(self, ddof: int) -> Self: msg = f"std with ddof {ddof} is not supported in DuckDB" raise NotImplementedError(msg) return self._from_call( - lambda _input: FunctionExpression(func, _input), - "std", - returns_scalar=True, + lambda _input: FunctionExpression(func, _input), "std", returns_scalar=True ) def var(self, ddof: int) -> Self: @@ -457,34 +449,26 @@ def var(self, ddof: int) -> Self: msg = f"var with ddof {ddof} is not supported in DuckDB" raise NotImplementedError(msg) return self._from_call( - lambda _input: FunctionExpression(func, _input), - "var", - returns_scalar=True, + lambda _input: FunctionExpression(func, _input), "var", returns_scalar=True ) def max(self) -> Self: from duckdb import FunctionExpression return self._from_call( - lambda _input: FunctionExpression("max", _input), - "max", - returns_scalar=True, + lambda _input: FunctionExpression("max", _input), "max", returns_scalar=True ) def min(self) -> Self: from duckdb import FunctionExpression return self._from_call( - lambda _input: FunctionExpression("min", _input), - "min", - returns_scalar=True, + lambda _input: FunctionExpression("min", _input), "min", returns_scalar=True ) def is_null(self) -> Self: return self._from_call( - lambda _input: _input.isnull(), - "is_null", - returns_scalar=self._returns_scalar, + lambda _input: _input.isnull(), "is_null", returns_scalar=self._returns_scalar ) def is_in(self, other: Sequence[Any]) -> Self: @@ -590,9 +574,7 @@ def func(_input: duckdb.Expression) -> duckdb.Expression: ) return self._compliant_expr._from_call( - func, - "contains", - returns_scalar=self._compliant_expr._returns_scalar, + func, "contains", returns_scalar=self._compliant_expr._returns_scalar ) def slice(self, offset: int, length: int) -> DuckDBExpr: @@ -612,9 +594,7 @@ def func(_input: duckdb.Expression) -> duckdb.Expression: ) return self._compliant_expr._from_call( - func, - "slice", - returns_scalar=self._compliant_expr._returns_scalar, + func, "slice", 
returns_scalar=self._compliant_expr._returns_scalar ) def to_lowercase(self) -> DuckDBExpr: @@ -664,10 +644,7 @@ def replace_all( raise NotImplementedError(msg) return self._compliant_expr._from_call( lambda _input: FunctionExpression( - "replace", - _input, - ConstantExpression(pattern), - ConstantExpression(value), + "replace", _input, ConstantExpression(pattern), ConstantExpression(value) ), "replace_all", returns_scalar=self._compliant_expr._returns_scalar, From 373320ef28d32c99d4b1c39db956c7ba2f732775 Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:16:18 +0100 Subject: [PATCH 24/35] fix: update Spark min version in `utils.py` (#1760) update min version in utils --- narwhals/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/utils.py b/narwhals/utils.py index 591cd53ae..c03642c90 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -295,7 +295,7 @@ def is_ibis(self) -> bool: Implementation.MODIN: (0, 25, 3), Implementation.CUDF: (24, 10), Implementation.PYARROW: (11,), - Implementation.PYSPARK: (3, 3), + Implementation.PYSPARK: (3, 5), Implementation.POLARS: (0, 20, 3), Implementation.DASK: (2024, 8), Implementation.DUCKDB: (1,), From 1f0c7183048c8568cb714b39d8efe8684b5c66f4 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 8 Jan 2025 13:08:46 +0000 Subject: [PATCH 25/35] feat: implement when/then/otherwise for DuckDB (#1759) --- narwhals/_duckdb/expr.py | 2 +- narwhals/_duckdb/namespace.py | 109 +++++++++++++++++++++++++++++ narwhals/_duckdb/utils.py | 3 +- tests/expr_and_series/when_test.py | 26 ++----- tpch/execute.py | 9 +++ 5 files changed, 126 insertions(+), 23 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 0f33ff846..4515cbba1 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -31,7 +31,7 @@ class DuckDBExpr(CompliantExpr["duckdb.Expression"]): def __init__( self, - call: 
Callable[[DuckDBLazyFrame], list[duckdb.Expression]], + call: Callable[[DuckDBLazyFrame], Sequence[duckdb.Expression]], *, depth: int, function_name: str, diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index bcd7eff6d..c91d11d3f 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -7,6 +7,7 @@ from typing import Any from typing import Literal from typing import Sequence +from typing import cast from narwhals._duckdb.expr import DuckDBExpr from narwhals._duckdb.utils import narwhals_to_native_dtype @@ -157,6 +158,16 @@ def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: kwargs={"exprs": exprs}, ) + def when( + self, + *predicates: IntoDuckDBExpr, + ) -> DuckDBWhen: + plx = self.__class__(backend_version=self._backend_version, version=self._version) + condition = plx.all_horizontal(*predicates) + return DuckDBWhen( + condition, self._backend_version, returns_scalar=False, version=self._version + ) + def col(self, *column_names: str) -> DuckDBExpr: return DuckDBExpr.from_column_names( *column_names, backend_version=self._backend_version, version=self._version @@ -203,3 +214,101 @@ def func(_df: DuckDBLazyFrame) -> list[duckdb.Expression]: version=self._version, kwargs={}, ) + + +class DuckDBWhen: + def __init__( + self, + condition: DuckDBExpr, + backend_version: tuple[int, ...], + then_value: Any = None, + otherwise_value: Any = None, + *, + returns_scalar: bool, + version: Version, + ) -> None: + self._backend_version = backend_version + self._condition = condition + self._then_value = then_value + self._otherwise_value = otherwise_value + self._returns_scalar = returns_scalar + self._version = version + + def __call__(self, df: DuckDBLazyFrame) -> Sequence[duckdb.Expression]: + from duckdb import CaseExpression + from duckdb import ConstantExpression + + from narwhals._expression_parsing import parse_into_expr + + plx = df.__narwhals_namespace__() + condition = parse_into_expr(self._condition, 
namespace=plx)(df)[0] + condition = cast("duckdb.Expression", condition) + + try: + value = parse_into_expr(self._then_value, namespace=plx)(df)[0] + except TypeError: + # `self._then_value` is a scalar and can't be converted to an expression + value = ConstantExpression(self._then_value) + value = cast("duckdb.Expression", value) + + if self._otherwise_value is None: + return [CaseExpression(condition=condition, value=value)] + try: + otherwise_expr = parse_into_expr(self._otherwise_value, namespace=plx) + except TypeError: + # `self._otherwise_value` is a scalar and can't be converted to an expression + return [ + CaseExpression(condition=condition, value=value).otherwise( + ConstantExpression(self._otherwise_value) + ) + ] + otherwise = otherwise_expr(df)[0] + return [CaseExpression(condition=condition, value=value).otherwise(otherwise)] + + def then(self, value: DuckDBExpr | Any) -> DuckDBThen: + self._then_value = value + + return DuckDBThen( + self, + depth=0, + function_name="whenthen", + root_names=None, + output_names=None, + returns_scalar=self._returns_scalar, + backend_version=self._backend_version, + version=self._version, + kwargs={"value": value}, + ) + + +class DuckDBThen(DuckDBExpr): + def __init__( + self, + call: DuckDBWhen, + *, + depth: int, + function_name: str, + root_names: list[str] | None, + output_names: list[str] | None, + returns_scalar: bool, + backend_version: tuple[int, ...], + version: Version, + kwargs: dict[str, Any], + ) -> None: + self._backend_version = backend_version + self._version = version + self._call = call + self._depth = depth + self._function_name = function_name + self._root_names = root_names + self._output_names = output_names + self._returns_scalar = returns_scalar + self._kwargs = kwargs + + def otherwise(self, value: DuckDBExpr | Any) -> DuckDBExpr: + # type ignore because we are setting the `_call` attribute to a + # callable object of type `DuckDBWhen`, base class has the attribute as + # only a
`Callable` + self._call._otherwise_value = value # type: ignore[attr-defined] + self._function_name = "whenotherwise" + return self diff --git a/narwhals/_duckdb/utils.py b/narwhals/_duckdb/utils.py index abac2e158..62f126db9 100644 --- a/narwhals/_duckdb/utils.py +++ b/narwhals/_duckdb/utils.py @@ -4,6 +4,7 @@ from functools import lru_cache from typing import TYPE_CHECKING from typing import Any +from typing import Sequence from narwhals.dtypes import DType from narwhals.exceptions import InvalidIntoExprError @@ -76,7 +77,7 @@ def parse_exprs_and_named_exprs( def _columns_from_expr( df: DuckDBLazyFrame, expr: IntoDuckDBExpr -) -> list[duckdb.Expression]: +) -> Sequence[duckdb.Expression]: if isinstance(expr, str): # pragma: no cover from duckdb import ColumnExpression diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index 739b00e2d..94e37aaa3 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -17,9 +17,7 @@ } -def test_when(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_when(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(value=3).alias("a_when")) expected = { @@ -28,9 +26,7 @@ def test_when(constructor: Constructor, request: pytest.FixtureRequest) -> None: assert_equal_data(result, expected) -def test_when_otherwise(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_when_otherwise(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(3).otherwise(6).alias("a_when")) expected = { @@ -39,11 +35,7 @@ def test_when_otherwise(constructor: Constructor, request: pytest.FixtureRequest assert_equal_data(result, expected) -def 
test_multiple_conditions( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_multiple_conditions(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") < 3, nw.col("c") < 5.0).then(3).alias("a_when") @@ -85,11 +77,7 @@ def test_value_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_value_expression( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_value_expression(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(nw.col("a") + 9).alias("a_when")) expected = { @@ -122,11 +110,7 @@ def test_otherwise_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_otherwise_expression( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_otherwise_expression(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") == 1).then(-1).otherwise(nw.col("a") + 7).alias("a_when") diff --git a/tpch/execute.py b/tpch/execute.py index e19b51dfb..1f3823ced 100644 --- a/tpch/execute.py +++ b/tpch/execute.py @@ -5,6 +5,7 @@ from pathlib import Path import dask.dataframe as dd +import duckdb import pandas as pd import polars as pl import pyarrow as pa @@ -29,14 +30,18 @@ "pandas[pyarrow]": (pd, {"engine": "pyarrow", "dtype_backend": "pyarrow"}), "polars[lazy]": (pl, {}), "pyarrow": (pa, {}), + "duckdb": (duckdb, {}), "dask": (dd, {"engine": "pyarrow", "dtype_backend": "pyarrow"}), } BACKEND_COLLECT_FUNC_MAP = { "polars[lazy]": lambda x: x.collect(), + "duckdb": lambda x: x.pl(), "dask": lambda x: 
x.compute(), } +DUCKDB_XFAILS = ["q11", "q14", "q15", "q16", "q18", "q22"] + QUERY_DATA_PATH_MAP = { "q1": (LINEITEM_PATH,), "q2": (REGION_PATH, NATION_PATH, SUPPLIER_PATH, PART_PATH, PARTSUPP_PATH), @@ -90,6 +95,10 @@ def execute_query(query_id: str) -> None: data_paths = QUERY_DATA_PATH_MAP[query_id] for backend, (native_namespace, kwargs) in BACKEND_NAMESPACE_KWARGS_MAP.items(): + if backend == "duckdb" and query_id in DUCKDB_XFAILS: + print(f"\nSkipping {query_id} for DuckDB") # noqa: T201 + continue + print(f"\nRunning {query_id} with {backend=}") # noqa: T201 result = query_module.query( *( From 5dca2a9890314b93757c79a7f8dfe8be68f7e470 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:44:36 +0100 Subject: [PATCH 26/35] docs: fix `is_between` type hint in signature (#1766) * docs: fix is_between signature type hint * rm default in _dask * test typing --- narwhals/_arrow/expr.py | 7 ++++++- narwhals/_arrow/series.py | 5 ++++- narwhals/_dask/expr.py | 7 +++---- narwhals/_pandas_like/expr.py | 5 ++++- narwhals/_pandas_like/series.py | 5 ++++- narwhals/_spark_like/expr.py | 3 ++- narwhals/expr.py | 4 ++-- narwhals/series.py | 5 ++++- tests/expr_and_series/is_between_test.py | 12 ++++++++++-- tests/spark_like_test.py | 5 ++++- 10 files changed, 43 insertions(+), 15 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 1c0d0734e..df5c95367 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -283,7 +283,12 @@ def is_null(self: Self) -> Self: def is_nan(self: Self) -> Self: return reuse_series_implementation(self, "is_nan") - def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self: + def is_between( + self: Self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], + ) -> Self: return reuse_series_implementation( self, "is_between", diff --git a/narwhals/_arrow/series.py 
b/narwhals/_arrow/series.py index 1e8d09827..193fc25a2 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -490,7 +490,10 @@ def all(self: Self, *, _return_py_scalar: bool = True) -> bool: ) def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], ) -> Self: import pyarrow.compute as pc diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index cb20fa616..40e7eff9c 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -413,10 +413,9 @@ def is_between( self, lower_bound: Self | Any, upper_bound: Self | Any, - closed: str = "both", + closed: Literal["left", "right", "none", "both"], ) -> Self: - if closed == "none": - closed = "neither" + closed_ = "neither" if closed == "none" else closed return self._from_call( lambda _input, lower_bound, upper_bound, closed: _input.between( lower_bound, upper_bound, closed @@ -424,7 +423,7 @@ def is_between( "is_between", lower_bound=lower_bound, upper_bound=upper_bound, - closed=closed, + closed=closed_, returns_scalar=self._returns_scalar, ) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 34d05b7eb..c694b3420 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -262,7 +262,10 @@ def clip(self, lower_bound: Any, upper_bound: Any) -> Self: ) def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], ) -> Self: return reuse_series_implementation( self, diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index e5c5e771e..35ec672e4 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -263,7 +263,10 @@ def to_list(self) -> Any: return self._native_series.to_list() def is_between( - self, lower_bound: Any, upper_bound: Any, closed: 
str = "both" + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], ) -> PandasLikeSeries: ser = self._native_series _, lower_bound = broadcast_align_and_extract_native(self, lower_bound) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 66826a6ab..03529ca96 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Callable +from typing import Literal from typing import Sequence from narwhals._expression_parsing import infer_new_root_output_names @@ -276,7 +277,7 @@ def is_between( self, lower_bound: Any, upper_bound: Any, - closed: str, + closed: Literal["left", "right", "none", "both"], ) -> Self: def _is_between(_input: Column, lower_bound: Any, upper_bound: Any) -> Column: if closed == "both": diff --git a/narwhals/expr.py b/narwhals/expr.py index 653300da8..807a7f04b 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1792,10 +1792,10 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: # --- transform --- def is_between( - self, + self: Self, lower_bound: Any | IntoExpr, upper_bound: Any | IntoExpr, - closed: str = "both", + closed: Literal["left", "right", "none", "both"] = "both", ) -> Self: """Check if this expression is between the given lower and upper bounds. diff --git a/narwhals/series.py b/narwhals/series.py index 7b4cfbf6e..8385b43ad 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2605,7 +2605,10 @@ def fill_null( ) def is_between( - self, lower_bound: Any | Self, upper_bound: Any | Self, closed: str = "both" + self: Self, + lower_bound: Any | Self, + upper_bound: Any | Self, + closed: Literal["left", "right", "none", "both"] = "both", ) -> Self: """Get a boolean mask of the values that are between the given lower/upper bounds. 
diff --git a/tests/expr_and_series/is_between_test.py b/tests/expr_and_series/is_between_test.py index 57ad545c0..a24277fa5 100644 --- a/tests/expr_and_series/is_between_test.py +++ b/tests/expr_and_series/is_between_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Literal + import pytest import narwhals.stable.v1 as nw @@ -17,7 +19,11 @@ ("none", [False, True, True, False]), ], ) -def test_is_between(constructor: Constructor, closed: str, expected: list[bool]) -> None: +def test_is_between( + constructor: Constructor, + closed: Literal["left", "right", "none", "both"], + expected: list[bool], +) -> None: data = {"a": [1, 4, 2, 5]} df = nw.from_native(constructor(data)) result = df.select(nw.col("a").is_between(1, 5, closed=closed)) @@ -43,7 +49,9 @@ def test_is_between_expressified(constructor: Constructor) -> None: ], ) def test_is_between_series( - constructor_eager: ConstructorEager, closed: str, expected: list[bool] + constructor_eager: ConstructorEager, + closed: Literal["left", "right", "none", "both"], + expected: list[bool], ) -> None: data = {"a": [1, 4, 2, 5]} df = nw.from_native(constructor_eager(data), eager_only=True) diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index c929f4f85..30610be45 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -9,6 +9,7 @@ from contextlib import nullcontext as does_not_raise from typing import TYPE_CHECKING from typing import Any +from typing import Literal import pandas as pd import pytest @@ -991,7 +992,9 @@ def test_clip(pyspark_constructor: Constructor) -> None: ], ) def test_is_between( - pyspark_constructor: Constructor, closed: str, expected: list[bool] + pyspark_constructor: Constructor, + closed: Literal["left", "right", "none", "both"], + expected: list[bool], ) -> None: data = {"a": [1, 4, 2, 5]} df = nw.from_native(pyspark_constructor(data)) From 40a83e36f41389db59881cc211ed83ec1b6913f6 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi 
<42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 9 Jan 2025 10:11:17 +0100 Subject: [PATCH 27/35] chore: dask nightly (#1768) --- narwhals/_dask/expr.py | 48 +++++++++++++++++++------------------ narwhals/_dask/group_by.py | 19 +++++++++++---- narwhals/_dask/namespace.py | 33 +++++++++++++------------ narwhals/_dask/selectors.py | 7 ++++-- narwhals/_dask/utils.py | 21 ++++++++++------ narwhals/translate.py | 5 +++- 6 files changed, 81 insertions(+), 52 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 40e7eff9c..373c29020 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -23,7 +23,11 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: - import dask_expr + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx + from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame @@ -32,12 +36,12 @@ from narwhals.utils import Version -class DaskExpr(CompliantExpr["dask_expr.Series"]): +class DaskExpr(CompliantExpr["dx.Series"]): _implementation: Implementation = Implementation.DASK def __init__( self, - call: Callable[[DaskLazyFrame], Sequence[dask_expr.Series]], + call: Callable[[DaskLazyFrame], Sequence[dx.Series]], *, depth: int, function_name: str, @@ -60,7 +64,7 @@ def __init__( self._version = version self._kwargs = kwargs - def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: + def __call__(self, df: DaskLazyFrame) -> Sequence[dx.Series]: return self._call(df) def __narwhals_expr__(self) -> None: ... 
@@ -78,7 +82,7 @@ def from_column_names( backend_version: tuple[int, ...], version: Version, ) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: try: return [df._native_frame[column_name] for column_name in column_names] except KeyError as e: @@ -107,7 +111,7 @@ def from_column_indices( backend_version: tuple[int, ...], version: Version, ) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: return [ df._native_frame.iloc[:, column_index] for column_index in column_indices ] @@ -126,14 +130,14 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def _from_call( self, - # First argument to `call` should be `dask_expr.Series` - call: Callable[..., dask_expr.Series], + # First argument to `call` should be `dx.Series` + call: Callable[..., dx.Series], expr_name: str, *, returns_scalar: bool, **kwargs: Any, ) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: results = [] inputs = self._call(df) _kwargs = {key: maybe_evaluate(df, value) for key, value in kwargs.items()} @@ -163,7 +167,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: ) def alias(self, name: str) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: inputs = self._call(df) return [_input.rename(name) for _input in inputs] @@ -312,7 +316,7 @@ def mean(self) -> Self: def median(self) -> Self: from narwhals.exceptions import InvalidOperationError - def func(s: dask_expr.Series) -> dask_expr.Series: + def func(s: dx.Series) -> dx.Series: dtype = native_to_narwhals_dtype(s, self._version, Implementation.DASK) if not dtype.is_numeric(): msg = "`median` operation not supported for non-numeric input type." 
@@ -511,11 +515,11 @@ def fill_null( limit: int | None = None, ) -> DaskExpr: def func( - _input: dask_expr.Series, + _input: dx.Series, value: Any | None, strategy: str | None, limit: int | None, - ) -> dask_expr.Series: + ) -> dx.Series: if value is not None: res_ser = _input.fillna(value) else: @@ -566,7 +570,7 @@ def is_null(self: Self) -> Self: ) def is_nan(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: dtype = native_to_narwhals_dtype(_input, self._version, self._implementation) if dtype.is_numeric(): return _input != _input # noqa: PLR0124 @@ -585,7 +589,7 @@ def quantile( ) -> Self: if interpolation == "linear": - def func(_input: dask_expr.Series, quantile: float) -> dask_expr.Series: + def func(_input: dx.Series, quantile: float) -> dx.Series: if _input.npartitions > 1: msg = "`Expr.quantile` is not supported for Dask backend with multiple partitions." raise NotImplementedError(msg) @@ -599,7 +603,7 @@ def func(_input: dask_expr.Series, quantile: float) -> dask_expr.Series: raise NotImplementedError(msg) def is_first_distinct(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) _input = add_row_index( @@ -618,7 +622,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: ) def is_last_distinct(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) _input = add_row_index( @@ -635,7 +639,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: ) def is_duplicated(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name return ( _input.to_frame() @@ -647,7 +651,7 @@ def 
func(_input: dask_expr.Series) -> dask_expr.Series: return self._from_call(func, "is_duplicated", returns_scalar=self._returns_scalar) def is_unique(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name return ( _input.to_frame() @@ -967,7 +971,7 @@ def replace_time_zone(self, time_zone: str | None) -> DaskExpr: ) def convert_time_zone(self, time_zone: str) -> DaskExpr: - def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: + def func(s: dx.Series, time_zone: str) -> dx.Series: dtype = native_to_narwhals_dtype( s, self._compliant_expr._version, Implementation.DASK ) @@ -984,9 +988,7 @@ def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: ) def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: - def func( - s: dask_expr.Series, time_unit: Literal["ns", "us", "ms"] = "us" - ) -> dask_expr.Series: + def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"] = "us") -> dx.Series: dtype = native_to_narwhals_dtype( s, self._compliant_expr._version, Implementation.DASK ) diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 243b21b71..60086efa2 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -12,7 +12,12 @@ if TYPE_CHECKING: import dask.dataframe as dd - import dask_expr + + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx + import pandas as pd from narwhals._dask.dataframe import DaskLazyFrame @@ -43,7 +48,10 @@ def var( ]: from functools import partial - import dask_expr as dx + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx return partial(dx._groupby.GroupBy.var, ddof=ddof) @@ -55,7 +63,10 @@ def std( ]: from functools import partial - import dask_expr as dx + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx return 
partial(dx._groupby.GroupBy.std, ddof=ddof) @@ -127,7 +138,7 @@ def _from_native_frame(self, df: DaskLazyFrame) -> DaskLazyFrame: def agg_dask( df: DaskLazyFrame, grouped: Any, - exprs: Sequence[CompliantExpr[dask_expr.Series]], + exprs: Sequence[CompliantExpr[dx.Series]], keys: list[str], from_dataframe: Callable[[Any], DaskLazyFrame], ) -> DaskLazyFrame: diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 9a16d7f13..d8b2b7a9a 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -21,14 +21,17 @@ from narwhals.typing import CompliantNamespace if TYPE_CHECKING: - import dask_expr + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx from narwhals._dask.typing import IntoDaskExpr from narwhals.dtypes import DType from narwhals.utils import Version -class DaskNamespace(CompliantNamespace["dask_expr.Series"]): +class DaskNamespace(CompliantNamespace["dx.Series"]): @property def selectors(self) -> DaskSelectorNamespace: return DaskSelectorNamespace( @@ -40,7 +43,7 @@ def __init__(self, *, backend_version: tuple[int, ...], version: Version) -> Non self._version = version def all(self) -> DaskExpr: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: return [df._native_frame[column_name] for column_name in df.columns] return DaskExpr( @@ -69,7 +72,7 @@ def lit(self, value: Any, dtype: DType | None) -> DaskExpr: import dask.dataframe as dd import pandas as pd - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: return [ dd.from_pandas( pd.Series( @@ -99,7 +102,7 @@ def len(self) -> DaskExpr: import dask.dataframe as dd import pandas as pd - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: if not df.columns: return [ dd.from_pandas( @@ -125,7 +128,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def 
all_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [reduce(lambda x, y: x & y, series).rename(series[0].name)] @@ -144,7 +147,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def any_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [reduce(lambda x, y: x | y, series).rename(series[0].name)] @@ -163,7 +166,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def sum_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s.fillna(0) for _expr in parsed_exprs for s in _expr(df)] return [reduce(lambda x, y: x + y, series).rename(series[0].name)] @@ -239,7 +242,7 @@ def concat( def mean_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = (s.fillna(0) for _expr in parsed_exprs for s in _expr(df)) non_na = (1 - s.isna() for _expr in parsed_exprs for s in _expr(df)) return [ @@ -266,7 +269,7 @@ def min_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [dd.concat(series, axis=1).min(axis=1).rename(series[0].name)] @@ -288,7 +291,7 @@ def 
max_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [dd.concat(series, axis=1).max(axis=1).rename(series[0].name)] @@ -327,7 +330,7 @@ def concat_str( *parse_into_exprs(*more_exprs, namespace=self), ] - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = (s.astype(str) for _expr in parsed_exprs for s in _expr(df)) null_mask = [s for _expr in parsed_exprs for s in _expr.is_null()(df)] @@ -389,12 +392,12 @@ def __init__( self._returns_scalar = returns_scalar self._version = version - def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: + def __call__(self, df: DaskLazyFrame) -> Sequence[dx.Series]: from narwhals._expression_parsing import parse_into_expr plx = df.__narwhals_namespace__() condition = parse_into_expr(self._condition, namespace=plx)(df)[0] - condition = cast("dask_expr.Series", condition) + condition = cast("dx.Series", condition) try: value_series = parse_into_expr(self._then_value, namespace=plx)(df)[0] except TypeError: @@ -402,7 +405,7 @@ def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: _df = condition.to_frame("a") _df["tmp"] = self._then_value value_series = _df["tmp"] - value_series = cast("dask_expr.Series", value_series) + value_series = cast("dx.Series", value_series) validate_comparand(condition, value_series) if self._otherwise_value is None: diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 2891d84ff..703e24860 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -8,7 +8,10 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: - import dask_expr + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx from 
typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame @@ -135,7 +138,7 @@ def call(df: DaskLazyFrame) -> list[Any]: def __or__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: if isinstance(other, DaskSelector): - def call(df: DaskLazyFrame) -> list[dask_expr.Series]: + def call(df: DaskLazyFrame) -> list[dx.Series]: lhs = self._call(df) rhs = other._call(df) return [*(x for x in lhs if x.name not in {x.name for x in rhs}), *rhs] diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 4f2952d0b..cd303d8ec 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -14,7 +14,11 @@ if TYPE_CHECKING: import dask.dataframe as dd - import dask_expr + + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr @@ -42,7 +46,7 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any: def parse_exprs_and_named_exprs( df: DaskLazyFrame, *exprs: Any, **named_exprs: Any -) -> dict[str, dask_expr.Series]: +) -> dict[str, dx.Series]: results = {} for expr in exprs: if hasattr(expr, "__narwhals_expr__"): @@ -82,10 +86,13 @@ def add_row_index( ) -def validate_comparand(lhs: dask_expr.Series, rhs: dask_expr.Series) -> None: - import dask_expr +def validate_comparand(lhs: dx.Series, rhs: dx.Series) -> None: + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx - if not dask_expr._expr.are_co_aligned(lhs._expr, rhs._expr): # pragma: no cover + if not dx._expr.are_co_aligned(lhs._expr, rhs._expr): # pragma: no cover # are_co_aligned is a method which cheaply checks if two Dask expressions # have the same index, and therefore don't require index alignment. 
# If someone only operates on a Dask DataFrame via expressions, then this @@ -154,11 +161,11 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> An raise AssertionError(msg) -def name_preserving_sum(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: +def name_preserving_sum(s1: dx.Series, s2: dx.Series) -> dx.Series: return (s1 + s2).rename(s1.name) -def name_preserving_div(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: +def name_preserving_div(s1: dx.Series, s2: dx.Series) -> dx.Series: return (s1 / s2).rename(s1.name) diff --git a/narwhals/translate.py b/narwhals/translate.py index 8d0805a26..9ad868016 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -684,7 +684,10 @@ def _from_native_impl( # noqa: PLR0915 msg = "Cannot only use `eager_only` or `eager_or_interchange_only` with dask DataFrame" raise TypeError(msg) return native_object - if get_dask_expr() is None: # pragma: no cover + if ( + parse_version(get_dask().__version__) <= (2024, 12, 1) + and get_dask_expr() is None + ): # pragma: no cover msg = "Please install dask-expr" raise ImportError(msg) return LazyFrame( From 36dacf91886d67333f6127ebb70cd2f5bdeeeea4 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 9 Jan 2025 09:22:13 +0000 Subject: [PATCH 28/35] feat: implement `n_unique` for DuckDB (#1762) --- narwhals/_duckdb/expr.py | 22 ++++++++++++++++++++++ tests/expr_and_series/n_unique_test.py | 6 +----- tests/expr_and_series/unary_test.py | 6 +----- tests/group_by_test.py | 8 +------- tpch/execute.py | 2 +- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 4515cbba1..e5e612085 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -408,6 +408,28 @@ def sum(self) -> Self: lambda _input: FunctionExpression("sum", _input), "sum", returns_scalar=True ) + def n_unique(self) -> Self: + from duckdb import CaseExpression + from duckdb 
import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + # https://stackoverflow.com/a/79338887/4451315 + return FunctionExpression( + "array_unique", FunctionExpression("array_agg", _input) + ) + FunctionExpression( + "max", + CaseExpression( + condition=_input.isnotnull(), value=ConstantExpression(0) + ).otherwise(ConstantExpression(1)), + ) + + return self._from_call( + func, + "n_unique", + returns_scalar=True, + ) + def count(self) -> Self: from duckdb import FunctionExpression diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py index d8e4d9b77..90bffb04b 100644 --- a/tests/expr_and_series/n_unique_test.py +++ b/tests/expr_and_series/n_unique_test.py @@ -1,7 +1,5 @@ from __future__ import annotations -import pytest - import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -13,9 +11,7 @@ } -def test_n_unique(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_n_unique(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) result = df.select(nw.all().n_unique()) expected = {"a": [3], "b": [4]} diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index 9ee38a230..f3e01d80f 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -126,11 +126,7 @@ def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_unary_one_element( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_unary_one_element(constructor: Constructor) -> None: data = {"a": [1], "b": [2], "c": [None]} # Dask runs into a divide by zero RuntimeWarning for 1 element skew. 
context = ( diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 0dd6d8a10..c854da453 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -115,8 +115,6 @@ def test_group_by_depth_1_agg( expected: dict[str, list[int | float]], request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor) and attr == "n_unique": - request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and attr == "var" and PANDAS_VERSION < (2, 1): # Known issue with variance calculation in pandas 2.0.x with pyarrow backend in groupby operations" request.applymarker(pytest.mark.xfail) @@ -166,11 +164,7 @@ def test_group_by_median(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_group_by_n_unique_w_missing( - constructor: Constructor, request: pytest.FixtureRequest -) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_group_by_n_unique_w_missing(constructor: Constructor) -> None: data = {"a": [1, 1, 2], "b": [4, None, 5], "c": [None, None, 7], "d": [1, 1, 3]} result = ( nw.from_native(constructor(data)) diff --git a/tpch/execute.py b/tpch/execute.py index 1f3823ced..f2f3041df 100644 --- a/tpch/execute.py +++ b/tpch/execute.py @@ -40,7 +40,7 @@ "dask": lambda x: x.compute(), } -DUCKDB_XFAILS = ["q11", "q14", "q15", "q16", "q18", "q22"] +DUCKDB_XFAILS = ["q11", "q14", "q15", "q18", "q22"] QUERY_DATA_PATH_MAP = { "q1": (LINEITEM_PATH,), From 145e4dea680d0e0625adee12b8d1d9b4c6c8b5b4 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 9 Jan 2025 09:34:23 +0000 Subject: [PATCH 29/35] feat: semi-join for duckdb (#1767) --- narwhals/_duckdb/dataframe.py | 17 ++++++++++------- tests/frame/join_test.py | 3 --- tpch/execute.py | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 76ff68ae0..2ff0e085a 100644 --- a/narwhals/_duckdb/dataframe.py +++ 
b/narwhals/_duckdb/dataframe.py @@ -225,7 +225,7 @@ def join( if isinstance(right_on, str): right_on = [right_on] - if how not in ("inner", "left"): + if how not in ("inner", "left", "semi"): msg = "Only inner and left join is implemented for DuckDB" raise NotImplementedError(msg) @@ -242,12 +242,15 @@ def join( other._native_frame.set_alias("rhs"), condition=condition, how=how ) - select = [f"lhs.{x}" for x in self._native_frame.columns] - for col in other._native_frame.columns: - if col in self._native_frame.columns and col not in right_on: - select.append(f"rhs.{col} as {col}{suffix}") - elif col not in right_on: - select.append(col) + if how in ("inner", "left"): + select = [f"lhs.{x}" for x in self._native_frame.columns] + for col in other._native_frame.columns: + if col in self._native_frame.columns and col not in right_on: + select.append(f"rhs.{col} as {col}{suffix}") + elif col not in right_on: + select.append(col) + else: # semi + select = [f"lhs.{x}" for x in self._native_frame.columns] res = rel.select(", ".join(select)).set_alias(original_alias) return self._from_native_frame(res) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 7332cb254..242696394 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -206,10 +206,7 @@ def test_semi_join( join_key: list[str], filter_expr: nw.Expr, expected: dict[str, list[Any]], - request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) diff --git a/tpch/execute.py b/tpch/execute.py index f2f3041df..ea4cc3a8a 100644 --- a/tpch/execute.py +++ b/tpch/execute.py @@ -40,7 +40,7 @@ "dask": lambda x: x.compute(), } -DUCKDB_XFAILS = ["q11", "q14", "q15", "q18", "q22"] +DUCKDB_SKIPS = ["q11", "q14", "q15", "q22"] QUERY_DATA_PATH_MAP = { "q1": (LINEITEM_PATH,), @@ -95,7 +95,7 @@ def 
execute_query(query_id: str) -> None: data_paths = QUERY_DATA_PATH_MAP[query_id] for backend, (native_namespace, kwargs) in BACKEND_NAMESPACE_KWARGS_MAP.items(): - if backend == "duckdb" and query_id in DUCKDB_XFAILS: + if backend == "duckdb" and query_id in DUCKDB_SKIPS: print(f"\nSkipping {query_id} for DuckDB") # noqa: T201 continue From 8f0cf50a1a3d1d44560018a039a2d560495aa3bc Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 9 Jan 2025 12:41:31 +0000 Subject: [PATCH 30/35] fix: parse_version was not parsing duckdb pre-preleases correctly (#1763) --- narwhals/utils.py | 9 +++++---- tests/utils_test.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/narwhals/utils.py b/narwhals/utils.py index c03642c90..509a0e36a 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -372,7 +372,7 @@ def _is_iterable(arg: Any | Iterable[Any]) -> bool: return isinstance(arg, Iterable) and not isinstance(arg, (str, bytes, Series)) -def parse_version(version: Sequence[str | int]) -> tuple[int, ...]: +def parse_version(version: str) -> tuple[int, ...]: """Simple version parser; split into a tuple of ints for comparison. Arguments: @@ -382,9 +382,10 @@ def parse_version(version: Sequence[str | int]) -> tuple[int, ...]: Parsed version number. """ # lifted from Polars - if isinstance(version, str): # pragma: no cover - version = version.split(".") - return tuple(int(re.sub(r"\D", "", str(v))) for v in version) + # [marco]: Take care of DuckDB pre-releases which end with e.g. `-dev4108` + # and pandas pre-releases which end with e.g. 
.dev0+618.gb552dc95c9 + version = re.sub(r"(\D?dev.*$)", "", version) + return tuple(int(re.sub(r"\D", "", str(v))) for v in version.split(".")) def isinstance_or_issubclass(obj: Any, cls: Any) -> bool: diff --git a/tests/utils_test.py b/tests/utils_test.py index 26bd2ecf9..e999696d3 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -13,6 +13,7 @@ from pandas.testing import assert_series_equal import narwhals.stable.v1 as nw +from narwhals.utils import parse_version from tests.utils import PANDAS_VERSION from tests.utils import get_module_version_as_tuple @@ -271,3 +272,15 @@ def test_generate_temporary_column_name_raise() -> None: match="Internal Error: Narwhals was not able to generate a column name with ", ): nw.generate_temporary_column_name(n_bytes=1, columns=columns) + + +@pytest.mark.parametrize( + ("version", "expected"), + [ + ("2020.1.2", (2020, 1, 2)), + ("2020.1.2-dev123", (2020, 1, 2)), + ("3.0.0.dev0+618.gb552dc95c9", (3, 0, 0)), + ], +) +def test_parse_version(version: str, expected: tuple[int, ...]) -> None: + assert parse_version(version) == expected From deee14ce4182e76041fd4daa34de8cc679a85249 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 9 Jan 2025 14:56:24 +0000 Subject: [PATCH 31/35] feat: implement cross-join for duckdb (#1773) --- .github/workflows/check_tpch_queries.yml | 2 +- .github/workflows/extremes.yml | 16 +++++---- narwhals/_duckdb/dataframe.py | 42 +++++++++++++++--------- tests/frame/join_test.py | 5 +-- tests/utils.py | 1 + tpch/execute.py | 2 +- 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 723fa6e80..ce7da6f8e 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -25,7 +25,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: local-install - run: uv pip install -e ".[dev, core, dask]" --system + 
run: uv pip install -U --pre -e ".[dev, core, dask]" --system - name: generate-data run: cd tpch && python generate_data.py - name: tpch-tests diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 47ebc85ea..0e7e6a205 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -158,8 +158,6 @@ jobs: run: | uv pip uninstall pyarrow --system uv pip install --extra-index-url https://pypi.fury.io/arrow-nightlies/ --pre pyarrow --system - - name: show-deps - run: uv pip freeze - name: install numpy nightly run: | uv pip uninstall numpy --system @@ -167,18 +165,22 @@ jobs: - name: install dask run: | uv pip uninstall dask dask-expr --system - python -m pip install git+https://github.com/dask/distributed git+https://github.com/dask/dask git+https://github.com/dask/dask-expr + python -m pip install git+https://github.com/dask/distributed git+https://github.com/dask/dask + - name: install duckdb + run: | + python -m pip install -U --pre duckdb - name: show-deps run: uv pip freeze - name: Assert nightlies dependencies run: | DEPS=$(uv pip freeze) - echo "$DEPS" | grep 'polars' + echo "$DEPS" | grep 'polars.*@' echo "$DEPS" | grep 'pandas.*dev' echo "$DEPS" | grep 'pyarrow.*dev' - echo "$DEPS" | grep 'numpy' - echo "$DEPS" | grep 'dask' + echo "$DEPS" | grep 'numpy.*dev' + echo "$DEPS" | grep 'dask.*@' + echo "$DEPS" | grep 'duckdb.*dev' - name: Run pytest run: | pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow \ - --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,polars[eager],polars[lazy],dask + --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,polars[eager],polars[lazy],dask,duckdb diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 2ff0e085a..e1c0f994c 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -224,30 +224,40 @@ def join( left_on = [left_on] if isinstance(right_on, str): right_on = [right_on] + 
original_alias = self._native_frame.alias - if how not in ("inner", "left", "semi"): + if how not in ("inner", "left", "semi", "cross"): msg = "Only inner and left join is implemented for DuckDB" raise NotImplementedError(msg) - # help mypy - assert left_on is not None # noqa: S101 - assert right_on is not None # noqa: S101 - - conditions = [ - f"lhs.{left} = rhs.{right}" for left, right in zip(left_on, right_on) - ] - original_alias = self._native_frame.alias - condition = " and ".join(conditions) - rel = self._native_frame.set_alias("lhs").join( - other._native_frame.set_alias("rhs"), condition=condition, how=how - ) + if how == "cross": + if self._backend_version < (1, 1, 4): + msg = f"DuckDB>=1.1.4 is required for cross-join, found version: {self._backend_version}" + raise NotImplementedError(msg) + rel = self._native_frame.set_alias("lhs").cross( # pragma: no cover + other._native_frame.set_alias("rhs") + ) + else: + # help mypy + assert left_on is not None # noqa: S101 + assert right_on is not None # noqa: S101 + + conditions = [ + f"lhs.{left} = rhs.{right}" for left, right in zip(left_on, right_on) + ] + condition = " and ".join(conditions) + rel = self._native_frame.set_alias("lhs").join( + other._native_frame.set_alias("rhs"), condition=condition, how=how + ) - if how in ("inner", "left"): + if how in ("inner", "left", "cross"): select = [f"lhs.{x}" for x in self._native_frame.columns] for col in other._native_frame.columns: - if col in self._native_frame.columns and col not in right_on: + if col in self._native_frame.columns and ( + right_on is None or col not in right_on + ): select.append(f"rhs.{col} as {col}{suffix}") - elif col not in right_on: + elif right_on is None or col not in right_on: select.append(col) else: # semi select = [f"lhs.{x}" for x in self._native_frame.columns] diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 242696394..5bf5c91f0 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -10,6 
+10,7 @@ import narwhals.stable.v1 as nw from narwhals.utils import Implementation +from tests.utils import DUCKDB_VERSION from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data @@ -75,7 +76,7 @@ def test_inner_join_single_key(constructor: Constructor) -> None: def test_cross_join(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 1, 4): request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) @@ -117,7 +118,7 @@ def test_suffix(constructor: Constructor, how: str, suffix: str) -> None: def test_cross_join_suffix( constructor: Constructor, suffix: str, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 1, 4): request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) diff --git a/tests/utils.py b/tests/utils.py index 005b4eee2..2d41d6782 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -33,6 +33,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]: IBIS_VERSION: tuple[int, ...] = get_module_version_as_tuple("ibis") NUMPY_VERSION: tuple[int, ...] = get_module_version_as_tuple("numpy") PANDAS_VERSION: tuple[int, ...] = get_module_version_as_tuple("pandas") +DUCKDB_VERSION: tuple[int, ...] = get_module_version_as_tuple("duckdb") POLARS_VERSION: tuple[int, ...] = get_module_version_as_tuple("polars") DASK_VERSION: tuple[int, ...] = get_module_version_as_tuple("dask") PYARROW_VERSION: tuple[int, ...] 
= get_module_version_as_tuple("pyarrow") diff --git a/tpch/execute.py b/tpch/execute.py index ea4cc3a8a..5209ad48e 100644 --- a/tpch/execute.py +++ b/tpch/execute.py @@ -40,7 +40,7 @@ "dask": lambda x: x.compute(), } -DUCKDB_SKIPS = ["q11", "q14", "q15", "q22"] +DUCKDB_SKIPS = ["q14", "q15"] QUERY_DATA_PATH_MAP = { "q1": (LINEITEM_PATH,), From 0f385212dc43aad770b3c75740a0dceaaf67d38f Mon Sep 17 00:00:00 2001 From: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:11:53 +0100 Subject: [PATCH 32/35] feat: add missing dunder methods in `SparkLikeExpr` and `SparkLikeNamespace.lit` (#1708) --- narwhals/_spark_like/dataframe.py | 2 +- narwhals/_spark_like/expr.py | 103 ++++++++++++++++++++++++++-- narwhals/_spark_like/namespace.py | 23 +++++++ tests/spark_like_test.py | 110 ++++++++++++++++++++++++++++++ 4 files changed, 230 insertions(+), 8 deletions(-) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index e04da7f57..e54a05997 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -50,7 +50,7 @@ def __native_namespace__(self) -> Any: # pragma: no cover def __narwhals_namespace__(self) -> SparkLikeNamespace: from narwhals._spark_like.namespace import SparkLikeNamespace - return SparkLikeNamespace( # type: ignore[abstract] + return SparkLikeNamespace( backend_version=self._backend_version, version=self._version ) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 03529ca96..10fb76227 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -59,7 +59,7 @@ def __narwhals_namespace__(self) -> SparkLikeNamespace: # pragma: no cover # Unused, just for compatibility with PandasLikeExpr from narwhals._spark_like.namespace import SparkLikeNamespace - return SparkLikeNamespace( # type: ignore[abstract] + return SparkLikeNamespace( backend_version=self._backend_version, version=self._version ) @@ -123,7 +123,7 @@ def func(df: 
SparkLikeLazyFrame) -> list[Column]: def __add__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input + other, + lambda _input, other: _input.__add__(other), "__add__", other=other, returns_scalar=False, @@ -131,7 +131,7 @@ def __add__(self, other: SparkLikeExpr) -> Self: def __sub__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input - other, + lambda _input, other: _input.__sub__(other), "__sub__", other=other, returns_scalar=False, @@ -139,16 +139,66 @@ def __sub__(self, other: SparkLikeExpr) -> Self: def __mul__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input * other, + lambda _input, other: _input.__mul__(other), "__mul__", other=other, returns_scalar=False, ) - def __lt__(self, other: SparkLikeExpr) -> Self: + def __truediv__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input < other, - "__lt__", + lambda _input, other: _input.__truediv__(other), + "__truediv__", + other=other, + returns_scalar=False, + ) + + def __floordiv__(self, other: SparkLikeExpr) -> Self: + def _floordiv(_input: Column, other: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + return F.floor(_input / other) + + return self._from_call( + _floordiv, "__floordiv__", other=other, returns_scalar=False + ) + + def __pow__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__pow__(other), + "__pow__", + other=other, + returns_scalar=False, + ) + + def __mod__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__mod__(other), + "__mod__", + other=other, + returns_scalar=False, + ) + + def __eq__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input.__eq__(other), + "__eq__", + other=other, + returns_scalar=False, + ) + + def __ne__(self, other: SparkLikeExpr) 
-> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input.__ne__(other), + "__ne__", + other=other, + returns_scalar=False, + ) + + def __ge__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__ge__(other), + "__ge__", other=other, returns_scalar=False, ) @@ -161,6 +211,45 @@ def __gt__(self, other: SparkLikeExpr) -> Self: returns_scalar=False, ) + def __le__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__le__(other), + "__le__", + other=other, + returns_scalar=False, + ) + + def __lt__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__lt__(other), + "__lt__", + other=other, + returns_scalar=False, + ) + + def __and__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__and__(other), + "__and__", + other=other, + returns_scalar=False, + ) + + def __or__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__or__(other), + "__or__", + other=other, + returns_scalar=False, + ) + + def __invert__(self) -> Self: + return self._from_call( + lambda _input: _input.__invert__(), + "__invert__", + returns_scalar=self._returns_scalar, + ) + def abs(self) -> Self: from pyspark.sql import functions as F # noqa: N812 diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index d34867b00..56cc4d271 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -16,6 +16,7 @@ from narwhals._spark_like.dataframe import SparkLikeLazyFrame from narwhals._spark_like.typing import IntoSparkLikeExpr + from narwhals.dtypes import DType from narwhals.utils import Version @@ -67,6 +68,28 @@ def col(self, *column_names: str) -> SparkLikeExpr: *column_names, backend_version=self._backend_version, version=self._version ) + def lit(self, value: object, dtype: DType | None) -> 
SparkLikeExpr: + if dtype is not None: + msg = "todo" + raise NotImplementedError(msg) + + def _lit(_: SparkLikeLazyFrame) -> list[Column]: + import pyspark.sql.functions as F # noqa: N812 + + return [F.lit(value).alias("literal")] + + return SparkLikeExpr( # type: ignore[abstract] + call=_lit, + depth=0, + function_name="lit", + root_names=None, + output_names=["literal"], + returns_scalar=True, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + def sum_horizontal(self, *exprs: IntoSparkLikeExpr) -> SparkLikeExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py index 30610be45..f7cd9e6a9 100644 --- a/tests/spark_like_test.py +++ b/tests/spark_like_test.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: from pyspark.sql import SparkSession + from narwhals.dtypes import DType from narwhals.typing import IntoFrame from tests.utils import Constructor @@ -954,6 +955,53 @@ def test_left_join_overlapping_column(pyspark_constructor: Constructor) -> None: assert_equal_data(result, expected) +# copied from tests/expr_and_series/arithmetic_test.py +@pytest.mark.parametrize( + ("attr", "rhs", "expected"), + [ + ("__add__", 1, [2, 3, 4]), + ("__sub__", 1, [0, 1, 2]), + ("__mul__", 2, [2, 4, 6]), + ("__truediv__", 2.0, [0.5, 1.0, 1.5]), + ("__truediv__", 1, [1, 2, 3]), + ("__floordiv__", 2, [0, 1, 1]), + ("__mod__", 2, [1, 0, 1]), + ("__pow__", 2, [1, 4, 9]), + ], +) +def test_arithmetic_expr( + attr: str, rhs: Any, expected: list[Any], pyspark_constructor: Constructor +) -> None: + data = {"a": [1.0, 2, 3]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select(getattr(nw.col("a"), attr)(rhs)) + assert_equal_data(result, {"a": expected}) + + +@pytest.mark.parametrize( + ("attr", "rhs", "expected"), + [ + ("__radd__", 1, [2, 3, 4]), + ("__rsub__", 1, [0, -1, -2]), + ("__rmul__", 2, [2, 4, 6]), + ("__rtruediv__", 2.0, [2, 1, 2 / 3]), + ("__rfloordiv__", 2, [2, 1, 0]), + 
("__rmod__", 2, [0, 0, 2]), + ("__rpow__", 2, [2, 4, 8]), + ], +) +def test_right_arithmetic_expr( + attr: str, + rhs: Any, + expected: list[Any], + pyspark_constructor: Constructor, +) -> None: + data = {"a": [1, 2, 3]} + df = nw.from_native(pyspark_constructor(data)) + result = df.select(getattr(nw.col("a"), attr)(rhs)) + assert_equal_data(result, {"literal": expected}) + + # Copied from tests/expr_and_series/median_test.py def test_median(pyspark_constructor: Constructor) -> None: data = {"a": [3, 8, 2, None], "b": [5, 5, None, 7], "z": [7.0, 8, 9, None]} @@ -1099,3 +1147,65 @@ def test_skew( df = nw.from_native(pyspark_constructor({"a": data})) result = df.select(skew=nw.col("a").skew()) assert_equal_data(result, {"skew": [expected]}) + + +# copied from tests/expr_and_series/list_test.py +@pytest.mark.parametrize( + ("dtype", "expected_lit"), + [(None, [2, 2, 2]), (nw.String, ["2", "2", "2"]), (nw.Float32, [2.0, 2.0, 2.0])], +) +def test_lit( + pyspark_constructor: Constructor, + dtype: DType | None, + expected_lit: list[Any], + request: pytest.FixtureRequest, +) -> None: + if dtype is not None: + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + df_raw = pyspark_constructor(data) + df = nw.from_native(df_raw).lazy() + result = df.with_columns(nw.lit(2, dtype).alias("lit")) + expected = { + "a": [1, 3, 2], + "b": [4, 4, 6], + "z": [7.0, 8.0, 9.0], + "lit": expected_lit, + } + assert_equal_data(result, expected) + + +@pytest.mark.parametrize( + ("col_name", "expr", "expected_result"), + [ + ("left_lit", nw.lit(1) + nw.col("a"), [2, 4, 3]), + ("right_lit", nw.col("a") + nw.lit(1), [2, 4, 3]), + ("left_lit_with_agg", nw.lit(1) + nw.col("a").mean(), [3]), + ("right_lit_with_agg", nw.col("a").mean() - nw.lit(1), [1]), + ("left_scalar", 1 + nw.col("a"), [2, 4, 3]), + ("right_scalar", nw.col("a") + 1, [2, 4, 3]), + ("left_scalar_with_agg", 1 + nw.col("a").mean(), [3]), + ("right_scalar_with_agg", nw.col("a").mean() - 
1, [1]), + ], +) +def test_lit_operation( + pyspark_constructor: Constructor, + col_name: str, + expr: nw.Expr, + expected_result: list[int], + request: pytest.FixtureRequest, +) -> None: + if col_name in ( + "left_scalar_with_agg", + "left_lit_with_agg", + "right_lit", + "right_lit_with_agg", + ): + request.applymarker(pytest.mark.xfail) + + data = {"a": [1, 3, 2]} + df_raw = pyspark_constructor(data) + df = nw.from_native(df_raw).lazy() + result = df.select(expr.alias(col_name)) + expected = {col_name: expected_result} + assert_equal_data(result, expected) From 9abe7d25064057351864b8e646466a6cf3a52ee2 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 9 Jan 2025 16:19:13 +0000 Subject: [PATCH 33/35] fix: duckdb join was failing if column names contained spaces (#1775) * fix: duckdb column names with spaces * test * sort was raising too too --- narwhals/_duckdb/dataframe.py | 12 +++++----- tests/frame/join_test.py | 44 +++++++++++++++++------------------ tests/frame/sort_test.py | 16 ++++++------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index e1c0f994c..33cfc19d2 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -243,7 +243,7 @@ def join( assert right_on is not None # noqa: S101 conditions = [ - f"lhs.{left} = rhs.{right}" for left, right in zip(left_on, right_on) + f'lhs."{left}" = rhs."{right}"' for left, right in zip(left_on, right_on) ] condition = " and ".join(conditions) rel = self._native_frame.set_alias("lhs").join( @@ -251,16 +251,16 @@ def join( ) if how in ("inner", "left", "cross"): - select = [f"lhs.{x}" for x in self._native_frame.columns] + select = [f'lhs."{x}"' for x in self._native_frame.columns] for col in other._native_frame.columns: if col in self._native_frame.columns and ( right_on is None or col not in right_on ): - select.append(f"rhs.{col} as {col}{suffix}") + select.append(f'rhs."{col}" as "{col}{suffix}"') 
elif right_on is None or col not in right_on: select.append(col) else: # semi - select = [f"lhs.{x}" for x in self._native_frame.columns] + select = ["lhs.*"] res = rel.select(", ".join(select)).set_alias(original_alias) return self._from_native_frame(res) @@ -317,9 +317,9 @@ def sort( result = self._native_frame.order( ",".join( ( - f"{col} {desc} nulls last" + f'"{col}" {desc} nulls last' if nulls_last - else f"{col} {desc} nulls first" + else f'"{col}" {desc} nulls first' for col, desc in zip(flat_by, descending_str) ) ) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 5bf5c91f0..f176aca67 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -20,7 +20,7 @@ def test_inner_join_two_keys(constructor: Constructor) -> None: data = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], + "zor ro": [7.0, 8, 9], "index": [0, 1, 2], } df = nw.from_native(constructor(data)) @@ -37,9 +37,9 @@ def test_inner_join_two_keys(constructor: Constructor) -> None: expected = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], + "zor ro": [7.0, 8, 9], "index": [0, 1, 2], - "zorro_right": [7.0, 8, 9], + "zor ro_right": [7.0, 8, 9], } assert_equal_data(result, expected) assert_equal_data(result_on, expected) @@ -49,7 +49,7 @@ def test_inner_join_single_key(constructor: Constructor) -> None: data = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], + "zor ro": [7.0, 8, 9], "index": [0, 1, 2], } df = nw.from_native(constructor(data)) @@ -66,10 +66,10 @@ def test_inner_join_single_key(constructor: Constructor) -> None: expected = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], + "zor ro": [7.0, 8, 9], "index": [0, 1, 2], "bob_right": [4, 4, 6], - "zorro_right": [7.0, 8, 9], + "zor ro_right": [7.0, 8, 9], } assert_equal_data(result, expected) assert_equal_data(result_on, expected) @@ -99,7 +99,7 @@ def test_suffix(constructor: Constructor, how: str, suffix: str) -> None: 
data = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], + "zor ro": [7.0, 8, 9], } df = nw.from_native(constructor(data)) df_right = df @@ -111,7 +111,7 @@ def test_suffix(constructor: Constructor, how: str, suffix: str) -> None: suffix=suffix, ) result_cols = result.collect_schema().names() - assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"] + assert result_cols == ["antananarivo", "bob", "zor ro", f"zor ro{suffix}"] @pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) @@ -151,13 +151,13 @@ def test_cross_join_non_pandas() -> None: ( ["antananarivo", "bob"], (nw.col("bob") < 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, + {"antananarivo": [2], "bob": [6], "zor ro": [9]}, ), - (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zorro": [9]}), + (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zor ro": [9]}), ( ["bob"], (nw.col("bob") > 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7.0, 8.0]}, + {"antananarivo": [1, 3], "bob": [4, 4], "zor ro": [7.0, 8.0]}, ), ], ) @@ -170,7 +170,7 @@ def test_anti_join( ) -> None: if "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type] @@ -183,22 +183,22 @@ def test_anti_join( ( "antananarivo", (nw.col("bob") > 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, + {"antananarivo": [2], "bob": [6], "zor ro": [9]}, ), ( ["antananarivo"], (nw.col("bob") > 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, + {"antananarivo": [2], "bob": [6], "zor ro": [9]}, ), ( ["bob"], (nw.col("bob") < 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + {"antananarivo": [1, 3], "bob": [4, 4], 
"zor ro": [7, 8]}, ), ( ["antananarivo", "bob"], (nw.col("bob") < 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + {"antananarivo": [1, 3], "bob": [4, 4], "zor ro": [7, 8]}, ), ], ) @@ -208,7 +208,7 @@ def test_semi_join( filter_expr: nw.Expr, expected: dict[str, list[Any]], ) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) result = df.join(other, how="semi", left_on=join_key, right_on=join_key).sort( # type: ignore[arg-type] @@ -219,7 +219,7 @@ def test_semi_join( @pytest.mark.parametrize("how", ["right", "full"]) def test_join_not_implemented(constructor: Constructor, how: str) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -333,7 +333,7 @@ def test_left_join_overlapping_column(constructor: Constructor) -> None: @pytest.mark.parametrize("how", ["inner", "left", "semi", "anti"]) def test_join_keys_exceptions(constructor: Constructor, how: str) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -538,7 +538,7 @@ def test_joinasof_by( def test_joinasof_not_implemented( constructor: Constructor, strategy: Literal["backward", "forward"] ) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -554,7 +554,7 @@ def test_joinasof_not_implemented( def test_joinasof_keys_exceptions(constructor: Constructor) -> None: - data = {"antananarivo": [1, 3, 2], "bob": 
[4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -595,7 +595,7 @@ def test_joinasof_keys_exceptions(constructor: Constructor) -> None: def test_joinasof_by_exceptions(constructor: Constructor) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( ValueError, diff --git a/tests/frame/sort_test.py b/tests/frame/sort_test.py index 5147c6f56..1ce3414c8 100644 --- a/tests/frame/sort_test.py +++ b/tests/frame/sort_test.py @@ -8,18 +8,18 @@ def test_sort(constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} + data = {"an tan": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) - result = df.sort("a", "b") + result = df.sort("an tan", "b") expected = { - "a": [1, 2, 3], + "an tan": [1, 2, 3], "b": [4, 6, 4], "z": [7.0, 9.0, 8.0], } assert_equal_data(result, expected) - result = df.sort("a", "b", descending=[True, False]) + result = df.sort("an tan", "b", descending=[True, False]) expected = { - "a": [3, 2, 1], + "an tan": [3, 2, 1], "b": [4, 6, 4], "z": [8.0, 9.0, 7.0], } @@ -29,14 +29,14 @@ def test_sort(constructor: Constructor) -> None: @pytest.mark.parametrize( ("nulls_last", "expected"), [ - (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, None]}), - (False, {"a": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}), + (True, {"antan desc": [0, 2, 0, -1], "b": [3, 2, 1, None]}), + (False, {"antan desc": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}), ], ) def test_sort_nulls( constructor: Constructor, *, nulls_last: bool, expected: dict[str, float] ) -> None: - data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} + data = {"antan desc": [0, 0, 2, -1], "b": [1, 3, 2, None]} df = nw.from_native(constructor(data)) result = df.sort("b", 
descending=True, nulls_last=nulls_last) assert_equal_data(result, expected) From ab21e72c464f0ef04ba323b6970e3d03a2af548e Mon Sep 17 00:00:00 2001 From: Marvin Lopez Date: Thu, 9 Jan 2025 11:47:31 -0500 Subject: [PATCH 34/35] docs: Increase width for content (#1769) * Increased width for content. * Added left margin. --- docs/css/extra.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/css/extra.css b/docs/css/extra.css index 6d19bea30..9aee280d0 100644 --- a/docs/css/extra.css +++ b/docs/css/extra.css @@ -2,3 +2,7 @@ .md-typeset ul li { margin-bottom: 0.1em !important; } +.md-main__inner.md-grid { + max-width: initial; + margin-left: 5vw; +} From 20eb53b7548e412a652b291f7133ed8a544ac274 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 9 Jan 2025 17:54:03 +0100 Subject: [PATCH 35/35] chore: move pyspark tests into main test suite (#1761) * chore: move pyspark tests into main test suite * delay call to pyspark constructor * xfail from_dict, from_numpy * one more * feedback and tests * missing condition to xfail * move warnings to pyproject * statement order? 
* pragma no cover branch --------- Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- .github/workflows/pytest.yml | 4 + narwhals/_spark_like/dataframe.py | 13 +- narwhals/_spark_like/expr.py | 32 +- narwhals/_spark_like/group_by.py | 17 +- pyproject.toml | 5 + tests/conftest.py | 42 +- tests/expr_and_series/all_horizontal_test.py | 2 +- tests/expr_and_series/any_all_test.py | 7 +- tests/expr_and_series/any_horizontal_test.py | 10 +- tests/expr_and_series/arithmetic_test.py | 1 - tests/expr_and_series/binary_test.py | 4 +- tests/expr_and_series/cast_test.py | 8 +- tests/expr_and_series/concat_str_test.py | 2 +- .../expr_and_series/convert_time_zone_test.py | 2 + tests/expr_and_series/cum_count_test.py | 2 +- tests/expr_and_series/cum_max_test.py | 2 +- tests/expr_and_series/cum_min_test.py | 2 +- tests/expr_and_series/cum_prod_test.py | 2 +- tests/expr_and_series/cum_sum_test.py | 2 +- .../dt/datetime_attributes_test.py | 3 + .../dt/datetime_duration_test.py | 2 +- tests/expr_and_series/dt/to_string_test.py | 6 +- tests/expr_and_series/fill_null_test.py | 8 +- tests/expr_and_series/is_duplicated_test.py | 4 +- tests/expr_and_series/is_finite_test.py | 2 +- .../expr_and_series/is_first_distinct_test.py | 2 +- .../expr_and_series/is_last_distinct_test.py | 2 +- tests/expr_and_series/is_nan_test.py | 4 +- tests/expr_and_series/is_null_test.py | 7 +- tests/expr_and_series/is_unique_test.py | 4 +- tests/expr_and_series/len_test.py | 5 +- tests/expr_and_series/list/len_test.py | 4 +- tests/expr_and_series/lit_test.py | 17 +- tests/expr_and_series/max_horizontal_test.py | 12 +- tests/expr_and_series/mean_horizontal_test.py | 4 +- tests/expr_and_series/median_test.py | 2 +- tests/expr_and_series/min_horizontal_test.py | 12 +- tests/expr_and_series/n_unique_test.py | 6 +- tests/expr_and_series/name/keep_test.py | 19 +- tests/expr_and_series/name/map_test.py | 19 +- tests/expr_and_series/name/prefix_test.py | 19 +- 
tests/expr_and_series/name/suffix_test.py | 19 +- .../expr_and_series/name/to_lowercase_test.py | 19 +- tests/expr_and_series/nth_test.py | 2 +- tests/expr_and_series/null_count_test.py | 2 +- tests/expr_and_series/operators_test.py | 14 +- tests/expr_and_series/over_test.py | 18 +- tests/expr_and_series/quantile_test.py | 2 +- tests/expr_and_series/reduction_test.py | 26 +- tests/expr_and_series/replace_strict_test.py | 6 +- .../expr_and_series/replace_time_zone_test.py | 2 + tests/expr_and_series/shift_test.py | 2 +- tests/expr_and_series/str/contains_test.py | 16 +- tests/expr_and_series/str/head_test.py | 7 +- tests/expr_and_series/str/len_chars_test.py | 2 +- tests/expr_and_series/str/replace_test.py | 6 +- tests/expr_and_series/str/slice_test.py | 9 +- .../str/starts_with_ends_with_test.py | 12 +- tests/expr_and_series/str/strip_chars_test.py | 7 +- tests/expr_and_series/str/tail_test.py | 6 +- tests/expr_and_series/str/to_datetime_test.py | 6 +- .../str/to_uppercase_to_lowercase_test.py | 10 +- tests/expr_and_series/unary_test.py | 10 +- tests/expr_and_series/when_test.py | 30 +- tests/frame/clone_test.py | 2 +- tests/frame/concat_test.py | 11 +- tests/frame/explode_test.py | 8 +- tests/frame/gather_every_test.py | 2 +- tests/frame/join_test.py | 68 +- tests/frame/select_test.py | 4 +- tests/frame/tail_test.py | 5 +- tests/frame/unique_test.py | 7 + tests/frame/unpivot_test.py | 26 +- tests/frame/with_columns_test.py | 2 +- tests/frame/with_row_index_test.py | 2 +- tests/from_dict_test.py | 6 +- tests/from_numpy_test.py | 10 +- tests/group_by_test.py | 24 +- tests/read_scan_test.py | 12 + tests/selectors_test.py | 12 +- tests/spark_like_test.py | 1211 ----------------- tests/stable_api_test.py | 22 +- 82 files changed, 556 insertions(+), 1433 deletions(-) delete mode 100644 tests/spark_like_test.py diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index c89ab2cd7..bb46b4f0d 100644 --- a/.github/workflows/pytest.yml +++ 
b/.github/workflows/pytest.yml @@ -50,6 +50,10 @@ jobs: cache-dependency-glob: "pyproject.toml" - name: install-reqs run: uv pip install -e ".[dev, core, extra, dask, modin]" --system + - name: install pyspark + run: uv pip install -e ".[pyspark]" --system + # PySpark is not yet available on Python3.12+ + if: matrix.python-version != '3.12' - name: show-deps run: uv pip freeze - name: Run pytest diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index e54a05997..101d5ad24 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -1,5 +1,6 @@ from __future__ import annotations +from itertools import chain from typing import TYPE_CHECKING from typing import Any from typing import Iterable @@ -8,6 +9,7 @@ from narwhals._spark_like.utils import native_to_narwhals_dtype from narwhals._spark_like.utils import parse_exprs_and_named_exprs +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import parse_columns_to_drop @@ -106,9 +108,11 @@ def select( new_columns_list = [col.alias(col_name) for col_name, col in new_columns.items()] return self._from_native_frame(self._native_frame.select(*new_columns_list)) - def filter(self, *predicates: SparkLikeExpr) -> Self: + def filter(self, *predicates: SparkLikeExpr, **constraints: Any) -> Self: plx = self.__narwhals_namespace__() - expr = plx.all_horizontal(*predicates) + expr = plx.all_horizontal( + *chain(predicates, (plx.col(name) == v for name, v in constraints.items())) + ) # `[0]` is safe as all_horizontal's expression only returns a single column condition = expr._call(self)[0] spark_df = self._native_frame.where(condition) @@ -203,6 +207,11 @@ def unique( if keep != "any": msg = "`LazyFrame.unique` with PySpark backend only supports `keep='any'`." 
raise ValueError(msg) + + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) + subset = [subset] if isinstance(subset, str) else subset return self._from_native_frame(self._native_frame.dropDuplicates(subset=subset)) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 10fb76227..a8cafccfd 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -121,6 +121,22 @@ def func(df: SparkLikeLazyFrame) -> list[Column]: kwargs=kwargs, ) + def __eq__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input.__eq__(other), + "__eq__", + other=other, + returns_scalar=False, + ) + + def __ne__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input.__ne__(other), + "__ne__", + other=other, + returns_scalar=False, + ) + def __add__(self, other: SparkLikeExpr) -> Self: return self._from_call( lambda _input, other: _input.__add__(other), @@ -179,22 +195,6 @@ def __mod__(self, other: SparkLikeExpr) -> Self: returns_scalar=False, ) - def __eq__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] - return self._from_call( - lambda _input, other: _input.__eq__(other), - "__eq__", - other=other, - returns_scalar=False, - ) - - def __ne__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] - return self._from_call( - lambda _input, other: _input.__ne__(other), - "__ne__", - other=other, - returns_scalar=False, - ) - def __ge__(self, other: SparkLikeExpr) -> Self: return self._from_call( lambda _input, other: _input.__ge__(other), diff --git a/narwhals/_spark_like/group_by.py b/narwhals/_spark_like/group_by.py index 0100500ff..cbcf87692 100644 --- a/narwhals/_spark_like/group_by.py +++ b/narwhals/_spark_like/group_by.py @@ -80,6 +80,8 @@ def _from_native_frame(self, df: SparkLikeLazyFrame) 
-> SparkLikeLazyFrame: def get_spark_function(function_name: str, **kwargs: Any) -> Column: + from pyspark.sql import functions as F # noqa: N812 + if function_name in {"std", "var"}: import numpy as np # ignore-banned-import @@ -88,9 +90,15 @@ def get_spark_function(function_name: str, **kwargs: Any) -> Column: ddof=kwargs["ddof"], np_version=parse_version(np.__version__), ) - from pyspark.sql import functions as F # noqa: N812 + elif function_name == "len": + # Use count(*) to count all rows including nulls + def _count(*_args: Any, **_kwargs: Any) -> Column: + return F.count("*") - return getattr(F, function_name) + return _count + + else: + return getattr(F, function_name) def agg_pyspark( @@ -138,10 +146,7 @@ def agg_pyspark( raise AssertionError(msg) function_name = remove_prefix(expr._function_name, "col->") - pyspark_function = POLARS_TO_PYSPARK_AGGREGATIONS.get( - function_name, function_name - ) - agg_func = get_spark_function(pyspark_function, **expr._kwargs) + agg_func = get_spark_function(function_name, **expr._kwargs) simple_aggregations.update( { diff --git a/pyproject.toml b/pyproject.toml index bea188a59..91770923e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,11 +164,16 @@ filterwarnings = [ 'ignore:.*Passing a BlockManager to DataFrame:DeprecationWarning', # This warning was temporarily raised by Polars but then reverted. 
'ignore:.*The default coalesce behavior of left join will change:DeprecationWarning', + 'ignore: unclosed IntoDataFrame: return pa.table(obj) # type: ignore[no-any-return] -@pytest.fixture(scope="session") -def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover +def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]: # pragma: no cover try: from pyspark.sql import SparkSession except ImportError: # pragma: no cover pytest.skip("pyspark is not installed") - return + return None import warnings + from atexit import register - os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" with warnings.catch_warnings(): # The spark session seems to trigger a polars warning. # Polars is imported in the tests, but not used in the spark operations warnings.filterwarnings( "ignore", r"Using fork\(\) can cause Polars", category=RuntimeWarning ) + session = ( SparkSession.builder.appName("unit-tests") .master("local[1]") @@ -155,8 +154,26 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover .config("spark.sql.shuffle.partitions", "2") .getOrCreate() ) - yield session - session.stop() + + register(session.stop) + + def _constructor(obj: Any) -> IntoFrame: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*is_datetime64tz_dtype is deprecated and will be removed in a future version.*", + module="pyspark", + category=DeprecationWarning, + ) + pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index() + return ( # type: ignore[no-any-return] + session.createDataFrame(pd_df) + .repartition(2) + .orderBy("index") + .drop("index") + ) + + return _constructor EAGER_CONSTRUCTORS: dict[str, Callable[[Any], IntoDataFrame]] = { @@ -173,6 +190,7 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover "dask": dask_lazy_p2_constructor, "polars[lazy]": polars_lazy_constructor, "duckdb": duckdb_lazy_constructor, + "pyspark": pyspark_lazy_constructor, # type: ignore[dict-item] } 
GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor} @@ -201,7 +219,13 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: constructors.append(EAGER_CONSTRUCTORS[constructor]) constructors_ids.append(constructor) elif constructor in LAZY_CONSTRUCTORS: - constructors.append(LAZY_CONSTRUCTORS[constructor]) + if constructor == "pyspark": + if sys.version_info < (3, 12): # pragma: no cover + constructors.append(pyspark_lazy_constructor()) + else: # pragma: no cover + continue + else: + constructors.append(LAZY_CONSTRUCTORS[constructor]) constructors_ids.append(constructor) else: # pragma: no cover msg = f"Expected one of {EAGER_CONSTRUCTORS.keys()} or {LAZY_CONSTRUCTORS.keys()}, got {constructor}" diff --git a/tests/expr_and_series/all_horizontal_test.py b/tests/expr_and_series/all_horizontal_test.py index 6eb98c3a3..826c0fe19 100644 --- a/tests/expr_and_series/all_horizontal_test.py +++ b/tests/expr_and_series/all_horizontal_test.py @@ -57,7 +57,7 @@ def test_allh_nth( ) -> None: if "polars" in str(constructor) and POLARS_VERSION < (1, 0): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = { "a": [False, False, True], diff --git a/tests/expr_and_series/any_all_test.py b/tests/expr_and_series/any_all_test.py index c5f22ad9a..7fd81f04d 100644 --- a/tests/expr_and_series/any_all_test.py +++ b/tests/expr_and_series/any_all_test.py @@ -1,12 +1,17 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_any_all(constructor: Constructor) -> None: +def test_any_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native( 
constructor( { diff --git a/tests/expr_and_series/any_horizontal_test.py b/tests/expr_and_series/any_horizontal_test.py index 4eb082b51..06157f393 100644 --- a/tests/expr_and_series/any_horizontal_test.py +++ b/tests/expr_and_series/any_horizontal_test.py @@ -11,7 +11,11 @@ @pytest.mark.parametrize("expr1", ["a", nw.col("a")]) @pytest.mark.parametrize("expr2", ["b", nw.col("b")]) -def test_anyh(constructor: Constructor, expr1: Any, expr2: Any) -> None: +def test_anyh( + request: pytest.FixtureRequest, constructor: Constructor, expr1: Any, expr2: Any +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [False, False, True], "b": [False, True, True], @@ -23,7 +27,9 @@ def test_anyh(constructor: Constructor, expr1: Any, expr2: Any) -> None: assert_equal_data(result, expected) -def test_anyh_all(constructor: Constructor) -> None: +def test_anyh_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [False, False, True], "b": [False, True, True], diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index aec586c62..1baae44e5 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -76,7 +76,6 @@ def test_right_arithmetic_expr( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) - data = {"a": [1, 2, 3]} df = nw.from_native(constructor(data)) result = df.select(getattr(nw.col("a"), attr)(rhs)) diff --git a/tests/expr_and_series/binary_test.py b/tests/expr_and_series/binary_test.py index 0808810bc..308745cb4 100644 --- a/tests/expr_and_series/binary_test.py +++ b/tests/expr_and_series/binary_test.py @@ -9,7 +9,9 @@ def test_expr_binary(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor) and DASK_VERSION < (2024, 10): + if 
("dask" in str(constructor) and DASK_VERSION < (2024, 10)) or "pyspark" in str( + constructor + ): request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index b6ce43573..ba2b82493 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -60,7 +60,7 @@ def test_cast( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION <= ( 15, @@ -180,7 +180,7 @@ def test_cast_string() -> None: def test_cast_raises_for_unknown_dtype( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,): # Unsupported cast from string to dictionary using function cast_dictionary @@ -204,6 +204,7 @@ def test_cast_datetime_tz_aware( or "duckdb" in str(constructor) or "cudf" in str(constructor) # https://github.com/rapidsai/cudf/issues/16973 or ("pyarrow_table" in str(constructor) and is_windows()) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) @@ -229,7 +230,8 @@ def test_cast_datetime_tz_aware( def test_cast_struct(request: pytest.FixtureRequest, constructor: Constructor) -> None: if any( - backend in str(constructor) for backend in ("dask", "modin", "cudf", "duckdb") + backend in str(constructor) + for backend in ("dask", "modin", "cudf", "duckdb", "pyspark") ): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/concat_str_test.py b/tests/expr_and_series/concat_str_test.py index 7c9f259ba..37d4a581d 100644 --- 
a/tests/expr_and_series/concat_str_test.py +++ b/tests/expr_and_series/concat_str_test.py @@ -27,7 +27,7 @@ def test_concat_str( expected: list[str], request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = ( diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py index 6b3cf5b41..9a18ee07f 100644 --- a/tests/expr_and_series/convert_time_zone_test.py +++ b/tests/expr_and_series/convert_time_zone_test.py @@ -29,6 +29,7 @@ def test_convert_time_zone( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("cudf" in str(constructor)) or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -86,6 +87,7 @@ def test_convert_time_zone_from_none( or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 7): diff --git a/tests/expr_and_series/cum_count_test.py b/tests/expr_and_series/cum_count_test.py index 1a2377f34..dab77ebbc 100644 --- a/tests/expr_and_series/cum_count_test.py +++ b/tests/expr_and_series/cum_count_test.py @@ -21,7 +21,7 @@ def test_cum_count_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) name = "reverse_cum_count" if reverse else "cum_count" diff --git a/tests/expr_and_series/cum_max_test.py b/tests/expr_and_series/cum_max_test.py index 22b7c73fa..3df5a6ad4 100644 --- a/tests/expr_and_series/cum_max_test.py +++ 
b/tests/expr_and_series/cum_max_test.py @@ -23,7 +23,7 @@ def test_cum_max_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): diff --git a/tests/expr_and_series/cum_min_test.py b/tests/expr_and_series/cum_min_test.py index b34672219..a758dc8b4 100644 --- a/tests/expr_and_series/cum_min_test.py +++ b/tests/expr_and_series/cum_min_test.py @@ -23,7 +23,7 @@ def test_cum_min_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): diff --git a/tests/expr_and_series/cum_prod_test.py b/tests/expr_and_series/cum_prod_test.py index 4dd5207dc..2d6861b8d 100644 --- a/tests/expr_and_series/cum_prod_test.py +++ b/tests/expr_and_series/cum_prod_test.py @@ -23,7 +23,7 @@ def test_cum_prod_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): diff --git a/tests/expr_and_series/cum_sum_test.py b/tests/expr_and_series/cum_sum_test.py index 5878222fb..8a419c9a9 100644 --- a/tests/expr_and_series/cum_sum_test.py +++ b/tests/expr_and_series/cum_sum_test.py @@ -18,7 +18,7 @@ def test_cum_sum_expr( request: pytest.FixtureRequest, constructor: Constructor, *, reverse: bool ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): 
request.applymarker(pytest.mark.xfail) if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index e1af276e4..9f578d3c1 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -51,6 +51,8 @@ def test_datetime_attributes( request.applymarker(pytest.mark.xfail) if "duckdb" in str(constructor) and attribute in ("date", "weekday", "ordinal_day"): request.applymarker(pytest.mark.xfail) + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(getattr(nw.col("a").dt, attribute)()) @@ -121,6 +123,7 @@ def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> No "cudf", "modin_constructor", "duckdb", + "pyspark", ) ): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/dt/datetime_duration_test.py b/tests/expr_and_series/dt/datetime_duration_test.py index bda3e4703..7ec281daa 100644 --- a/tests/expr_and_series/dt/datetime_duration_test.py +++ b/tests/expr_and_series/dt/datetime_duration_test.py @@ -46,7 +46,7 @@ def test_duration_attributes( ) -> None: if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py index 6fa500024..3cc3f0edd 100644 --- a/tests/expr_and_series/dt/to_string_test.py +++ b/tests/expr_and_series/dt/to_string_test.py @@ -62,7 +62,7 @@ def test_dt_to_string_series(constructor_eager: ConstructorEager, fmt: str) -> N def test_dt_to_string_expr( constructor: Constructor, fmt: str, request: 
pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) input_frame = nw.from_native(constructor(data)) @@ -141,7 +141,7 @@ def test_dt_to_string_iso_local_datetime_expr( expected: str, request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) @@ -180,7 +180,7 @@ def test_dt_to_string_iso_local_date_expr( expected: str, request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) result = nw.from_native(df).with_columns( diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 58ef5c890..39b0a3c64 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -12,7 +12,9 @@ from tests.utils import assert_equal_data -def test_fill_null(constructor: Constructor) -> None: +def test_fill_null(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [0.0, None, 2, 3, 4], "b": [1.0, None, None, 5, 3], @@ -50,7 +52,7 @@ def test_fill_null_exceptions(constructor: Constructor) -> None: def test_fill_null_strategies_with_limit_as_none( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data_limits = { "a": [1, None, None, None, 5, 6, None, None, None, 10], @@ -120,7 +122,7 @@ def test_fill_null_strategies_with_limit_as_none( def test_fill_null_limits( constructor: Constructor, request: 
pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) context: Any = ( pytest.raises(NotImplementedError, match="The limit keyword is not supported") diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py index fe8b45bf1..d97d30cbd 100644 --- a/tests/expr_and_series/is_duplicated_test.py +++ b/tests/expr_and_series/is_duplicated_test.py @@ -11,7 +11,7 @@ def test_is_duplicated_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) @@ -23,7 +23,7 @@ def test_is_duplicated_expr( def test_is_duplicated_w_nulls_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, None], "b": [1, None, None], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index 7718ed1a7..4fb0246e9 100644 --- a/tests/expr_and_series/is_finite_test.py +++ b/tests/expr_and_series/is_finite_test.py @@ -12,7 +12,7 @@ @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_is_finite_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) or "pyarrow_table" in str(constructor): expected = {"a": [False, False, True, None]} diff --git a/tests/expr_and_series/is_first_distinct_test.py 
b/tests/expr_and_series/is_first_distinct_test.py index 786f2ade7..6870c3394 100644 --- a/tests/expr_and_series/is_first_distinct_test.py +++ b/tests/expr_and_series/is_first_distinct_test.py @@ -16,7 +16,7 @@ def test_is_first_distinct_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().is_first_distinct()) diff --git a/tests/expr_and_series/is_last_distinct_test.py b/tests/expr_and_series/is_last_distinct_test.py index c5d73c8d7..9362cd02a 100644 --- a/tests/expr_and_series/is_last_distinct_test.py +++ b/tests/expr_and_series/is_last_distinct_test.py @@ -16,7 +16,7 @@ def test_is_last_distinct_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().is_last_distinct()) diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index 7bae35a52..0280d6555 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -25,7 +25,7 @@ def test_nan(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data_na = {"int": [0, 1, None]} df = nw.from_native(constructor(data_na)).with_columns( @@ -96,7 +96,7 @@ def test_nan_series(constructor_eager: ConstructorEager) -> None: def test_nan_non_float(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) 
from polars.exceptions import InvalidOperationError as PlInvalidOperationError from pyarrow.lib import ArrowNotImplementedError diff --git a/tests/expr_and_series/is_null_test.py b/tests/expr_and_series/is_null_test.py index 5d5250da9..cf4d2e73b 100644 --- a/tests/expr_and_series/is_null_test.py +++ b/tests/expr_and_series/is_null_test.py @@ -1,12 +1,17 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_null(constructor: Constructor) -> None: +def test_null(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + data_na = {"a": [None, 3, 2], "z": [7.0, None, None]} expected = {"a": [True, False, False], "z": [True, False, False]} df = nw.from_native(constructor(data_na)) diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py index 3e9259c03..92e725623 100644 --- a/tests/expr_and_series/is_unique_test.py +++ b/tests/expr_and_series/is_unique_test.py @@ -9,7 +9,7 @@ def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = { "a": [1, 1, 2], @@ -29,7 +29,7 @@ def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest def test_is_unique_w_nulls_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = { "a": [None, 1, 2], diff --git a/tests/expr_and_series/len_test.py b/tests/expr_and_series/len_test.py index fffcbd4a3..142fe488b 100644 --- a/tests/expr_and_series/len_test.py +++ 
b/tests/expr_and_series/len_test.py @@ -34,7 +34,10 @@ def test_len_chaining( assert_equal_data(df, expected) -def test_namespace_len(constructor: Constructor) -> None: +def test_namespace_len(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).select( nw.len(), a=nw.len() ) diff --git a/tests/expr_and_series/list/len_test.py b/tests/expr_and_series/list/len_test.py index 7066fc6cf..375cfc7d8 100644 --- a/tests/expr_and_series/list/len_test.py +++ b/tests/expr_and_series/list/len_test.py @@ -17,7 +17,9 @@ def test_len_expr( request: pytest.FixtureRequest, constructor: Constructor, ) -> None: - if any(backend in str(constructor) for backend in ("dask", "modin", "cudf")): + if any( + backend in str(constructor) for backend in ("dask", "modin", "cudf", "pyspark") + ): request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): diff --git a/tests/expr_and_series/lit_test.py b/tests/expr_and_series/lit_test.py index 505d99bf8..f24e6d4a1 100644 --- a/tests/expr_and_series/lit_test.py +++ b/tests/expr_and_series/lit_test.py @@ -22,8 +22,13 @@ [(None, [2, 2, 2]), (nw.String, ["2", "2", "2"]), (nw.Float32, [2.0, 2.0, 2.0])], ) def test_lit( - constructor: Constructor, dtype: DType | None, expected_lit: list[Any] + request: pytest.FixtureRequest, + constructor: Constructor, + dtype: DType | None, + expected_lit: list[Any], ) -> None: + if "pyspark" in str(constructor) and dtype is not None: + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() @@ -100,6 +105,14 @@ def test_lit_operation( and DASK_VERSION < (2024, 10) ): request.applymarker(pytest.mark.xfail) + if "pyspark" in str(constructor) and col_name in { + "left_lit_with_agg", + "left_scalar_with_agg", + 
"right_lit_with_agg", + "right_lit", + }: + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 3, 2]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() @@ -110,7 +123,7 @@ def test_lit_operation( @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow") def test_date_lit(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): # https://github.com/dask/dask/issues/11637 request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1]})) diff --git a/tests/expr_and_series/max_horizontal_test.py b/tests/expr_and_series/max_horizontal_test.py index c86e11318..9df17fed3 100644 --- a/tests/expr_and_series/max_horizontal_test.py +++ b/tests/expr_and_series/max_horizontal_test.py @@ -14,7 +14,12 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_maxh(constructor: Constructor, col_expr: Any) -> None: +def test_maxh( + request: pytest.FixtureRequest, constructor: Constructor, col_expr: Any +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(horizontal_max=nw.max_horizontal(col_expr, nw.col("b"), "z")) expected = {"horizontal_max": expected_values} @@ -22,7 +27,10 @@ def test_maxh(constructor: Constructor, col_expr: Any) -> None: @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_maxh_all(constructor: Constructor) -> None: +def test_maxh_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.max_horizontal(nw.all()), c=nw.max_horizontal(nw.all())) expected = {"a": expected_values, "c": expected_values} diff --git 
a/tests/expr_and_series/mean_horizontal_test.py b/tests/expr_and_series/mean_horizontal_test.py index c1652c837..5ed472e31 100644 --- a/tests/expr_and_series/mean_horizontal_test.py +++ b/tests/expr_and_series/mean_horizontal_test.py @@ -13,7 +13,7 @@ def test_meanh( constructor: Constructor, col_expr: Any, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, None, None], "b": [4, None, 6, None]} df = nw.from_native(constructor(data)) @@ -23,7 +23,7 @@ def test_meanh( def test_meanh_all(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [2, 4, 6], "b": [10, 20, 30]} df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/median_test.py b/tests/expr_and_series/median_test.py index b0b6edcba..9c509a182 100644 --- a/tests/expr_and_series/median_test.py +++ b/tests/expr_and_series/median_test.py @@ -43,7 +43,7 @@ def test_median_series( def test_median_expr_raises_on_str( constructor: Constructor, expr: nw.Expr, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) from polars.exceptions import InvalidOperationError as PlInvalidOperationError diff --git a/tests/expr_and_series/min_horizontal_test.py b/tests/expr_and_series/min_horizontal_test.py index 787e3e2a4..bbb0b9149 100644 --- a/tests/expr_and_series/min_horizontal_test.py +++ b/tests/expr_and_series/min_horizontal_test.py @@ -14,7 +14,12 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_minh(constructor: Constructor, col_expr: Any) -> None: +def 
test_minh( + request: pytest.FixtureRequest, constructor: Constructor, col_expr: Any +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(horizontal_min=nw.min_horizontal(col_expr, nw.col("b"), "z")) expected = {"horizontal_min": expected_values} @@ -22,7 +27,10 @@ def test_minh(constructor: Constructor, col_expr: Any) -> None: @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_minh_all(constructor: Constructor) -> None: +def test_minh_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.min_horizontal(nw.all()), c=nw.min_horizontal(nw.all())) expected = {"a": expected_values, "c": expected_values} diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py index 90bffb04b..cfa14e0d7 100644 --- a/tests/expr_and_series/n_unique_test.py +++ b/tests/expr_and_series/n_unique_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,9 @@ } -def test_n_unique(constructor: Constructor) -> None: +def test_n_unique(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().n_unique()) expected = {"a": [3], "b": [4]} diff --git a/tests/expr_and_series/name/keep_test.py b/tests/expr_and_series/name/keep_test.py index 6c89d09fc..e382db733 100644 --- a/tests/expr_and_series/name/keep_test.py +++ b/tests/expr_and_series/name/keep_test.py @@ -12,21 +12,34 @@ data = {"foo": [1, 2, 3], "BAR": [4, 5, 6]} -def test_keep(constructor: Constructor) -> None: +def 
test_keep(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.keep()) expected = {k: [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_keep_after_alias(constructor: Constructor) -> None: +def test_keep_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.keep()) expected = {"foo": data["foo"]} assert_equal_data(result, expected) -def test_keep_raise_anonymous(constructor: Constructor) -> None: +def test_keep_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/map_test.py b/tests/expr_and_series/name/map_test.py index 5afda2ee8..276138ef9 100644 --- a/tests/expr_and_series/name/map_test.py +++ b/tests/expr_and_series/name/map_test.py @@ -16,21 +16,34 @@ def map_func(s: str | None) -> str: return str(s)[::-1].lower() -def test_map(constructor: Constructor) -> None: +def test_map(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.map(function=map_func)) expected = {map_func(k): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_map_after_alias(constructor: Constructor) -> None: +def test_map_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + 
request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.map(function=map_func)) expected = {map_func("foo"): data["foo"]} assert_equal_data(result, expected) -def test_map_raise_anonymous(constructor: Constructor) -> None: +def test_map_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/prefix_test.py b/tests/expr_and_series/name/prefix_test.py index 6f3fb3c9b..934d1d664 100644 --- a/tests/expr_and_series/name/prefix_test.py +++ b/tests/expr_and_series/name/prefix_test.py @@ -13,21 +13,34 @@ prefix = "with_prefix_" -def test_prefix(constructor: Constructor) -> None: +def test_prefix(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.prefix(prefix)) expected = {prefix + str(k): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_suffix_after_alias(constructor: Constructor) -> None: +def test_suffix_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.prefix(prefix)) expected = {prefix + "foo": data["foo"]} assert_equal_data(result, expected) -def test_prefix_raise_anonymous(constructor: Constructor) -> None: +def test_prefix_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) 
diff --git a/tests/expr_and_series/name/suffix_test.py b/tests/expr_and_series/name/suffix_test.py index 1c5816154..479546630 100644 --- a/tests/expr_and_series/name/suffix_test.py +++ b/tests/expr_and_series/name/suffix_test.py @@ -13,21 +13,34 @@ suffix = "_with_suffix" -def test_suffix(constructor: Constructor) -> None: +def test_suffix(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.suffix(suffix)) expected = {str(k) + suffix: [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_suffix_after_alias(constructor: Constructor) -> None: +def test_suffix_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.suffix(suffix)) expected = {"foo" + suffix: data["foo"]} assert_equal_data(result, expected) -def test_suffix_raise_anonymous(constructor: Constructor) -> None: +def test_suffix_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/to_lowercase_test.py b/tests/expr_and_series/name/to_lowercase_test.py index 882663f60..1b39fc726 100644 --- a/tests/expr_and_series/name/to_lowercase_test.py +++ b/tests/expr_and_series/name/to_lowercase_test.py @@ -12,21 +12,34 @@ data = {"foo": [1, 2, 3], "BAR": [4, 5, 6]} -def test_to_lowercase(constructor: Constructor) -> None: +def test_to_lowercase(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = 
nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.to_lowercase()) expected = {k.lower(): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_to_lowercase_after_alias(constructor: Constructor) -> None: +def test_to_lowercase_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("BAR")).alias("ALIAS_FOR_BAR").name.to_lowercase()) expected = {"bar": data["BAR"]} assert_equal_data(result, expected) -def test_to_lowercase_raise_anonymous(constructor: Constructor) -> None: +def test_to_lowercase_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/nth_test.py b/tests/expr_and_series/nth_test.py index 4dd453528..a7dc7f648 100644 --- a/tests/expr_and_series/nth_test.py +++ b/tests/expr_and_series/nth_test.py @@ -25,7 +25,7 @@ def test_nth( expected: dict[str, list[int]], request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (1, 0, 0): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/null_count_test.py b/tests/expr_and_series/null_count_test.py index d10258901..3bd15c66c 100644 --- a/tests/expr_and_series/null_count_test.py +++ b/tests/expr_and_series/null_count_test.py @@ -16,7 +16,7 @@ def test_null_count_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) 
df = nw.from_native(constructor(data)) result = df.select(nw.all().null_count()) diff --git a/tests/expr_and_series/operators_test.py b/tests/expr_and_series/operators_test.py index 356d81d5b..f36d853d4 100644 --- a/tests/expr_and_series/operators_test.py +++ b/tests/expr_and_series/operators_test.py @@ -21,7 +21,9 @@ ], ) def test_comparand_operators_scalar_expr( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], ) -> None: data = {"a": [0, 1, 2]} df = nw.from_native(constructor(data)) @@ -41,7 +43,9 @@ def test_comparand_operators_scalar_expr( ], ) def test_comparand_operators_expr( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], ) -> None: data = {"a": [0, 1, 1], "b": [0, 0, 2]} df = nw.from_native(constructor(data)) @@ -57,7 +61,9 @@ def test_comparand_operators_expr( ], ) def test_logic_operators_expr( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], ) -> None: data = {"a": [True, True, False, False], "b": [True, False, True, False]} df = nw.from_native(constructor(data)) @@ -85,7 +91,7 @@ def test_logic_operators_expr_scalar( "dask" in str(constructor) and DASK_VERSION < (2024, 10) and operator in ("__rand__", "__ror__") - ): + ) or ("pyspark" in str(constructor) and operator in ("__and__", "__or__")): request.applymarker(pytest.mark.xfail) data = {"a": [True, True, False, False]} df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index f42bdca54..45b64eba0 100644 --- a/tests/expr_and_series/over_test.py +++ b/tests/expr_and_series/over_test.py @@ -24,7 +24,7 @@ def test_over_single(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in 
str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -42,7 +42,7 @@ def test_over_single(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_multiple(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -60,7 +60,7 @@ def test_over_multiple(request: pytest.FixtureRequest, constructor: Constructor) def test_over_invalid(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "polars" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -73,7 +73,7 @@ def test_over_cumsum(request: pytest.FixtureRequest, constructor: Constructor) - request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) @@ -92,7 +92,7 @@ def test_over_cumsum(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cumcount(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) @@ -115,7 
+115,7 @@ def test_over_cummax(request: pytest.FixtureRequest, constructor: Constructor) - request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -134,7 +134,7 @@ def test_over_cummin(request: pytest.FixtureRequest, constructor: Constructor) - request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) @@ -155,7 +155,7 @@ def test_over_cumprod(request: pytest.FixtureRequest, constructor: Constructor) request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) @@ -184,7 +184,7 @@ def test_over_shift(request: pytest.FixtureRequest, constructor: Constructor) -> constructor ) or "dask_lazy_p2_constructor" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/quantile_test.py b/tests/expr_and_series/quantile_test.py index d52fae16c..a9207cebd 100644 --- a/tests/expr_and_series/quantile_test.py +++ b/tests/expr_and_series/quantile_test.py @@ -31,7 +31,7 @@ def test_quantile_expr( if ( any(x in str(constructor) for x in 
("dask", "duckdb")) and interpolation != "linear" - ): + ) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) q = 0.3 diff --git a/tests/expr_and_series/reduction_test.py b/tests/expr_and_series/reduction_test.py index 4f2faa0ce..49a3fddba 100644 --- a/tests/expr_and_series/reduction_test.py +++ b/tests/expr_and_series/reduction_test.py @@ -28,11 +28,21 @@ ids=range(5), ) def test_scalar_reduction_select( - constructor: Constructor, expr: list[Any], expected: dict[str, list[Any]] + constructor: Constructor, + expr: list[Any], + expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): - # First one passes, the others fail. - return + if "pyspark" in str(constructor) and request.node.callspec.id in { + "pyspark-2", + "pyspark-3", + "pyspark-4", + }: + request.applymarker(pytest.mark.xfail) + + if "duckdb" in str(constructor) and request.node.callspec.id not in {"duckdb-0"}: + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = nw.from_native(constructor(data)) result = df.select(*expr) @@ -62,7 +72,9 @@ def test_scalar_reduction_with_columns( expected: dict[str, list[Any]], request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if "duckdb" in str(constructor) or ( + "pyspark" in str(constructor) and request.node.callspec.id != "pyspark-1" + ): request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = nw.from_native(constructor(data)) @@ -73,7 +85,7 @@ def test_scalar_reduction_with_columns( def test_empty_scalar_reduction_select( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = { "str": [*"abcde"], @@ -106,7 +118,7 @@ def test_empty_scalar_reduction_select( def test_empty_scalar_reduction_with_columns( constructor: Constructor, request: 
pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if "pyspark" in str(constructor) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) from itertools import chain diff --git a/tests/expr_and_series/replace_strict_test.py b/tests/expr_and_series/replace_strict_test.py index 07e349bc6..33c56bae6 100644 --- a/tests/expr_and_series/replace_strict_test.py +++ b/tests/expr_and_series/replace_strict_test.py @@ -23,7 +23,7 @@ def test_replace_strict( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) result = df.select( @@ -60,7 +60,7 @@ def test_replace_non_full( if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) if isinstance(df, nw.LazyFrame): @@ -81,7 +81,7 @@ def test_replace_strict_mapping( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py index 132c4efc5..6876c318a 100644 --- a/tests/expr_and_series/replace_time_zone_test.py +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -27,6 +27,7 @@ def test_replace_time_zone( or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -54,6 +55,7 @@ def 
test_replace_time_zone_none( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { diff --git a/tests/expr_and_series/shift_test.py b/tests/expr_and_series/shift_test.py index 07f5d2b58..4f7894939 100644 --- a/tests/expr_and_series/shift_test.py +++ b/tests/expr_and_series/shift_test.py @@ -17,7 +17,7 @@ def test_shift(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.with_columns(nw.col("a", "b", "c").shift(2)).filter(nw.col("i") > 1) diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 06c6913aa..c1024d53a 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -13,7 +13,7 @@ def test_contains_case_insensitive( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "cudf" in str(constructor): + if "cudf" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -40,7 +40,12 @@ def test_contains_series_case_insensitive( assert_equal_data(result, expected) -def test_contains_case_sensitive(constructor: Constructor) -> None: +def test_contains_case_sensitive( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.col("pets").str.contains("parrot|Dove").alias("default_match")) expected = { @@ -58,7 +63,12 @@ def test_contains_series_case_sensitive(constructor_eager: ConstructorEager) -> 
assert_equal_data(result, expected) -def test_contains_literal(constructor: Constructor) -> None: +def test_contains_literal( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select( nw.col("pets").str.contains("Parrot|dove").alias("default_match"), diff --git a/tests/expr_and_series/str/head_test.py b/tests/expr_and_series/str/head_test.py index cf6cbd758..97fbbc6f3 100644 --- a/tests/expr_and_series/str/head_test.py +++ b/tests/expr_and_series/str/head_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -8,7 +10,10 @@ data = {"a": ["foo", "bars"]} -def test_str_head(constructor: Constructor) -> None: +def test_str_head(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.head(3)) expected = { diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py index 1a318801a..812f193b2 100644 --- a/tests/expr_and_series/str/len_chars_test.py +++ b/tests/expr_and_series/str/len_chars_test.py @@ -11,7 +11,7 @@ def test_str_len_chars(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.len_chars()) diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py index 7d57eeb7d..53904be73 100644 --- a/tests/expr_and_series/str/replace_test.py +++ b/tests/expr_and_series/str/replace_test.py @@ -101,7 +101,7 @@ def 
test_str_replace_expr( literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_df = df.select( @@ -123,7 +123,9 @@ def test_str_replace_all_expr( literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: - if "duckdb" in str(constructor) and literal is False: + if ("pyspark" in str(constructor)) or ( + "duckdb" in str(constructor) and literal is False + ): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select( diff --git a/tests/expr_and_series/str/slice_test.py b/tests/expr_and_series/str/slice_test.py index 1e7115a8a..6f9b4dc4f 100644 --- a/tests/expr_and_series/str/slice_test.py +++ b/tests/expr_and_series/str/slice_test.py @@ -17,8 +17,15 @@ [(1, 2, {"a": ["da", "df"]}), (-2, None, {"a": ["as", "as"]})], ) def test_str_slice( - constructor: Constructor, offset: int, length: int | None, expected: Any + request: pytest.FixtureRequest, + constructor: Constructor, + offset: int, + length: int | None, + expected: Any, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.slice(offset, length)) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/str/starts_with_ends_with_test.py b/tests/expr_and_series/str/starts_with_ends_with_test.py index 0b11a7537..dac70c288 100644 --- a/tests/expr_and_series/str/starts_with_ends_with_test.py +++ b/tests/expr_and_series/str/starts_with_ends_with_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,10 @@ data = {"a": ["fdas", "edfas"]} -def test_ends_with(constructor: Constructor) -> None: 
+def test_ends_with(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.ends_with("das")) expected = { @@ -29,7 +34,10 @@ def test_ends_with_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_starts_with(constructor: Constructor) -> None: +def test_starts_with(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)).lazy() result = df.select(nw.col("a").str.starts_with("fda")) expected = { diff --git a/tests/expr_and_series/str/strip_chars_test.py b/tests/expr_and_series/str/strip_chars_test.py index d765e99e3..f369bbbf9 100644 --- a/tests/expr_and_series/str/strip_chars_test.py +++ b/tests/expr_and_series/str/strip_chars_test.py @@ -20,8 +20,13 @@ ], ) def test_str_strip_chars( - constructor: Constructor, characters: str | None, expected: Any + request: pytest.FixtureRequest, + constructor: Constructor, + characters: str | None, + expected: Any, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.strip_chars(characters)) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/str/tail_test.py b/tests/expr_and_series/str/tail_test.py index e2543de0a..cdb2c024e 100644 --- a/tests/expr_and_series/str/tail_test.py +++ b/tests/expr_and_series/str/tail_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -8,7 +10,9 @@ data = {"a": ["foo", "bars"]} -def test_str_tail(constructor: Constructor) -> None: +def test_str_tail(request: pytest.FixtureRequest, constructor: 
Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = {"a": ["foo", "ars"]} diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 3f8df65a7..bfb2a4dfb 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -18,7 +18,7 @@ def test_to_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "cudf" in str(constructor): expected = "2020-01-01T12:34:56.000000000" @@ -80,7 +80,7 @@ def test_to_datetime_infer_fmt( request.applymarker(pytest.mark.xfail) if "cudf" in str(constructor): expected = expected_cudf - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) result = ( nw.from_native(constructor(data)) @@ -133,7 +133,7 @@ def test_to_datetime_series_infer_fmt( def test_to_datetime_infer_fmt_from_date( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"z": ["2020-01-01", "2020-01-02", None]} expected = [datetime(2020, 1, 1), datetime(2020, 1, 2), None] diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index 1057b33de..087e26a0e 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -30,8 +30,8 @@ def test_str_to_uppercase( expected: dict[str, list[str]], request: pytest.FixtureRequest, ) -> None: - df = nw.from_native(constructor(data)) - result_frame = 
df.select(nw.col("a").str.to_uppercase()) + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) if any("ß" in s for value in data.values() for s in value) & ( constructor.__name__ @@ -48,6 +48,9 @@ def test_str_to_uppercase( # smaller cap 'ß' to upper cap 'ẞ' instead of 'SS' request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) + result_frame = df.select(nw.col("a").str.to_uppercase()) + assert_equal_data(result_frame, expected) @@ -110,10 +113,13 @@ def test_str_to_uppercase_series( ], ) def test_str_to_lowercase( + request: pytest.FixtureRequest, constructor: Constructor, data: dict[str, list[str]], expected: dict[str, list[str]], ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.to_lowercase()) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index f3e01d80f..82f616a64 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -11,7 +11,7 @@ def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = { "a": [1, 3, 2], @@ -82,7 +82,7 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: def test_unary_two_elements( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} result = nw.from_native(constructor(data)).select( @@ -126,7 +126,11 @@ def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_unary_one_element(constructor: 
Constructor) -> None: +def test_unary_one_element( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1], "b": [2], "c": [None]} # Dask runs into a divide by zero RuntimeWarning for 1 element skew. context = ( diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py index 94e37aaa3..0faf59172 100644 --- a/tests/expr_and_series/when_test.py +++ b/tests/expr_and_series/when_test.py @@ -17,7 +17,9 @@ } -def test_when(constructor: Constructor) -> None: +def test_when(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(value=3).alias("a_when")) expected = { @@ -26,7 +28,9 @@ def test_when(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_when_otherwise(constructor: Constructor) -> None: +def test_when_otherwise(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(3).otherwise(6).alias("a_when")) expected = { @@ -35,7 +39,11 @@ def test_when_otherwise(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_multiple_conditions(constructor: Constructor) -> None: +def test_multiple_conditions( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") < 3, nw.col("c") < 5.0).then(3).alias("a_when") @@ -77,7 +85,11 @@ def test_value_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_value_expression(constructor: 
Constructor) -> None: +def test_value_expression( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") == 1).then(nw.col("a") + 9).alias("a_when")) expected = { @@ -110,7 +122,11 @@ def test_otherwise_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_otherwise_expression(constructor: Constructor) -> None: +def test_otherwise_expression( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select( nw.when(nw.col("a") == 1).then(-1).otherwise(nw.col("a") + 7).alias("a_when") @@ -124,7 +140,7 @@ def test_otherwise_expression(constructor: Constructor) -> None: def test_when_then_otherwise_into_expr( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") > 1).then("c").otherwise("e")) @@ -135,7 +151,7 @@ def test_when_then_otherwise_into_expr( def test_when_then_otherwise_lit_str( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.when(nw.col("a") > 1).then(nw.col("b")).otherwise(nw.lit("z"))) diff --git a/tests/frame/clone_test.py b/tests/frame/clone_test.py index e142ed0a7..316638c06 100644 --- a/tests/frame/clone_test.py +++ b/tests/frame/clone_test.py @@ -10,7 +10,7 @@ def test_clone(request: pytest.FixtureRequest, constructor: Constructor) -> None: if 
"dask" in str(constructor): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py index 4d5f3ebc9..6d8fdbda0 100644 --- a/tests/frame/concat_test.py +++ b/tests/frame/concat_test.py @@ -10,7 +10,7 @@ def test_concat_horizontal( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_left = nw.from_native(constructor(data)).lazy() @@ -32,7 +32,12 @@ def test_concat_horizontal( nw.concat([]) -def test_concat_vertical(constructor: Constructor) -> None: +def test_concat_vertical( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_left = ( nw.from_native(constructor(data)).lazy().rename({"a": "c", "b": "d"}).drop("z") @@ -63,7 +68,7 @@ def test_concat_vertical(constructor: Constructor) -> None: def test_concat_diagonal( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data_1 = {"a": [1, 3], "b": [4, 6]} data_2 = {"a": [100, 200], "z": ["x", "y"]} diff --git a/tests/frame/explode_test.py b/tests/frame/explode_test.py index b79215a18..f3b096194 100644 --- a/tests/frame/explode_test.py +++ b/tests/frame/explode_test.py @@ -40,7 +40,7 @@ def test_explode_single_col( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table", 
"duckdb") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb", "pyspark") ): request.applymarker(pytest.mark.xfail) @@ -89,7 +89,7 @@ def test_explode_multiple_cols( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb", "pyspark") ): request.applymarker(pytest.mark.xfail) @@ -110,7 +110,7 @@ def test_explode_shape_error( ) -> None: if any( backend in str(constructor) - for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb") + for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb", "pyspark") ): request.applymarker(pytest.mark.xfail) @@ -133,7 +133,7 @@ def test_explode_shape_error( def test_explode_invalid_operation_error( request: pytest.FixtureRequest, constructor: Constructor ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask", "duckdb")): + if any(x in str(constructor) for x in ("pyarrow_table", "dask", "duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 6): diff --git a/tests/frame/gather_every_test.py b/tests/frame/gather_every_test.py index 40e9291de..c151f4503 100644 --- a/tests/frame/gather_every_test.py +++ b/tests/frame/gather_every_test.py @@ -14,7 +14,7 @@ def test_gather_every( constructor: Constructor, n: int, offset: int, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.gather_every(n=n, offset=offset) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index f176aca67..f15a1b79e 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -21,7 +21,7 @@ def test_inner_join_two_keys(constructor: Constructor) -> None: "antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 
8, 9], - "index": [0, 1, 2], + "idx": [0, 1, 2], } df = nw.from_native(constructor(data)) df_right = df @@ -32,13 +32,13 @@ def test_inner_join_two_keys(constructor: Constructor) -> None: how="inner", ) result_on = df.join(df_right, on=["antananarivo", "bob"], how="inner") # type: ignore[arg-type] - result = result.sort("index").drop("index_right") - result_on = result_on.sort("index").drop("index_right") + result = result.sort("idx").drop("idx_right") + result_on = result_on.sort("idx").drop("idx_right") expected = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9], - "index": [0, 1, 2], + "idx": [0, 1, 2], "zor ro_right": [7.0, 8, 9], } assert_equal_data(result, expected) @@ -50,7 +50,7 @@ def test_inner_join_single_key(constructor: Constructor) -> None: "antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9], - "index": [0, 1, 2], + "idx": [0, 1, 2], } df = nw.from_native(constructor(data)) df_right = df @@ -59,15 +59,15 @@ def test_inner_join_single_key(constructor: Constructor) -> None: left_on="antananarivo", right_on="antananarivo", how="inner", - ).sort("index") - result_on = df.join(df_right, on="antananarivo", how="inner").sort("index") # type: ignore[arg-type] - result = result.drop("index_right") - result_on = result_on.drop("index_right") + ).sort("idx") + result_on = df.join(df_right, on="antananarivo", how="inner").sort("idx") # type: ignore[arg-type] + result = result.drop("idx_right") + result_on = result_on.drop("idx_right") expected = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9], - "index": [0, 1, 2], + "idx": [0, 1, 2], "bob_right": [4, 4, 6], "zor ro_right": [7.0, 8, 9], } @@ -235,34 +235,34 @@ def test_left_join(constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], "bob": [4.0, 5, 6], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } data_right = { "antananarivo": [1.0, 2, 3], "co": [4.0, 5, 7], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } df_left = 
nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) result = df_left.join(df_right, left_on="bob", right_on="co", how="left") # type: ignore[arg-type] - result = result.sort("index") - result = result.drop("index_right") + result = result.sort("idx") + result = result.drop("idx_right") expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], - "index": [0, 1, 2], + "idx": [0, 1, 2], "antananarivo_right": [1, 2, None], } result_on_list = df_left.join( df_right, # type: ignore[arg-type] - on=["antananarivo", "index"], + on=["antananarivo", "idx"], how="left", ) - result_on_list = result_on_list.sort("index") + result_on_list = result_on_list.sort("idx") expected_on_list = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], - "index": [0, 1, 2], + "idx": [0, 1, 2], "co": [4, 5, 7], } assert_equal_data(result, expected) @@ -270,8 +270,8 @@ def test_left_join(constructor: Constructor) -> None: def test_left_join_multiple_column(constructor: Constructor) -> None: - data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} - data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "index": [0, 1, 2]} + data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} + data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "idx": [0, 1, 2]} df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) result = df_left.join( @@ -280,9 +280,9 @@ def test_left_join_multiple_column(constructor: Constructor) -> None: right_on=["antananarivo", "c"], how="left", ) - result = result.sort("index") - result = result.drop("index_right") - expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} + result = result.sort("idx") + result = result.drop("idx_right") + expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} assert_equal_data(result, expected) @@ -291,23 +291,23 @@ def test_left_join_overlapping_column(constructor: Constructor) -> 
None: "antananarivo": [1.0, 2, 3], "bob": [4.0, 5, 6], "d": [1.0, 4, 2], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } data_right = { "antananarivo": [1.0, 2, 3], "c": [4.0, 5, 6], "d": [1.0, 4, 2], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) - result = df_left.join(df_right, left_on="bob", right_on="c", how="left").sort("index") # type: ignore[arg-type] - result = result.drop("index_right") + result = df_left.join(df_right, left_on="bob", right_on="c", how="left").sort("idx") # type: ignore[arg-type] + result = result.drop("idx_right") expected: dict[str, list[Any]] = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], "d": [1, 4, 2], - "index": [0, 1, 2], + "idx": [0, 1, 2], "antananarivo_right": [1, 2, 3], "d_right": [1, 4, 2], } @@ -318,13 +318,13 @@ def test_left_join_overlapping_column(constructor: Constructor) -> None: right_on="d", how="left", ) - result = result.sort("index") - result = result.drop("index_right") + result = result.sort("idx") + result = result.drop("idx_right") expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], "d": [1, 4, 2], - "index": [0, 1, 2], + "idx": [0, 1, 2], "antananarivo_right": [1.0, 3.0, None], "c": [4.0, 6.0, None], } @@ -362,7 +362,7 @@ def test_joinasof_numeric( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb")): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) @@ -421,7 +421,7 @@ def test_joinasof_time( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb")): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", 
"duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ("pandas_pyarrow" in str(constructor)): request.applymarker(pytest.mark.xfail) @@ -502,7 +502,7 @@ def test_joinasof_by( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb")): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) diff --git a/tests/frame/select_test.py b/tests/frame/select_test.py index 9d601e468..946e58203 100644 --- a/tests/frame/select_test.py +++ b/tests/frame/select_test.py @@ -80,7 +80,7 @@ def test_comparison_with_list_error_message() -> None: def test_missing_columns( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) @@ -126,7 +126,7 @@ def test_left_to_right_broadcasting( ) -> None: if "dask" in str(constructor) and DASK_VERSION < (2024, 10): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 1, 2], "b": [4, 5, 6]})) result = df.select(nw.col("a") + nw.col("b").sum()) diff --git a/tests/frame/tail_test.py b/tests/frame/tail_test.py index a4d265797..75f46a4a1 100644 --- a/tests/frame/tail_test.py +++ b/tests/frame/tail_test.py @@ -9,7 +9,10 @@ from tests.utils import assert_equal_data -def test_tail(constructor: Constructor) -> None: +def test_tail(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): 
+ request.applymarker(pytest.mark.xfail) + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9]} diff --git a/tests/frame/unique_test.py b/tests/frame/unique_test.py index ca34d29b4..a193ab98b 100644 --- a/tests/frame/unique_test.py +++ b/tests/frame/unique_test.py @@ -39,6 +39,13 @@ def test_unique( "last", }: context: Any = pytest.raises(ValueError, match="row order") + elif ( + keep == "none" and df.implementation is nw.Implementation.PYSPARK + ): # pragma: no cover + context = pytest.raises( + ValueError, + match="`LazyFrame.unique` with PySpark backend only supports `keep='any'`.", + ) elif keep == "foo": context = pytest.raises(ValueError, match=": foo") else: diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py index 2867720a7..72aa81f2d 100644 --- a/tests/frame/unpivot_test.py +++ b/tests/frame/unpivot_test.py @@ -37,8 +37,14 @@ [("b", expected_b_only), (["b", "c"], expected_b_c), (None, expected_b_c)], ) def test_unpivot_on( - constructor: Constructor, on: str | list[str] | None, expected: dict[str, list[float]] + request: pytest.FixtureRequest, + constructor: Constructor, + on: str | list[str] | None, + expected: dict[str, list[float]], ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.unpivot(on=on, index=["a"]).sort("variable", "a") assert_equal_data(result, expected) @@ -53,10 +59,14 @@ def test_unpivot_on( ], ) def test_unpivot_var_value_names( + request: pytest.FixtureRequest, constructor: Constructor, variable_name: str | None, value_name: str | None, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.unpivot( on=["b", "c"], index=["a"], variable_name=variable_name, value_name=value_name @@ -65,7 +75,12 @@ def test_unpivot_var_value_names( assert result.collect_schema().names()[-2:] == 
[variable_name, value_name] -def test_unpivot_default_var_value_names(constructor: Constructor) -> None: +def test_unpivot_default_var_value_names( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.unpivot(on=["b", "c"], index=["a"]) @@ -87,10 +102,13 @@ def test_unpivot_mixed_types( data: dict[str, Any], expected_dtypes: list[DType], ) -> None: - if "cudf" in str(constructor) or ( - "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0) + if ( + "cudf" in str(constructor) + or "pyspark" in str(constructor) + or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0)) ): request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.unpivot(on=["a", "b"], index="idx") diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py index 335c53896..6fa3ab825 100644 --- a/tests/frame/with_columns_test.py +++ b/tests/frame/with_columns_test.py @@ -52,7 +52,7 @@ def test_with_columns_dtypes_single_row( ) -> None: if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,): request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"a": ["foo"]} df = nw.from_native(constructor(data)).with_columns(nw.col("a").cast(nw.Categorical)) diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py index bc514fa70..96f2b1547 100644 --- a/tests/frame/with_row_index_test.py +++ b/tests/frame/with_row_index_test.py @@ -13,7 +13,7 @@ def test_with_row_index(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) result = 
nw.from_native(constructor(data)).with_row_index() expected = {"index": [0, 1], "a": ["foo", "bars"], "ab": ["foo", "bars"]} diff --git a/tests/from_dict_test.py b/tests/from_dict_test.py index 86fe07eda..0630cac43 100644 --- a/tests/from_dict_test.py +++ b/tests/from_dict_test.py @@ -12,7 +12,7 @@ def test_from_dict(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) native_namespace = nw.get_native_namespace(df) @@ -25,7 +25,7 @@ def test_from_dict(constructor: Constructor, request: pytest.FixtureRequest) -> def test_from_dict_schema( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()} df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) @@ -62,7 +62,7 @@ def test_from_dict_one_native_one_narwhals( def test_from_dict_v1(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) native_namespace = nw_v1.get_native_namespace(df) diff --git a/tests/from_numpy_test.py b/tests/from_numpy_test.py index b736d5cbd..7a40136e7 100644 --- a/tests/from_numpy_test.py +++ b/tests/from_numpy_test.py @@ -19,7 +19,7 @@ def test_from_numpy(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) native_namespace = nw.get_native_namespace(df) @@ -31,7 
+31,7 @@ def test_from_numpy(constructor: Constructor, request: pytest.FixtureRequest) -> def test_from_numpy_schema_dict( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) schema = { "c": nw_v1.Int16(), @@ -52,7 +52,7 @@ def test_from_numpy_schema_dict( def test_from_numpy_schema_list( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) schema = ["c", "d", "e", "f"] df = nw_v1.from_native(constructor(data)) @@ -68,7 +68,7 @@ def test_from_numpy_schema_list( def test_from_numpy_schema_notvalid( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) native_namespace = nw_v1.get_native_namespace(df) @@ -79,7 +79,7 @@ def test_from_numpy_schema_notvalid( def test_from_numpy_v1(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "dask" in str(constructor): + if "dask" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw_v1.from_native(constructor(data)) native_namespace = nw_v1.get_native_namespace(df) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index c854da453..64b3844d0 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -115,6 +115,8 @@ def test_group_by_depth_1_agg( expected: dict[str, list[int | float]], request: pytest.FixtureRequest, ) -> None: + if "pyspark" in str(constructor) and attr == "n_unique": + request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and attr == "var" and PANDAS_VERSION < (2, 1): # Known issue with variance calculation in 
pandas 2.0.x with pyarrow backend in groupby operations" request.applymarker(pytest.mark.xfail) @@ -164,7 +166,11 @@ def test_group_by_median(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_group_by_n_unique_w_missing(constructor: Constructor) -> None: +def test_group_by_n_unique_w_missing( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [4, None, 5], "c": [None, None, 7], "d": [1, 1, 3]} result = ( nw.from_native(constructor(data)) @@ -269,6 +275,10 @@ def test_key_with_nulls( if "modin" in str(constructor): # TODO(unassigned): Modin flaky here? request.applymarker(pytest.mark.skip) + + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + context = ( pytest.raises(NotImplementedError, match="null values") if ("pandas_constructor" in str(constructor) and PANDAS_VERSION < (1, 1, 0)) @@ -290,7 +300,7 @@ def test_key_with_nulls( def test_key_with_nulls_ignored( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) data = {"b": [4, 5, None], "a": [1, 2, 3]} result = ( @@ -332,7 +342,9 @@ def test_key_with_nulls_iter( assert len(result) == 4 -def test_no_agg(constructor: Constructor) -> None: +def test_no_agg(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) result = nw.from_native(constructor(data)).group_by(["a", "b"]).agg().sort("a", "b") expected = {"a": [1, 3], "b": [4, 6]} @@ -343,7 +355,7 @@ def test_group_by_categorical( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if 
"pyarrow_table" in str(constructor) and PYARROW_VERSION < ( 15, @@ -370,7 +382,7 @@ def test_group_by_categorical( def test_group_by_shift_raises( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor): # Polars supports all kinds of crazy group-by aggregations, so @@ -412,7 +424,7 @@ def test_all_kind_of_aggs( # and modin lol https://github.com/modin-project/modin/issues/7414 # and cudf https://github.com/rapidsai/cudf/issues/17649 request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (1, 4): # Bug in old pandas, can't do DataFrameGroupBy[['b', 'b']] diff --git a/tests/read_scan_test.py b/tests/read_scan_test.py index dbb2cf624..55869b46b 100644 --- a/tests/read_scan_test.py +++ b/tests/read_scan_test.py @@ -52,8 +52,11 @@ def test_read_csv_kwargs(tmpdir: pytest.TempdirFactory) -> None: def test_scan_csv( tmpdir: pytest.TempdirFactory, + request: pytest.FixtureRequest, constructor: Constructor, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df_pl = pl.DataFrame(data) filepath = str(tmpdir / "file.csv") # type: ignore[operator] df_pl.write_csv(filepath) @@ -66,8 +69,11 @@ def test_scan_csv( def test_scan_csv_v1( tmpdir: pytest.TempdirFactory, + request: pytest.FixtureRequest, constructor: Constructor, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df_pl = pl.DataFrame(data) filepath = str(tmpdir / "file.csv") # type: ignore[operator] df_pl.write_csv(filepath) @@ -128,8 +134,11 @@ def test_read_parquet_kwargs(tmpdir: pytest.TempdirFactory) -> None: @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow") 
def test_scan_parquet( tmpdir: pytest.TempdirFactory, + request: pytest.FixtureRequest, constructor: Constructor, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df_pl = pl.DataFrame(data) filepath = str(tmpdir / "file.parquet") # type: ignore[operator] df_pl.write_parquet(filepath) @@ -143,8 +152,11 @@ def test_scan_parquet( @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow") def test_scan_parquet_v1( tmpdir: pytest.TempdirFactory, + request: pytest.FixtureRequest, constructor: Constructor, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df_pl = pl.DataFrame(data) filepath = str(tmpdir / "file.parquet") # type: ignore[operator] df_pl.write_parquet(filepath) diff --git a/tests/selectors_test.py b/tests/selectors_test.py index 103ea666d..80aa64803 100644 --- a/tests/selectors_test.py +++ b/tests/selectors_test.py @@ -24,7 +24,7 @@ def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1) @@ -33,7 +33,7 @@ def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(numeric() + 1) @@ -42,7 +42,7 @@ def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> No def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df 
= nw.from_native(constructor(data)) result = df.select(boolean()) @@ -51,7 +51,7 @@ def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> No def test_string(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(string()) @@ -67,7 +67,7 @@ def test_categorical( 15, ): # pragma: no cover request.applymarker(pytest.mark.xfail) - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) expected = {"b": ["a", "b", "c"]} @@ -96,7 +96,7 @@ def test_set_ops( expected: list[str], request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(selector).collect_schema().names() diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py deleted file mode 100644 index f7cd9e6a9..000000000 --- a/tests/spark_like_test.py +++ /dev/null @@ -1,1211 +0,0 @@ -"""PySpark support in Narwhals is still _very_ limited. - -Start with a simple test file whilst we develop the basics. -Once we're a bit further along, we can integrate PySpark tests into the main test suite. 
-""" - -from __future__ import annotations - -from contextlib import nullcontext as does_not_raise -from typing import TYPE_CHECKING -from typing import Any -from typing import Literal - -import pandas as pd -import pytest - -import narwhals.stable.v1 as nw -from narwhals.exceptions import ColumnNotFoundError -from tests.utils import assert_equal_data - -if TYPE_CHECKING: - from pyspark.sql import SparkSession - - from narwhals.dtypes import DType - from narwhals.typing import IntoFrame - from tests.utils import Constructor - - -# Apply filterwarnings to all tests in this module -pytestmark = [ - pytest.mark.filterwarnings( - "ignore:.*is_datetime64tz_dtype is deprecated and will be removed in a future version.*:DeprecationWarning" - ), - pytest.mark.filterwarnings( - "ignore:.*distutils Version classes are deprecated. Use packaging.version instead.*:DeprecationWarning" - ), - pytest.mark.filterwarnings("ignore: unclosed IntoFrame: - # NaN and NULL are not the same in PySpark - pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index() - return ( # type: ignore[no-any-return] - spark_session.createDataFrame(pd_df).repartition(2).orderBy("index").drop("index") - ) - - -@pytest.fixture(params=[_pyspark_constructor_with_session]) -def pyspark_constructor( - request: pytest.FixtureRequest, spark_session: SparkSession -) -> Constructor: - def _constructor(obj: Any) -> IntoFrame: - return request.param(obj, spark_session) # type: ignore[no-any-return] - - return _constructor - - -# copied from tests/translate/from_native_test.py -def test_series_only(pyspark_constructor: Constructor) -> None: - obj = pyspark_constructor({"a": [1, 2, 3]}) - with pytest.raises(TypeError, match="Cannot only use `series_only`"): - _ = nw.from_native(obj, series_only=True) - - -def test_eager_only_lazy(pyspark_constructor: Constructor) -> None: - dframe = pyspark_constructor({"a": [1, 2, 3]}) - with pytest.raises(TypeError, match="Cannot only use `eager_only`"): - _ = 
nw.from_native(dframe, eager_only=True) - - -# copied from tests/frame/with_columns_test.py -def test_columns(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.columns - expected = ["a", "b", "z"] - assert result == expected - - -# copied from tests/frame/with_columns_test.py -def test_with_columns_order(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.with_columns(nw.col("a") + 1, d=nw.col("a") - 1) - assert result.collect_schema().names() == ["a", "b", "z", "d"] - expected = {"a": [2, 4, 3], "b": [4, 4, 6], "z": [7.0, 8, 9], "d": [0, 2, 1]} - assert_equal_data(result, expected) - - -@pytest.mark.filterwarnings("ignore:If `index_col` is not specified for `to_spark`") -def test_with_columns_empty(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select().with_columns() - assert_equal_data(result, {}) - - -def test_with_columns_order_single_row(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "i": [0, 1, 2]} - df = nw.from_native(pyspark_constructor(data)).filter(nw.col("i") < 1).drop("i") - result = df.with_columns(nw.col("a") + 1, d=nw.col("a") - 1) - assert result.collect_schema().names() == ["a", "b", "z", "d"] - expected = {"a": [2], "b": [4], "z": [7.0], "d": [0]} - assert_equal_data(result, expected) - - -# copied from tests/frame/select_test.py -def test_select(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select("a") - expected = {"a": [1, 3, 2]} - assert_equal_data(result, expected) - - -@pytest.mark.filterwarnings("ignore:If `index_col` is 
not specified for `to_spark`") -def test_empty_select(pyspark_constructor: Constructor) -> None: - result = nw.from_native(pyspark_constructor({"a": [1, 2, 3]})).lazy().select() - assert result.collect().shape == (0, 0) - - -# copied from tests/frame/filter_test.py -def test_filter(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.filter(nw.col("a") > 1) - expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]} - assert_equal_data(result, expected) - - -# copied from tests/frame/schema_test.py -@pytest.mark.filterwarnings("ignore:Determining|Resolving.*") -def test_schema(pyspark_constructor: Constructor) -> None: - df = nw.from_native( - pyspark_constructor({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]}) - ) - result = df.schema - expected = {"a": nw.Int64, "b": nw.Int64, "z": nw.Float64} - - result = df.schema - assert result == expected - result = df.lazy().collect().schema - assert result == expected - - -def test_collect_schema(pyspark_constructor: Constructor) -> None: - df = nw.from_native( - pyspark_constructor({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]}) - ) - expected = {"a": nw.Int64, "b": nw.Int64, "z": nw.Float64} - - result = df.collect_schema() - assert result == expected - result = df.lazy().collect().collect_schema() - assert result == expected - - -# copied from tests/frame/drop_test.py -@pytest.mark.parametrize( - ("to_drop", "expected"), - [ - ("abc", ["b", "z"]), - (["abc"], ["b", "z"]), - (["abc", "b"], ["z"]), - ], -) -def test_drop( - pyspark_constructor: Constructor, to_drop: list[str], expected: list[str] -) -> None: - data = {"abc": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - assert df.drop(to_drop).collect_schema().names() == expected - if not isinstance(to_drop, str): - assert df.drop(*to_drop).collect_schema().names() == expected - - -@pytest.mark.parametrize( 
- ("strict", "context"), - [ - (True, pytest.raises(ColumnNotFoundError, match="z")), - (False, does_not_raise()), - ], -) -def test_drop_strict( - pyspark_constructor: Constructor, context: Any, *, strict: bool -) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6]} - to_drop = ["a", "z"] - - df = nw.from_native(pyspark_constructor(data)) - - with context: - names_out = df.drop(to_drop, strict=strict).collect_schema().names() - assert names_out == ["b"] - - -# copied from tests/frame/head_test.py -def test_head(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - expected = {"a": [1, 3], "b": [4, 4], "z": [7.0, 8.0]} - - df_raw = pyspark_constructor(data) - df = nw.from_native(df_raw) - - result = df.head(2) - assert_equal_data(result, expected) - - result = df.head(2) - assert_equal_data(result, expected) - - # negative indices not allowed for lazyframes - result = df.lazy().collect().head(-1) - assert_equal_data(result, expected) - - -# copied from tests/frame/sort_test.py -def test_sort(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.sort("a", "b") - expected = { - "a": [1, 2, 3], - "b": [4, 6, 4], - "z": [7.0, 9.0, 8.0], - } - assert_equal_data(result, expected) - result = df.sort("a", "b", descending=[True, False]).lazy().collect() - expected = { - "a": [3, 2, 1], - "b": [4, 6, 4], - "z": [8.0, 9.0, 7.0], - } - assert_equal_data(result, expected) - - -@pytest.mark.parametrize( - ("nulls_last", "expected"), - [ - (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, None]}), - (False, {"a": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}), - ], -) -def test_sort_nulls( - pyspark_constructor: Constructor, *, nulls_last: bool, expected: dict[str, float] -) -> None: - data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} - df = nw.from_native(pyspark_constructor(data)) - result = df.sort("b", descending=True, 
nulls_last=nulls_last).lazy().collect() - assert_equal_data(result, expected) - - -# copied from tests/frame/add_test.py -def test_add(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.with_columns( - c=nw.col("a") + nw.col("b"), - d=nw.col("a") - nw.col("a").mean(), - e=nw.col("a") - nw.col("a").std(), - ) - expected = { - "a": [1, 3, 2], - "b": [4, 4, 6], - "z": [7.0, 8.0, 9.0], - "c": [5, 7, 8], - "d": [-1.0, 1.0, 0.0], - "e": [0.0, 2.0, 1.0], - } - assert_equal_data(result, expected) - - -def test_abs(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 2, 3, -4, 5]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(nw.col("a").abs()) - expected = {"a": [1, 2, 3, 4, 5]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/all_horizontal_test.py -@pytest.mark.parametrize("expr1", ["a", nw.col("a")]) -@pytest.mark.parametrize("expr2", ["b", nw.col("b")]) -def test_allh(pyspark_constructor: Constructor, expr1: Any, expr2: Any) -> None: - data = { - "a": [False, False, True], - "b": [False, True, True], - } - df = nw.from_native(pyspark_constructor(data)) - result = df.select(all=nw.all_horizontal(expr1, expr2)) - - expected = {"all": [False, False, True]} - assert_equal_data(result, expected) - - -def test_allh_all(pyspark_constructor: Constructor) -> None: - data = { - "a": [False, False, True], - "b": [False, True, True], - } - df = nw.from_native(pyspark_constructor(data)) - result = df.select(all=nw.all_horizontal(nw.all())) - expected = {"all": [False, False, True]} - assert_equal_data(result, expected) - result = df.select(nw.all_horizontal(nw.all())) - expected = {"a": [False, False, True]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/sum_horizontal_test.py -@pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) -def test_sumh(pyspark_constructor: 
Constructor, col_expr: Any) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.with_columns(horizontal_sum=nw.sum_horizontal(col_expr, nw.col("b"))) - expected = { - "a": [1, 3, 2], - "b": [4, 4, 6], - "z": [7.0, 8.0, 9.0], - "horizontal_sum": [5, 7, 8], - } - assert_equal_data(result, expected) - - -def test_sumh_nullable(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 8, 3], "b": [4, 5, None], "idx": [0, 1, 2]} - expected = {"hsum": [5, 13, 3]} - - df = nw.from_native(pyspark_constructor(data)) - result = df.select("idx", hsum=nw.sum_horizontal("a", "b")).sort("idx").drop("idx") - assert_equal_data(result, expected) - - -def test_sumh_all(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 2, 3], "b": [10, 20, 30]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(nw.sum_horizontal(nw.all())) - expected = { - "a": [11, 22, 33], - } - assert_equal_data(result, expected) - result = df.select(c=nw.sum_horizontal(nw.all())) - expected = { - "c": [11, 22, 33], - } - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/count_test.py -def test_count(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 2, 3], "b": [4, None, 6], "z": [7.0, None, None]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(nw.col("a", "b", "z").count()) - expected = {"a": [3], "b": [2], "z": [1]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/double_test.py -def test_double(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.with_columns(nw.all() * 2) - expected = {"a": [2, 6, 4], "b": [8, 8, 12], "z": [14.0, 16.0, 18.0]} - assert_equal_data(result, expected) - - -def test_double_alias(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": 
[7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.with_columns(nw.col("a").alias("o"), nw.all() * 2) - expected = { - "a": [2, 6, 4], - "b": [8, 8, 12], - "z": [14.0, 16.0, 18.0], - "o": [1, 3, 2], - } - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/max_test.py -def test_expr_max_expr(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - - df = nw.from_native(pyspark_constructor(data)) - result = df.select(nw.col("a", "b", "z").max()) - expected = {"a": [3], "b": [6], "z": [9.0]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/min_test.py -def test_expr_min_expr(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(nw.col("a", "b", "z").min()) - expected = {"a": [1], "b": [4], "z": [7.0]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/min_test.py -@pytest.mark.parametrize("expr", [nw.col("a", "b", "z").sum(), nw.sum("a", "b", "z")]) -def test_expr_sum_expr(pyspark_constructor: Constructor, expr: nw.Expr) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(expr) - expected = {"a": [6], "b": [14], "z": [24.0]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/std_test.py -def test_std(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - - df = nw.from_native(pyspark_constructor(data)) - result = df.select( - nw.col("a").std().alias("a_ddof_default"), - nw.col("a").std(ddof=1).alias("a_ddof_1"), - nw.col("a").std(ddof=0).alias("a_ddof_0"), - nw.col("b").std(ddof=2).alias("b_ddof_2"), - nw.col("z").std(ddof=0).alias("z_ddof_0"), - ) - expected = { - "a_ddof_default": [1.0], - "a_ddof_1": [1.0], - "a_ddof_0": [0.816497], 
- "b_ddof_2": [1.632993], - "z_ddof_0": [0.816497], - } - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/var_test.py -def test_var(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2, None], "b": [4, 4, 6, None], "z": [7.0, 8, 9, None]} - - expected_results = { - "a_ddof_1": [1.0], - "a_ddof_0": [0.6666666666666666], - "b_ddof_2": [2.666666666666667], - "z_ddof_0": [0.6666666666666666], - } - - df = nw.from_native(pyspark_constructor(data)) - result = df.select( - nw.col("a").var(ddof=1).alias("a_ddof_1"), - nw.col("a").var(ddof=0).alias("a_ddof_0"), - nw.col("b").var(ddof=2).alias("b_ddof_2"), - nw.col("z").var(ddof=0).alias("z_ddof_0"), - ) - assert_equal_data(result, expected_results) - - -# copied from tests/group_by_test.py -def test_group_by_std(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 1, 2, 2], "b": [5, 4, 3, 2]} - result = ( - nw.from_native(pyspark_constructor(data)) - .group_by("a") - .agg(nw.col("b").std()) - .sort("a") - ) - expected = {"a": [1, 2], "b": [0.707107] * 2} - assert_equal_data(result, expected) - - -def test_group_by_simple_named(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 1, 2], "b": [4, 5, 6], "c": [7, 2, 1]} - df = nw.from_native(pyspark_constructor(data)).lazy() - result = ( - df.group_by("a") - .agg( - b_min=nw.col("b").min(), - b_max=nw.col("b").max(), - ) - .collect() - .sort("a") - ) - expected = { - "a": [1, 2], - "b_min": [4, 6], - "b_max": [5, 6], - } - assert_equal_data(result, expected) - - -def test_group_by_simple_unnamed(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 1, 2], "b": [4, 5, 6], "c": [7, 2, 1]} - df = nw.from_native(pyspark_constructor(data)).lazy() - result = ( - df.group_by("a") - .agg( - nw.col("b").min(), - nw.col("c").max(), - ) - .collect() - .sort("a") - ) - expected = { - "a": [1, 2], - "b": [4, 6], - "c": [7, 1], - } - assert_equal_data(result, expected) - - -def 
test_group_by_multiple_keys(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 1, 2], "b": [4, 4, 6], "c": [7, 2, 1]} - df = nw.from_native(pyspark_constructor(data)).lazy() - result = ( - df.group_by("a", "b") - .agg( - c_min=nw.col("c").min(), - c_max=nw.col("c").max(), - ) - .collect() - .sort("a") - ) - expected = { - "a": [1, 2], - "b": [4, 6], - "c_min": [2, 1], - "c_max": [7, 1], - } - assert_equal_data(result, expected) - - -# copied from tests/group_by_test.py -@pytest.mark.parametrize( - ("attr", "ddof"), - [ - ("std", 0), - ("var", 0), - ("std", 2), - ("var", 2), - ], -) -def test_group_by_depth_1_std_var( - pyspark_constructor: Constructor, - attr: str, - ddof: int, -) -> None: - data = {"a": [1, 1, 1, 2, 2, 2], "b": [4, 5, 6, 0, 5, 5]} - _pow = 0.5 if attr == "std" else 1 - expected = { - "a": [1, 2], - "b": [ - (sum((v - 5) ** 2 for v in [4, 5, 6]) / (3 - ddof)) ** _pow, - (sum((v - 10 / 3) ** 2 for v in [0, 5, 5]) / (3 - ddof)) ** _pow, - ], - } - expr = getattr(nw.col("b"), attr)(ddof=ddof) - result = nw.from_native(pyspark_constructor(data)).group_by("a").agg(expr).sort("a") - assert_equal_data(result, expected) - - -# copied from tests/frame/drop_nulls_test.py -def test_drop_nulls(pyspark_constructor: Constructor) -> None: - data = { - "a": [1.0, 2.0, None, 4.0], - "b": [None, 3.0, None, 5.0], - } - - result = nw.from_native(pyspark_constructor(data)).drop_nulls() - expected = { - "a": [2.0, 4.0], - "b": [3.0, 5.0], - } - assert_equal_data(result, expected) - - -@pytest.mark.parametrize( - ("subset", "expected"), - [ - ("a", {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), - (["a"], {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), - (["a", "b"], {"a": [2.0, 4.0], "b": [3.0, 5.0]}), - ], -) -def test_drop_nulls_subset( - pyspark_constructor: Constructor, - subset: str | list[str], - expected: dict[str, float], -) -> None: - data = { - "a": [1.0, 2.0, None, 4.0], - "b": [None, 3.0, None, 5.0], - } - - result = 
nw.from_native(pyspark_constructor(data)).drop_nulls(subset=subset) - assert_equal_data(result, expected) - - -# copied from tests/frame/rename_test.py -def test_rename(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.rename({"a": "x", "b": "y"}) - expected = {"x": [1, 3, 2], "y": [4, 4, 6], "z": [7.0, 8, 9]} - assert_equal_data(result, expected) - - -# adapted from tests/frame/unique_test.py -@pytest.mark.parametrize("subset", ["b", ["b"]]) -@pytest.mark.parametrize( - ("keep", "expected"), - [ - ("first", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}), - ("last", {"a": [3, 2], "b": [4, 6], "z": [8.0, 9.0]}), - ("any", {"a": [1, 2], "b": [4, 6], "z": [7.0, 9.0]}), - ("none", {"a": [2], "b": [6], "z": [9]}), - ], -) -def test_unique( - pyspark_constructor: Constructor, - subset: str | list[str] | None, - keep: str, - expected: dict[str, list[float]], -) -> None: - if keep == "any": - context: Any = does_not_raise() - elif keep == "none": - context = pytest.raises( - ValueError, - match=r"`LazyFrame.unique` with PySpark backend only supports `keep='any'`.", - ) - else: - context = pytest.raises(ValueError, match=f": {keep}") - - with context: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - - result = df.unique(subset, keep=keep).sort("z") # type: ignore[arg-type] - assert_equal_data(result, expected) - - -def test_unique_none(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - result = df.unique().sort("z") - assert_equal_data(result, data) - - -def test_inner_join_two_keys(pyspark_constructor: Constructor) -> None: - data = { - "antananarivo": [1, 3, 2], - "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], - "idx": [0, 1, 2], - } - df = nw.from_native(pyspark_constructor(data)) - df_right = 
nw.from_native(pyspark_constructor(data)) - result = df.join( - df_right, # type: ignore[arg-type] - left_on=["antananarivo", "bob"], - right_on=["antananarivo", "bob"], - how="inner", - ) - result = result.sort("idx").drop("idx_right") - - df = nw.from_native(pyspark_constructor(data)) - df_right = nw.from_native(pyspark_constructor(data)) - - result_on = df.join(df_right, on=["antananarivo", "bob"], how="inner") # type: ignore[arg-type] - result_on = result_on.sort("idx").drop("idx_right") - expected = { - "antananarivo": [1, 3, 2], - "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], - "idx": [0, 1, 2], - "zorro_right": [7.0, 8, 9], - } - assert_equal_data(result, expected) - assert_equal_data(result_on, expected) - - -def test_inner_join_single_key(pyspark_constructor: Constructor) -> None: - data = { - "antananarivo": [1, 3, 2], - "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], - "idx": [0, 1, 2], - } - df = nw.from_native(pyspark_constructor(data)) - df_right = nw.from_native(pyspark_constructor(data)) - result = ( - df.join( - df_right, # type: ignore[arg-type] - left_on="antananarivo", - right_on="antananarivo", - how="inner", - ) - .sort("idx") - .drop("idx_right") - ) - - df = nw.from_native(pyspark_constructor(data)) - df_right = nw.from_native(pyspark_constructor(data)) - result_on = ( - df.join( - df_right, # type: ignore[arg-type] - on="antananarivo", - how="inner", - ) - .sort("idx") - .drop("idx_right") - ) - - expected = { - "antananarivo": [1, 3, 2], - "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], - "idx": [0, 1, 2], - "bob_right": [4, 4, 6], - "zorro_right": [7.0, 8, 9], - } - assert_equal_data(result, expected) - assert_equal_data(result_on, expected) - - -def test_cross_join(pyspark_constructor: Constructor) -> None: - data = {"antananarivo": [1, 3, 2]} - df = nw.from_native(pyspark_constructor(data)) - other = nw.from_native(pyspark_constructor(data)) - result = df.join(other, how="cross").sort("antananarivo", "antananarivo_right") # type: ignore[arg-type] - 
expected = { - "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "antananarivo_right": [1, 2, 3, 1, 2, 3, 1, 2, 3], - } - assert_equal_data(result, expected) - - with pytest.raises( - ValueError, - match="Can not pass `left_on`, `right_on` or `on` keys for cross join", - ): - df.join(other, how="cross", left_on="antananarivo") # type: ignore[arg-type] - - -@pytest.mark.parametrize("how", ["inner", "left"]) -@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) -def test_suffix(pyspark_constructor: Constructor, how: str, suffix: str) -> None: - data = { - "antananarivo": [1, 3, 2], - "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], - } - df = nw.from_native(pyspark_constructor(data)) - df_right = nw.from_native(pyspark_constructor(data)) - result = df.join( - df_right, # type: ignore[arg-type] - left_on=["antananarivo", "bob"], - right_on=["antananarivo", "bob"], - how=how, # type: ignore[arg-type] - suffix=suffix, - ) - result_cols = result.collect_schema().names() - assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"] - - -@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) -def test_cross_join_suffix(pyspark_constructor: Constructor, suffix: str) -> None: - data = {"antananarivo": [1, 3, 2]} - df = nw.from_native(pyspark_constructor(data)) - other = nw.from_native(pyspark_constructor(data)) - result = df.join(other, how="cross", suffix=suffix).sort( # type: ignore[arg-type] - "antananarivo", f"antananarivo{suffix}" - ) - expected = { - "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], - f"antananarivo{suffix}": [1, 2, 3, 1, 2, 3, 1, 2, 3], - } - assert_equal_data(result, expected) - - -@pytest.mark.parametrize( - ("join_key", "filter_expr", "expected"), - [ - ( - ["antananarivo", "bob"], - (nw.col("bob") < 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, - ), - (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zorro": [9]}), - ( - ["bob"], - (nw.col("bob") > 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": 
[7.0, 8.0]}, - ), - ], -) -def test_anti_join( - pyspark_constructor: Constructor, - join_key: list[str], - filter_expr: nw.Expr, - expected: dict[str, list[Any]], -) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - other = df.filter(filter_expr) - result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type] - assert_equal_data(result, expected) - - -@pytest.mark.parametrize( - ("join_key", "filter_expr", "expected"), - [ - ( - "antananarivo", - (nw.col("bob") > 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, - ), - ( - ["antananarivo"], - (nw.col("bob") > 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, - ), - ( - ["bob"], - (nw.col("bob") < 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, - ), - ( - ["antananarivo", "bob"], - (nw.col("bob") < 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, - ), - ], -) -def test_semi_join( - pyspark_constructor: Constructor, - join_key: list[str], - filter_expr: nw.Expr, - expected: dict[str, list[Any]], -) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} - df = nw.from_native(pyspark_constructor(data)) - other = df.filter(filter_expr) - result = df.join(other, how="semi", left_on=join_key, right_on=join_key).sort( # type: ignore[arg-type] - "antananarivo" - ) - assert_equal_data(result, expected) - - -def test_left_join(pyspark_constructor: Constructor) -> None: - data_left = { - "antananarivo": [1.0, 2, 3], - "bob": [4.0, 5, 6], - "idx": [0.0, 1.0, 2.0], - } - data_right = { - "antananarivo": [1.0, 2, 3], - "co": [4.0, 5, 7], - "idx": [0.0, 1.0, 2.0], - } - df_left = nw.from_native(pyspark_constructor(data_left)) - df_right = nw.from_native(pyspark_constructor(data_right)) - result = ( - df_left.join(df_right, left_on="bob", right_on="co", how="left") # type: ignore[arg-type] - .sort("idx") - .drop("idx_right") - ) - expected 
= { - "antananarivo": [1, 2, 3], - "bob": [4, 5, 6], - "idx": [0, 1, 2], - "antananarivo_right": [1, 2, None], - } - assert_equal_data(result, expected) - - df_left = nw.from_native(pyspark_constructor(data_left)) - df_right = nw.from_native(pyspark_constructor(data_right)) - result_on_list = df_left.join( - df_right, # type: ignore[arg-type] - on=["antananarivo", "idx"], - how="left", - ) - result_on_list = result_on_list.sort("idx") - expected_on_list = { - "antananarivo": [1, 2, 3], - "bob": [4, 5, 6], - "idx": [0, 1, 2], - "co": [4, 5, 7], - } - assert_equal_data(result_on_list, expected_on_list) - - -def test_left_join_multiple_column(pyspark_constructor: Constructor) -> None: - data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} - data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "idx": [0, 1, 2]} - df_left = nw.from_native(pyspark_constructor(data_left)) - df_right = nw.from_native(pyspark_constructor(data_right)) - result = ( - df_left.join( - df_right, # type: ignore[arg-type] - left_on=["antananarivo", "bob"], - right_on=["antananarivo", "c"], - how="left", - ) - .sort("idx") - .drop("idx_right") - ) - expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} - assert_equal_data(result, expected) - - -def test_left_join_overlapping_column(pyspark_constructor: Constructor) -> None: - data_left = { - "antananarivo": [1.0, 2, 3], - "bob": [4.0, 5, 6], - "d": [1.0, 4, 2], - "idx": [0.0, 1.0, 2.0], - } - data_right = { - "antananarivo": [1.0, 2, 3], - "c": [4.0, 5, 6], - "d": [1.0, 4, 2], - "idx": [0.0, 1.0, 2.0], - } - df_left = nw.from_native(pyspark_constructor(data_left)) - df_right = nw.from_native(pyspark_constructor(data_right)) - result = df_left.join(df_right, left_on="bob", right_on="c", how="left").sort("idx") # type: ignore[arg-type] - result = result.drop("idx_right") - expected: dict[str, list[Any]] = { - "antananarivo": [1, 2, 3], - "bob": [4, 5, 6], - "d": [1, 4, 2], - "idx": [0, 1, 2], - 
"antananarivo_right": [1, 2, 3], - "d_right": [1, 4, 2], - } - assert_equal_data(result, expected) - - df_left = nw.from_native(pyspark_constructor(data_left)) - df_right = nw.from_native(pyspark_constructor(data_right)) - result = ( - df_left.join( - df_right, # type: ignore[arg-type] - left_on="antananarivo", - right_on="d", - how="left", - ) - .sort("idx") - .drop("idx_right") - ) - expected = { - "antananarivo": [1, 2, 3], - "bob": [4, 5, 6], - "d": [1, 4, 2], - "idx": [0, 1, 2], - "antananarivo_right": [1.0, 3.0, None], - "c": [4.0, 6.0, None], - } - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/arithmetic_test.py -@pytest.mark.parametrize( - ("attr", "rhs", "expected"), - [ - ("__add__", 1, [2, 3, 4]), - ("__sub__", 1, [0, 1, 2]), - ("__mul__", 2, [2, 4, 6]), - ("__truediv__", 2.0, [0.5, 1.0, 1.5]), - ("__truediv__", 1, [1, 2, 3]), - ("__floordiv__", 2, [0, 1, 1]), - ("__mod__", 2, [1, 0, 1]), - ("__pow__", 2, [1, 4, 9]), - ], -) -def test_arithmetic_expr( - attr: str, rhs: Any, expected: list[Any], pyspark_constructor: Constructor -) -> None: - data = {"a": [1.0, 2, 3]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(getattr(nw.col("a"), attr)(rhs)) - assert_equal_data(result, {"a": expected}) - - -@pytest.mark.parametrize( - ("attr", "rhs", "expected"), - [ - ("__radd__", 1, [2, 3, 4]), - ("__rsub__", 1, [0, -1, -2]), - ("__rmul__", 2, [2, 4, 6]), - ("__rtruediv__", 2.0, [2, 1, 2 / 3]), - ("__rfloordiv__", 2, [2, 1, 0]), - ("__rmod__", 2, [0, 0, 2]), - ("__rpow__", 2, [2, 4, 8]), - ], -) -def test_right_arithmetic_expr( - attr: str, - rhs: Any, - expected: list[Any], - pyspark_constructor: Constructor, -) -> None: - data = {"a": [1, 2, 3]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(getattr(nw.col("a"), attr)(rhs)) - assert_equal_data(result, {"literal": expected}) - - -# Copied from tests/expr_and_series/median_test.py -def test_median(pyspark_constructor: Constructor) -> None: 
- data = {"a": [3, 8, 2, None], "b": [5, 5, None, 7], "z": [7.0, 8, 9, None]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select( - a=nw.col("a").median(), b=nw.col("b").median(), z=nw.col("z").median() - ) - expected = {"a": [3.0], "b": [5.0], "z": [8.0]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/clip_test.py -def test_clip(pyspark_constructor: Constructor) -> None: - df = nw.from_native(pyspark_constructor({"a": [1, 2, 3, -4, 5]})) - result = df.select( - lower_only=nw.col("a").clip(lower_bound=3), - upper_only=nw.col("a").clip(upper_bound=4), - both=nw.col("a").clip(3, 4), - ) - expected = { - "lower_only": [3, 3, 3, 3, 5], - "upper_only": [1, 2, 3, -4, 4], - "both": [3, 3, 3, 3, 4], - } - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/is_between_test.py -@pytest.mark.parametrize( - ("closed", "expected"), - [ - ("left", [True, True, True, False]), - ("right", [False, True, True, True]), - ("both", [True, True, True, True]), - ("none", [False, True, True, False]), - ], -) -def test_is_between( - pyspark_constructor: Constructor, - closed: Literal["left", "right", "none", "both"], - expected: list[bool], -) -> None: - data = {"a": [1, 4, 2, 5]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(nw.col("a").is_between(1, 5, closed=closed)) - expected_dict = {"a": expected} - assert_equal_data(result, expected_dict) - - -# copied from tests/expr_and_series/is_duplicated_test.py -def test_is_duplicated(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 1, 2, None], "b": [1, 2, None, None], "level_0": [0, 1, 2, 3]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select( - a=nw.col("a").is_duplicated(), - b=nw.col("b").is_duplicated(), - level_0=nw.col("level_0"), - ).sort("level_0") - expected = { - "a": [True, True, False, False], - "b": [False, False, True, True], - "level_0": [0, 1, 2, 3], - } - assert_equal_data(result, expected) - - 
-# copied from tests/expr_and_series/is_finite_test.py -def test_is_finite(pyspark_constructor: Constructor) -> None: - data = {"a": [float("nan"), float("inf"), 2.0, None]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(finite=nw.col("a").is_finite()) - expected = {"finite": [False, False, True, False]} - assert_equal_data(result, expected) - - -def test_is_in(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 2, 3, 4, 5]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select(in_list=nw.col("a").is_in([2, 4])) - expected = {"in_list": [False, True, False, True, False]} - assert_equal_data(result, expected) - - -# copied from tests/expr_and_series/is_unique_test.py -def test_is_unique(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 1, 2, None], "b": [1, 2, None, None], "level_0": [0, 1, 2, 3]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select( - a=nw.col("a").is_unique(), - b=nw.col("b").is_unique(), - level_0=nw.col("level_0"), - ).sort("level_0") - expected = { - "a": [False, False, True, True], - "b": [True, True, False, False], - "level_0": [0, 1, 2, 3], - } - assert_equal_data(result, expected) - - -def test_len(pyspark_constructor: Constructor) -> None: - data = {"a": [1, 2, float("nan"), 4, None], "b": [None, 3, None, 5, None]} - df = nw.from_native(pyspark_constructor(data)) - result = df.select( - a=nw.col("a").len(), - b=nw.col("b").len(), - ) - expected = {"a": [5], "b": [5]} - assert_equal_data(result, expected) - - -# Copied from tests/expr_and_series/round_test.py -@pytest.mark.parametrize("decimals", [0, 1, 2]) -def test_round(pyspark_constructor: Constructor, decimals: int) -> None: - data = {"a": [2.12345, 2.56789, 3.901234]} - df = nw.from_native(pyspark_constructor(data)) - - expected_data = {k: [round(e, decimals) for e in v] for k, v in data.items()} - result_frame = df.select(nw.col("a").round(decimals)) - assert_equal_data(result_frame, expected_data) - - -# 
copied from tests/expr_and_series/skew_test.py -@pytest.mark.parametrize( - ("data", "expected"), - [ - pytest.param( - [], - None, - marks=pytest.mark.skip( - reason="PySpark cannot infer schema from empty datasets" - ), - ), - ([1], None), - ([1, 2], 0.0), - ([0.0, 0.0, 0.0], None), - ([1, 2, 3, 2, 1], 0.343622), - ], -) -def test_skew( - pyspark_constructor: Constructor, data: list[float], expected: float | None -) -> None: - df = nw.from_native(pyspark_constructor({"a": data})) - result = df.select(skew=nw.col("a").skew()) - assert_equal_data(result, {"skew": [expected]}) - - -# copied from tests/expr_and_series/list_test.py -@pytest.mark.parametrize( - ("dtype", "expected_lit"), - [(None, [2, 2, 2]), (nw.String, ["2", "2", "2"]), (nw.Float32, [2.0, 2.0, 2.0])], -) -def test_lit( - pyspark_constructor: Constructor, - dtype: DType | None, - expected_lit: list[Any], - request: pytest.FixtureRequest, -) -> None: - if dtype is not None: - request.applymarker(pytest.mark.xfail) - data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} - df_raw = pyspark_constructor(data) - df = nw.from_native(df_raw).lazy() - result = df.with_columns(nw.lit(2, dtype).alias("lit")) - expected = { - "a": [1, 3, 2], - "b": [4, 4, 6], - "z": [7.0, 8.0, 9.0], - "lit": expected_lit, - } - assert_equal_data(result, expected) - - -@pytest.mark.parametrize( - ("col_name", "expr", "expected_result"), - [ - ("left_lit", nw.lit(1) + nw.col("a"), [2, 4, 3]), - ("right_lit", nw.col("a") + nw.lit(1), [2, 4, 3]), - ("left_lit_with_agg", nw.lit(1) + nw.col("a").mean(), [3]), - ("right_lit_with_agg", nw.col("a").mean() - nw.lit(1), [1]), - ("left_scalar", 1 + nw.col("a"), [2, 4, 3]), - ("right_scalar", nw.col("a") + 1, [2, 4, 3]), - ("left_scalar_with_agg", 1 + nw.col("a").mean(), [3]), - ("right_scalar_with_agg", nw.col("a").mean() - 1, [1]), - ], -) -def test_lit_operation( - pyspark_constructor: Constructor, - col_name: str, - expr: nw.Expr, - expected_result: list[int], - request: 
pytest.FixtureRequest, -) -> None: - if col_name in ( - "left_scalar_with_agg", - "left_lit_with_agg", - "right_lit", - "right_lit_with_agg", - ): - request.applymarker(pytest.mark.xfail) - - data = {"a": [1, 3, 2]} - df_raw = pyspark_constructor(data) - df = nw.from_native(df_raw).lazy() - result = df.select(expr.alias(col_name)) - expected = {col_name: expected_result} - assert_equal_data(result, expected) diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py index c3d028563..862c5966f 100644 --- a/tests/stable_api_test.py +++ b/tests/stable_api_test.py @@ -16,7 +16,7 @@ def test_renamed_taxicab_norm( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "duckdb" in str(constructor): + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): request.applymarker(pytest.mark.xfail) # Suppose we need to rename `_l1_norm` to `_taxicab_norm`. # We need `narwhals.stable.v1` to stay stable. So, we @@ -46,10 +46,15 @@ def test_renamed_taxicab_norm( assert_equal_data(result, expected) -def test_renamed_taxicab_norm_dataframe(constructor: Constructor) -> None: +def test_renamed_taxicab_norm_dataframe( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: # Suppose we have `DataFrame._l1_norm` in `stable.v1`, but remove it # in the main namespace. Here, we check that it's still usable from # the stable api. + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + def func(df_any: Any) -> Any: df = nw_v1.from_native(df_any) df = df._l1_norm() @@ -60,10 +65,16 @@ def func(df_any: Any) -> Any: assert_equal_data(result, expected) -def test_renamed_taxicab_norm_dataframe_narwhalify(constructor: Constructor) -> None: +def test_renamed_taxicab_norm_dataframe_narwhalify( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: # Suppose we have `DataFrame._l1_norm` in `stable.v1`, but remove it # in the main namespace. 
Here, we check that it's still usable from # the stable api when using `narwhalify`. + + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + @nw_v1.narwhalify def func(df: Any) -> Any: return df._l1_norm() @@ -136,7 +147,10 @@ def test_series_docstrings() -> None: ), item -def test_dtypes(constructor: Constructor) -> None: +def test_dtypes(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw_v1.from_native( constructor({"a": [1], "b": [datetime(2020, 1, 1)], "c": [timedelta(1)]}) )