diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml index 723fa6e80..ce7da6f8e 100644 --- a/.github/workflows/check_tpch_queries.yml +++ b/.github/workflows/check_tpch_queries.yml @@ -25,7 +25,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: local-install - run: uv pip install -e ".[dev, core, dask]" --system + run: uv pip install -U --pre -e ".[dev, core, dask]" --system - name: generate-data run: cd tpch && python generate_data.py - name: tpch-tests diff --git a/.github/workflows/downstream_tests.yml b/.github/workflows/downstream_tests.yml index 548251ddc..5ad95b6d9 100644 --- a/.github/workflows/downstream_tests.yml +++ b/.github/workflows/downstream_tests.yml @@ -220,7 +220,7 @@ jobs: run: | cd tea-tasting pdm remove narwhals - pdm add ./.. + pdm add ./..[dev] - name: show-deps run: | cd tea-tasting diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index 91563d840..0e7e6a205 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -61,7 +61,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-pretty-old-versions - run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.3.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system + run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system - name: install-reqs run: uv pip install -e ".[dev]" --system - name: show-deps @@ -75,7 +75,7 @@ jobs: echo "$DEPS" | grep 'polars==0.20.3' echo "$DEPS" | grep 'numpy==1.17.5' echo "$DEPS" | grep 'pyarrow==11.0.0' - echo "$DEPS" | grep 'pyspark==3.3.0' + echo "$DEPS" | grep 'pyspark==3.5.0' echo "$DEPS" | grep 'scipy==1.5.0' echo "$DEPS" | grep 'scikit-learn==1.1.0' - name: Run pytest @@ -84,7 +84,7 @@ jobs: not_so_old_versions: strategy: matrix: - python-version: ["3.9"] + python-version: ["3.10"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -99,7 +99,7 @@ jobs: cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "pyproject.toml" - name: install-not-so-old-versions - run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system + run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system - name: install-reqs run: uv pip install -e ".[dev]" --system - name: show-deps @@ -110,11 +110,11 @@ jobs: echo "$DEPS" | grep 'pandas==2.0.3' echo "$DEPS" | grep 'polars==0.20.8' echo "$DEPS" | grep 'numpy==1.24.4' - echo "$DEPS" | grep 'pyarrow==14.0.0' - echo "$DEPS" | grep 'pyspark==3.4.0' + echo "$DEPS" | grep 'pyarrow==15.0.0' + echo "$DEPS" | grep 'pyspark==3.5.0' echo "$DEPS" | grep 'scipy==1.8.0' echo "$DEPS" | grep 'scikit-learn==1.3.0' - echo "$DEPS" | grep 'dask==2024.7' + echo "$DEPS" | grep 'dask==2024.10' - name: Run pytest run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask @@ -158,8 +158,6 @@ jobs: run: | uv pip uninstall pyarrow --system uv pip install 
--extra-index-url https://pypi.fury.io/arrow-nightlies/ --pre pyarrow --system - - name: show-deps - run: uv pip freeze - name: install numpy nightly run: | uv pip uninstall numpy --system @@ -167,18 +165,22 @@ jobs: - name: install dask run: | uv pip uninstall dask dask-expr --system - python -m pip install git+https://github.com/dask/distributed git+https://github.com/dask/dask git+https://github.com/dask/dask-expr + python -m pip install git+https://github.com/dask/distributed git+https://github.com/dask/dask + - name: install duckdb + run: | + python -m pip install -U --pre duckdb - name: show-deps run: uv pip freeze - name: Assert nightlies dependencies run: | DEPS=$(uv pip freeze) - echo "$DEPS" | grep 'polars' + echo "$DEPS" | grep 'polars.*@' echo "$DEPS" | grep 'pandas.*dev' echo "$DEPS" | grep 'pyarrow.*dev' - echo "$DEPS" | grep 'numpy' - echo "$DEPS" | grep 'dask' + echo "$DEPS" | grep 'numpy.*dev' + echo "$DEPS" | grep 'dask.*@' + echo "$DEPS" | grep 'duckdb.*dev' - name: Run pytest run: | pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow \ - --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,polars[eager],polars[lazy],dask + --constructors=pandas,pandas[nullable],pandas[pyarrow],pyarrow,polars[eager],polars[lazy],dask,duckdb diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 3f015d405..bb46b4f0d 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -50,6 +50,10 @@ jobs: cache-dependency-glob: "pyproject.toml" - name: install-reqs run: uv pip install -e ".[dev, core, extra, dask, modin]" --system + - name: install pyspark + run: uv pip install -e ".[pyspark]" --system + # PySpark is not yet available on Python3.12+ + if: matrix.python-version != '3.12' - name: show-deps run: uv pip freeze - name: Run pytest @@ -59,7 +63,7 @@ jobs: pytest-full-coverage: strategy: matrix: - python-version: ["3.9", "3.11", "3.13"] + python-version: ["3.11", "3.13"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -78,7 +82,7 @@ jobs: - name: install pyspark run: uv pip install -e ".[pyspark]" --system # PySpark is not yet available on Python3.12+ - if: matrix.python-version == '3.9' || matrix.python-version == '3.11' + if: matrix.python-version != '3.13' - name: install ibis run: uv pip install -e ".[ibis]" --system # Ibis puts upper bounds on dependencies, and requires Python3.10+, diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb149f30b..5e6929f16 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.8.1' + rev: 'v0.8.6' hooks: # Run the formatter. - id: ruff-format @@ -14,7 +14,7 @@ repos: alias: check-docstrings entry: python utils/check_docstrings.py - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: 'v1.14.1' hooks: - id: mypy additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2'] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f8a6eb0b..af0eb1cbc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -78,6 +78,10 @@ where `YOUR-GITHUB-USERNAME` will be your GitHub user name. Here's how you can set up your local development environment to contribute. +#### Prerequisites for PySpark tests + +If you want to run PySpark-related tests, you'll need to have Java installed. Refer to the [Spark documentation](https://spark.apache.org/docs/latest/#downloading) for more information. + #### Option 1: Use UV (recommended) 1. 
Make sure you have Python3.12 installed, create a virtual environment, diff --git a/README.md b/README.md index bb024c6c2..eee90ebd9 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,7 @@ Extremely lightweight and extensible compatibility layer between dataframe libraries! - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow -- **Lazy-only support**: Dask -- **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol +- **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark. Seamlessly support all, without depending on any! diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 299ab2d4a..e0f7b6578 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -47,6 +47,7 @@ - over - pipe - quantile + - rank - replace_strict - rolling_mean - rolling_std diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index c2e35a3c5..0aea494f7 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -54,6 +54,7 @@ - null_count - pipe - quantile + - rank - rename - replace_strict - rolling_mean diff --git a/docs/backcompat.md b/docs/backcompat.md index 55b927fd8..b2d312e0a 100644 --- a/docs/backcompat.md +++ b/docs/backcompat.md @@ -111,6 +111,10 @@ before making any change. ### After `stable.v1` + +- Since Narwhals 1.21, passing a `DuckDBPyRelation` to `from_native` returns a `LazyFrame`. In + `narwhals.stable.v1`, it returns a `DataFrame` with `level='interchange'`. + - Since Narwhals 1.15, `Series` is generic in the native Series, meaning that you can write: ```python diff --git a/docs/basics/dataframe_conversion.md b/docs/basics/dataframe_conversion.md index 690f5d093..bdf803a23 100644 --- a/docs/basics/dataframe_conversion.md +++ b/docs/basics/dataframe_conversion.md @@ -14,6 +14,7 @@ To illustrate, we create dataframes in various formats: ```python exec="1" source="above" session="conversion" import narwhals as nw from narwhals.typing import IntoDataFrame +from typing import Any import duckdb import polars as pl @@ -45,11 +46,17 @@ print(df_to_pandas(df_polars)) ### Via PyCapsule Interface -Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals. +Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe +which implements `__arrow_c_stream__`: ```python exec="1" source="above" session="conversion" result="python" -def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: - return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native() +def df_to_polars(df_native: Any) -> pl.DataFrame: + if hasattr(df_native, "__arrow_c_stream__"): + return nw.from_arrow(df_native, native_namespace=pl).to_native() + msg = ( + f"Expected object which implements '__arrow_c_stream__', got: {type(df_native)}" + ) + raise TypeError(msg) print(df_to_polars(df_duckdb)) # You can only execute this line of code once. 
@@ -66,8 +73,9 @@ If you need to ingest the same dataframe multiple times, then you may want to go This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving: ```python exec="1" source="above" session="conversion" result="python" -def df_to_polars(df: IntoDataFrame) -> pl.DataFrame: - return pl.DataFrame(nw.from_native(df).to_arrow()) +def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame: + df = nw.from_native(df_native).lazy().collect() + return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow()) df_duckdb = duckdb.sql("SELECT * FROM df_polars") diff --git a/docs/css/extra.css b/docs/css/extra.css index 6d19bea30..9aee280d0 100644 --- a/docs/css/extra.css +++ b/docs/css/extra.css @@ -2,3 +2,7 @@ .md-typeset ul li { margin-bottom: 0.1em !important; } +.md-main__inner.md-grid { + max-width: initial; + margin-left: 5vw; +} diff --git a/docs/extending.md b/docs/extending.md index 2a8953987..588e234f4 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -15,17 +15,16 @@ Currently, Narwhals has **full API** support for the following libraries: It also has **lazy-only** support for [Dask](https://github.com/dask/dask), and **interchange** support for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis). +We are working towards full "lazy-only" support for DuckDB, Ibis, and PySpark. + ### Levels of support Narwhals comes with three levels of support: - **Full API support**: cuDF, Modin, pandas, Polars, PyArrow -- **Lazy-only support**: Dask +- **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark. - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol -The lazy-only layer is a major item on our 2025 roadmap, and hope to be able to bring libraries currently in -the "interchange" level into that one. - Libraries for which we have full support can benefit from the whole [Narwhals API](./api-reference/index.md). diff --git a/docs/installation.md b/docs/installation.md index a406b6295..8857b8029 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute: ```python >>> import narwhals >>> narwhals.__version__ -'1.20.1' +'1.21.1' ``` If you see the version number, then the installation was successful! 
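As a usage sketch for the revised `df_to_polars` helper in the `docs/basics/dataframe_conversion.md` hunks above: the helper now collects to an eager frame first and then round-trips through PyArrow, so it also accepts lazy inputs and can be called repeatedly. The block below is illustrative only and not part of the diff; the pandas input frame is a hypothetical example, and it assumes pandas, pyarrow, and polars are installed.

```python
# Illustrative sketch (not part of the diff): exercising the revised
# df_to_polars helper from docs/basics/dataframe_conversion.md.
import narwhals as nw
import pandas as pd  # hypothetical example input; any IntoDataFrame works
import polars as pl
from narwhals.typing import IntoDataFrame


def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame:
    # Collect to an eager narwhals DataFrame first (this also handles lazy
    # inputs), then convert via PyArrow. As the docs note, this always
    # requires PyArrow but is more forgiving than the PyCapsule route.
    df = nw.from_native(df_native).lazy().collect()
    return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow())


print(df_to_polars(pd.DataFrame({"a": [1, 2, 3]})))
```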
diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 646a6a769..ac0a8cbe4 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -79,7 +79,7 @@ from narwhals.utils import maybe_reset_index from narwhals.utils import maybe_set_index -__version__ = "1.20.1" +__version__ = "1.21.1" __all__ = [ "Array", diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index c0efa50fe..e6bb6fa65 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -16,12 +16,14 @@ from narwhals._arrow.utils import validate_dataframe_comparand from narwhals._expression_parsing import evaluate_into_exprs from narwhals.dependencies import is_numpy_array +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop from narwhals.utils import scale_bytes +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -56,6 +58,7 @@ def __init__( self._implementation = Implementation.PYARROW self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_namespace__(self: Self) -> ArrowNamespace: from narwhals._arrow.namespace import ArrowNamespace @@ -98,23 +101,14 @@ def row(self: Self, index: int) -> tuple[Any, ...]: return tuple(col[index] for col in self._native_frame) @overload - def rows( - self: Self, - *, - named: Literal[True], - ) -> list[dict[str, Any]]: ... + def rows(self: Self, *, named: Literal[True]) -> list[dict[str, Any]]: ... @overload - def rows( - self: Self, - *, - named: Literal[False], - ) -> list[tuple[Any, ...]]: ... + def rows(self: Self, *, named: Literal[False]) -> list[tuple[Any, ...]]: ... + @overload def rows( - self: Self, - *, - named: bool, + self: Self, *, named: bool ) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ... 
def rows(self: Self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: @@ -123,10 +117,7 @@ def rows(self: Self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, A return self._native_frame.to_pylist() # type: ignore[no-any-return] def iter_rows( - self: Self, - *, - named: bool, - buffer_size: int, + self: Self, *, named: bool, buffer_size: int ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: df = self._native_frame num_rows = df.num_rows @@ -260,9 +251,7 @@ def __getitem__( ) start = item.start or 0 stop = item.stop if item.stop is not None else len(self._native_frame) - return self._from_native_frame( - self._native_frame.slice(start, stop - start), - ) + return self._from_native_frame(self._native_frame.slice(start, stop - start)) elif isinstance(item, Sequence) or (is_numpy_array(item) and item.ndim == 1): if ( @@ -298,11 +287,7 @@ def estimated_size(self: Self, unit: SizeUnit) -> int | float: def columns(self: Self) -> list[str]: return self._native_frame.schema.names # type: ignore[no-any-return] - def select( - self: Self, - *exprs: IntoArrowExpr, - **named_exprs: IntoArrowExpr, - ) -> Self: + def select(self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr) -> Self: import pyarrow as pa new_series = evaluate_into_exprs(self, *exprs, **named_exprs) @@ -310,16 +295,11 @@ def select( # return empty dataframe, like Polars does return self._from_native_frame(self._native_frame.__class__.from_arrays([])) names = [s.name for s in new_series] - df = pa.Table.from_arrays( - broadcast_series(new_series), - names=names, - ) + df = pa.Table.from_arrays(broadcast_series(new_series), names=names) return self._from_native_frame(df) def with_columns( - self: Self, - *exprs: IntoArrowExpr, - **named_exprs: IntoArrowExpr, + self: Self, *exprs: IntoArrowExpr, **named_exprs: IntoArrowExpr ) -> Self: native_frame = self._native_frame new_columns = evaluate_into_exprs(self, *exprs, **named_exprs) @@ -331,9 +311,7 @@ def with_columns( col_name = col_value.name column = validate_dataframe_comparand( - length=length, - other=col_value, - backend_version=self._backend_version, + length=length, other=col_value, backend_version=self._backend_version ) native_frame = ( @@ -608,12 +586,9 @@ def is_duplicated(self: Self) -> ArrowSeries: columns = self.columns index_token = generate_temporary_column_name(n_bytes=8, columns=columns) col_token = generate_temporary_column_name( - n_bytes=8, - columns=[*columns, index_token], + n_bytes=8, columns=[*columns, index_token] ) - df = self.with_row_index(index_token)._native_frame - row_count = ( df.append_column(col_token, pa.repeat(pa.scalar(1), len(self))) .group_by(columns) @@ -667,6 +642,9 @@ def unique( import pyarrow.compute as pc df = self._native_frame + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) subset = subset or self.columns if keep in {"any", "first", "last"}: diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 04c5c930e..df5c95367 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -87,8 +87,7 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: except KeyError as e: missing_columns = [x for x in column_names if x not in df.columns] raise ColumnNotFoundError.from_missing_and_available_column_names( - missing_columns=missing_columns, - available_columns=df.columns, + missing_columns=missing_columns, available_columns=df.columns ) from e return cls( @@ -163,66 
+162,30 @@ def __lt__(self: Self, other: ArrowExpr | Any) -> Self: def __and__(self: Self, other: ArrowExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__and__", other=other) - def __rand__(self: Self, other: ArrowExpr | bool | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__and__(self) # type: ignore[return-value] - def __or__(self: Self, other: ArrowExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__or__", other=other) - def __ror__(self: Self, other: ArrowExpr | bool | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__or__(self) # type: ignore[return-value] - def __add__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__add__", other=other) - def __radd__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__add__(self) # type: ignore[return-value] - def __sub__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__sub__", other=other) - def __rsub__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__sub__(self) # type: ignore[return-value] - def __mul__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__mul__", other=other) - def __rmul__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mul__(self) # type: ignore[return-value] - def __pow__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__pow__", other=other) - def __rpow__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__pow__(self) # type: ignore[return-value] - def __floordiv__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__floordiv__", other=other) - def __rfloordiv__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__floordiv__(self) # type: ignore[return-value] - def __truediv__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__truediv__", other=other) - def __rtruediv__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__truediv__(self) # type: ignore[return-value] - def __mod__(self: Self, other: ArrowExpr | Any) -> Self: return reuse_series_implementation(self, "__mod__", other=other) - def __rmod__(self: Self, other: ArrowExpr | Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mod__(self) # type: ignore[return-value] - def __invert__(self: Self) -> Self: return reuse_series_implementation(self, "__invert__") @@ -320,7 +283,12 @@ def is_null(self: Self) -> Self: def is_nan(self: Self) -> Self: return reuse_series_implementation(self, "is_nan") - def is_between(self: Self, lower_bound: Any, upper_bound: Any, closed: str) -> Self: + def is_between( + self: Self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], + ) -> Self: return reuse_series_implementation( self, "is_between", @@ -563,6 +531,16 @@ def rolling_std( ddof=ddof, ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> 
Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) @@ -590,9 +568,7 @@ def __init__(self: Self, expr: ArrowExpr) -> None: def get_categories(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "cat", - "get_categories", + self._compliant_expr, "cat", "get_categories" ) @@ -702,12 +678,7 @@ def len_chars(self: Self) -> ArrowExpr: ) def replace( - self: Self, - pattern: str, - value: str, - *, - literal: bool, - n: int, + self: Self, pattern: str, value: str, *, literal: bool, n: int ) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, @@ -719,13 +690,7 @@ def replace( n=n, ) - def replace_all( - self: Self, - pattern: str, - value: str, - *, - literal: bool, - ) -> ArrowExpr: + def replace_all(self: Self, pattern: str, value: str, *, literal: bool) -> ArrowExpr: return reuse_series_namespace_implementation( self._compliant_expr, "str", @@ -737,26 +702,17 @@ def replace_all( def strip_chars(self: Self, characters: str | None) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "strip_chars", - characters=characters, + self._compliant_expr, "str", "strip_chars", characters=characters ) def starts_with(self: Self, prefix: str) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "starts_with", - prefix=prefix, + self._compliant_expr, "str", "starts_with", prefix=prefix ) def ends_with(self: Self, suffix: str) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "ends_with", - suffix=suffix, + self._compliant_expr, "str", "ends_with", suffix=suffix ) def contains(self, pattern: str, *, literal: bool) -> ArrowExpr: @@ -771,24 +727,17 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowExpr: def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002 return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_datetime", - format=format, + self._compliant_expr, "str", "to_datetime", format=format ) def to_uppercase(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_uppercase", + self._compliant_expr, "str", "to_uppercase" ) def to_lowercase(self: Self) -> ArrowExpr: return reuse_series_namespace_implementation( - self._compliant_expr, - "str", - "to_lowercase", + self._compliant_expr, "str", "to_lowercase" ) @@ -957,8 +906,4 @@ def __init__(self: Self, expr: ArrowExpr) -> None: self._expr = expr def len(self: Self) -> ArrowExpr: - return reuse_series_namespace_implementation( - self._expr, - "list", - "len", - ) + return reuse_series_namespace_implementation(self._expr, "list", "len") diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index ffb16578f..11ed914fe 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -171,7 +171,7 @@ def agg_arrow( function_name = remove_prefix(expr._function_name, "col->") if function_name in {"std", "var"}: - option = pc.VarianceOptions(ddof=expr._kwargs.get("ddof", 1)) + option = pc.VarianceOptions(ddof=expr._kwargs["ddof"]) elif function_name in {"len", "n_unique"}: option = pc.CountOptions(mode="all") elif function_name == "count": diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 99f043ebd..b02ad32ee 100644 --- 
a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -359,12 +359,7 @@ def when( *predicates: IntoArrowExpr, ) -> ArrowWhen: plx = self.__class__(backend_version=self._backend_version, version=self._version) - if predicates: - condition = plx.all_horizontal(*predicates) - else: - msg = "at least one predicate needs to be provided" - raise TypeError(msg) - + condition = plx.all_horizontal(*predicates) return ArrowWhen(condition, self._backend_version, version=self._version) def concat_str( diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index cf7760d49..193fc25a2 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -18,6 +18,7 @@ from narwhals.utils import Implementation from narwhals.utils import generate_temporary_column_name from narwhals.utils import import_dtypes_module +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -54,6 +55,7 @@ def __init__( self._implementation = Implementation.PYARROW self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def _change_version(self: Self, version: Version) -> Self: return self.__class__( @@ -488,7 +490,10 @@ def all(self: Self, *, _return_py_scalar: bool = True) -> bool: ) def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], ) -> Self: import pyarrow.compute as pc @@ -1095,6 +1100,36 @@ def rolling_std( ** 0.5 ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + if method == "average": + msg = ( + "`rank` with `method='average'` is not supported for pyarrow backend. " + "The available methods are {'min', 'max', 'dense', 'ordinal'}." 
+ ) + raise ValueError(msg) + + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + sort_keys = "descending" if descending else "ascending" + tiebreaker = "first" if method == "ordinal" else method + + native_series = self._native_series + if self._backend_version < (14, 0, 0): # pragma: no cover + native_series = native_series.combine_chunks() + + null_mask = pc.is_null(native_series) + + rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker) + + result = pc.if_else(null_mask, pa.scalar(None), rank) + return self._from_native_series(result) + def __iter__(self: Self) -> Iterator[Any]: yield from ( maybe_extract_py_scalar(x, return_py_scalar=True) diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 4dbc17a91..ca4852655 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -184,7 +184,7 @@ def broadcast_and_extract_native( rhs = rhs[0] if isinstance(rhs, ArrowDataFrame): - return NotImplemented + return NotImplemented # type: ignore[no-any-return] if isinstance(rhs, ArrowSeries): if len(rhs) == 1: diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 6542253a0..16053d69a 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -11,11 +11,14 @@ from narwhals._dask.utils import parse_exprs_and_named_exprs from narwhals._pandas_like.utils import native_to_narwhals_dtype from narwhals._pandas_like.utils import select_columns_by_name +from narwhals.exceptions import ColumnNotFoundError +from narwhals.typing import CompliantLazyFrame from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_temporary_column_name from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -29,7 +32,6 @@ from narwhals._dask.typing import IntoDaskExpr from narwhals.dtypes import DType from narwhals.utils import Version -from narwhals.typing import CompliantLazyFrame class DaskLazyFrame(CompliantLazyFrame): @@ -44,6 +46,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.DASK self._version = version + validate_backend_version(self._implementation, self._backend_version) def __native_namespace__(self: Self) -> ModuleType: if self._implementation is Implementation.DASK: @@ -195,6 +198,9 @@ def unique( *, keep: Literal["any", "none"] = "any", ) -> Self: + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) native_frame = self._native_frame if keep == "none": subset = subset or self.columns diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 7f50dbf74..373c29020 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -1,6 +1,5 @@ from __future__ import annotations -from copy import copy from typing import TYPE_CHECKING from typing import Any from typing import Callable @@ -12,6 +11,7 @@ from narwhals._dask.utils import binary_operation_returns_scalar from narwhals._dask.utils import maybe_evaluate from narwhals._dask.utils import narwhals_to_native_dtype +from narwhals._expression_parsing import infer_new_root_output_names from narwhals._pandas_like.utils import calculate_timestamp_date from narwhals._pandas_like.utils import calculate_timestamp_datetime from narwhals._pandas_like.utils import 
native_to_narwhals_dtype @@ -23,7 +23,11 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: - import dask_expr + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx + from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame @@ -32,12 +36,12 @@ from narwhals.utils import Version -class DaskExpr(CompliantExpr["dask_expr.Series"]): +class DaskExpr(CompliantExpr["dx.Series"]): _implementation: Implementation = Implementation.DASK def __init__( self, - call: Callable[[DaskLazyFrame], Sequence[dask_expr.Series]], + call: Callable[[DaskLazyFrame], Sequence[dx.Series]], *, depth: int, function_name: str, @@ -60,7 +64,7 @@ def __init__( self._version = version self._kwargs = kwargs - def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: + def __call__(self, df: DaskLazyFrame) -> Sequence[dx.Series]: return self._call(df) def __narwhals_expr__(self) -> None: ... @@ -78,7 +82,7 @@ def from_column_names( backend_version: tuple[int, ...], version: Version, ) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: try: return [df._native_frame[column_name] for column_name in column_names] except KeyError as e: @@ -107,7 +111,7 @@ def from_column_indices( backend_version: tuple[int, ...], version: Version, ) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: return [ df._native_frame.iloc[:, column_index] for column_index in column_indices ] @@ -126,14 +130,14 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def _from_call( self, - # First argument to `call` should be `dask_expr.Series` - call: Callable[..., dask_expr.Series], + # First argument to `call` should be `dx.Series` + call: Callable[..., dx.Series], expr_name: str, *, returns_scalar: bool, **kwargs: Any, ) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: results = [] inputs = self._call(df) _kwargs = {key: maybe_evaluate(df, value) for key, value in kwargs.items()} @@ -148,30 +152,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: results.append(result) return results - # Try tracking root and output names by combining them from all - # expressions appearing in args and kwargs. If any anonymous - # expression appears (e.g. nw.all()), then give up on tracking root names - # and just set it to None. 
- root_names = copy(self._root_names) - output_names = self._output_names - for arg in list(kwargs.values()): - if root_names is not None and isinstance(arg, self.__class__): - if arg._root_names is not None: - root_names.extend(arg._root_names) - else: - root_names = None - output_names = None - break - elif root_names is None: - output_names = None - break - - if not ( - (output_names is None and root_names is None) - or (output_names is not None and root_names is not None) - ): # pragma: no cover - msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" - raise AssertionError(msg) + root_names, output_names = infer_new_root_output_names(self, **kwargs) return self.__class__( func, @@ -186,7 +167,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: ) def alias(self, name: str) -> Self: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: inputs = self._call(df) return [_input.rename(name) for _input in inputs] @@ -210,14 +191,6 @@ def __add__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __radd__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__radd__(other), - "__radd__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __sub__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__sub__(other), @@ -226,14 +199,6 @@ def __sub__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rsub__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rsub__(other), - "__rsub__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __mul__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__mul__(other), @@ -242,14 +207,6 @@ def __mul__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rmul__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rmul__(other), - "__rmul__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __truediv__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__truediv__(other), @@ -258,14 +215,6 @@ def __truediv__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rtruediv__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rtruediv__(other), - "__rtruediv__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __floordiv__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__floordiv__(other), @@ -274,14 +223,6 @@ def __floordiv__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rfloordiv__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rfloordiv__(other), - "__rfloordiv__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __pow__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__pow__(other), @@ -290,14 +231,6 @@ def __pow__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rpow__(self, other: Any) 
-> Self: - return self._from_call( - lambda _input, other: _input.__rpow__(other), - "__rpow__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __mod__(self, other: Any) -> Self: return self._from_call( lambda _input, other: _input.__mod__(other), @@ -306,14 +239,6 @@ def __mod__(self, other: Any) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rmod__(self, other: Any) -> Self: - return self._from_call( - lambda _input, other: _input.__rmod__(other), - "__rmod__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __eq__(self, other: DaskExpr) -> Self: # type: ignore[override] return self._from_call( lambda _input, other: _input.__eq__(other), @@ -370,14 +295,6 @@ def __and__(self, other: DaskExpr) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __rand__(self, other: DaskExpr) -> Self: - return self._from_call( - lambda _input, other: _input.__rand__(other), - "__rand__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __or__(self, other: DaskExpr) -> Self: return self._from_call( lambda _input, other: _input.__or__(other), @@ -386,14 +303,6 @@ def __or__(self, other: DaskExpr) -> Self: returns_scalar=binary_operation_returns_scalar(self, other), ) - def __ror__(self, other: DaskExpr) -> Self: - return self._from_call( - lambda _input, other: _input.__ror__(other), - "__ror__", - other=other, - returns_scalar=binary_operation_returns_scalar(self, other), - ).alias("literal") - def __invert__(self: Self) -> Self: return self._from_call( lambda _input: _input.__invert__(), @@ -402,16 +311,12 @@ def __invert__(self: Self) -> Self: ) def mean(self) -> Self: - return self._from_call( - lambda _input: _input.mean(), - "mean", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.mean(), "mean", returns_scalar=True) def median(self) -> Self: from narwhals.exceptions import InvalidOperationError - def func(s: dask_expr.Series) -> dask_expr.Series: + def func(s: dx.Series) -> dx.Series: dtype = native_to_narwhals_dtype(s, self._version, Implementation.DASK) if not dtype.is_numeric(): msg = "`median` operation not supported for non-numeric input type." 
@@ -421,18 +326,10 @@ def func(s: dask_expr.Series) -> dask_expr.Series: return self._from_call(func, "median", returns_scalar=True) def min(self) -> Self: - return self._from_call( - lambda _input: _input.min(), - "min", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.min(), "min", returns_scalar=True) def max(self) -> Self: - return self._from_call( - lambda _input: _input.max(), - "max", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.max(), "max", returns_scalar=True) def std(self, ddof: int) -> Self: return self._from_call( @@ -451,11 +348,7 @@ def var(self, ddof: int) -> Self: ) def skew(self: Self) -> Self: - return self._from_call( - lambda _input: _input.skew(), - "skew", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.skew(), "skew", returns_scalar=True) def shift(self, n: int) -> Self: return self._from_call( @@ -524,35 +417,26 @@ def is_between( self, lower_bound: Self | Any, upper_bound: Self | Any, - closed: str = "both", + closed: Literal["left", "right", "none", "both"], ) -> Self: - if closed == "none": - closed = "neither" + closed_ = "neither" if closed == "none" else closed return self._from_call( lambda _input, lower_bound, upper_bound, closed: _input.between( - lower_bound, - upper_bound, - closed, + lower_bound, upper_bound, closed ), "is_between", lower_bound=lower_bound, upper_bound=upper_bound, - closed=closed, + closed=closed_, returns_scalar=self._returns_scalar, ) def sum(self) -> Self: - return self._from_call( - lambda _input: _input.sum(), - "sum", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.sum(), "sum", returns_scalar=True) def count(self) -> Self: return self._from_call( - lambda _input: _input.count(), - "count", - returns_scalar=True, + lambda _input: _input.count(), "count", returns_scalar=True ) def round(self, decimals: int) -> Self: @@ -605,9 +489,7 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> NoRetur def abs(self) -> Self: return self._from_call( - lambda _input: _input.abs(), - "abs", - returns_scalar=self._returns_scalar, + lambda _input: _input.abs(), "abs", returns_scalar=self._returns_scalar ) def all(self) -> Self: @@ -633,11 +515,11 @@ def fill_null( limit: int | None = None, ) -> DaskExpr: def func( - _input: dask_expr.Series, + _input: dx.Series, value: Any | None, strategy: str | None, limit: int | None, - ) -> dask_expr.Series: + ) -> dx.Series: if value is not None: res_ser = _input.fillna(value) else: @@ -674,45 +556,31 @@ def clip( def diff(self: Self) -> Self: return self._from_call( - lambda _input: _input.diff(), - "diff", - returns_scalar=self._returns_scalar, + lambda _input: _input.diff(), "diff", returns_scalar=self._returns_scalar ) def n_unique(self: Self) -> Self: return self._from_call( - lambda _input: _input.nunique(dropna=False), - "n_unique", - returns_scalar=True, + lambda _input: _input.nunique(dropna=False), "n_unique", returns_scalar=True ) def is_null(self: Self) -> Self: return self._from_call( - lambda _input: _input.isna(), - "is_null", - returns_scalar=self._returns_scalar, + lambda _input: _input.isna(), "is_null", returns_scalar=self._returns_scalar ) def is_nan(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: dtype = native_to_narwhals_dtype(_input, self._version, self._implementation) if dtype.is_numeric(): return _input != _input # noqa: PLR0124 msg = f"`.is_nan` only supported for numeric dtypes 
and not {dtype}, did you mean `.is_null`?" raise InvalidOperationError(msg) - return self._from_call( - func, - "is_null", - returns_scalar=self._returns_scalar, - ) + return self._from_call(func, "is_null", returns_scalar=self._returns_scalar) def len(self: Self) -> Self: - return self._from_call( - lambda _input: _input.size, - "len", - returns_scalar=True, - ) + return self._from_call(lambda _input: _input.size, "len", returns_scalar=True) def quantile( self: Self, @@ -721,24 +589,21 @@ def quantile( ) -> Self: if interpolation == "linear": - def func(_input: dask_expr.Series, quantile: float) -> dask_expr.Series: + def func(_input: dx.Series, quantile: float) -> dx.Series: if _input.npartitions > 1: msg = "`Expr.quantile` is not supported for Dask backend with multiple partitions." raise NotImplementedError(msg) return _input.quantile(q=quantile, method="dask") # pragma: no cover return self._from_call( - func, - "quantile", - quantile=quantile, - returns_scalar=True, + func, "quantile", quantile=quantile, returns_scalar=True ) else: msg = "`higher`, `lower`, `midpoint`, `nearest` - interpolation methods are not supported by Dask. Please use `linear` instead." raise NotImplementedError(msg) def is_first_distinct(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) _input = add_row_index( @@ -750,17 +615,14 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: first_distinct_index = _input.groupby(_name).agg({col_token: "min"})[ col_token ] - return _input[col_token].isin(first_distinct_index) return self._from_call( - func, - "is_first_distinct", - returns_scalar=self._returns_scalar, + func, "is_first_distinct", returns_scalar=self._returns_scalar ) def is_last_distinct(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) _input = add_row_index( @@ -770,17 +632,14 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: implementation=self._implementation, ) last_distinct_index = _input.groupby(_name).agg({col_token: "max"})[col_token] - return _input[col_token].isin(last_distinct_index) return self._from_call( - func, - "is_last_distinct", - returns_scalar=self._returns_scalar, + func, "is_last_distinct", returns_scalar=self._returns_scalar ) def is_duplicated(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name return ( _input.to_frame() @@ -789,14 +648,10 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: > 1 ) - return self._from_call( - func, - "is_duplicated", - returns_scalar=self._returns_scalar, - ) + return self._from_call(func, "is_duplicated", returns_scalar=self._returns_scalar) def is_unique(self: Self) -> Self: - def func(_input: dask_expr.Series) -> dask_expr.Series: + def func(_input: dx.Series) -> dx.Series: _name = _input.name return ( _input.to_frame() @@ -805,11 +660,7 @@ def func(_input: dask_expr.Series) -> dask_expr.Series: == 1 ) - return self._from_call( - func, - "is_unique", - returns_scalar=self._returns_scalar, - ) + return self._from_call(func, "is_unique", returns_scalar=self._returns_scalar) def is_in(self: Self, other: Any) -> Self: return self._from_call( @@ -883,19 +734,13 @@ def dt(self: Self) -> DaskExprDateTimeNamespace: def 
name(self: Self) -> DaskExprNameNamespace: return DaskExprNameNamespace(self) - def cast( - self: Self, - dtype: DType | type[DType], - ) -> Self: + def cast(self: Self, dtype: DType | type[DType]) -> Self: def func(_input: Any, dtype: DType | type[DType]) -> Any: dtype = narwhals_to_native_dtype(dtype, self._version) return _input.astype(dtype) return self._from_call( - func, - "cast", - dtype=dtype, - returns_scalar=self._returns_scalar, + func, "cast", dtype=dtype, returns_scalar=self._returns_scalar ) def is_finite(self: Self) -> Self: @@ -920,12 +765,7 @@ def len_chars(self) -> DaskExpr: ) def replace( - self, - pattern: str, - value: str, - *, - literal: bool = False, - n: int = 1, + self, pattern: str, value: str, *, literal: bool = False, n: int = 1 ) -> DaskExpr: return self._compliant_expr._from_call( lambda _input, pattern, value, literal, n: _input.str.replace( @@ -939,13 +779,7 @@ def replace( returns_scalar=self._compliant_expr._returns_scalar, ) - def replace_all( - self, - pattern: str, - value: str, - *, - literal: bool = False, - ) -> DaskExpr: + def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> DaskExpr: return self._compliant_expr._from_call( lambda _input, pattern, value, literal: _input.str.replace( pattern, value, n=-1, regex=not literal @@ -1137,7 +971,7 @@ def replace_time_zone(self, time_zone: str | None) -> DaskExpr: ) def convert_time_zone(self, time_zone: str) -> DaskExpr: - def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: + def func(s: dx.Series, time_zone: str) -> dx.Series: dtype = native_to_narwhals_dtype( s, self._compliant_expr._version, Implementation.DASK ) @@ -1154,9 +988,7 @@ def func(s: dask_expr.Series, time_zone: str) -> dask_expr.Series: ) def timestamp(self, time_unit: Literal["ns", "us", "ms"] = "us") -> DaskExpr: - def func( - s: dask_expr.Series, time_unit: Literal["ns", "us", "ms"] = "us" - ) -> dask_expr.Series: + def func(s: dx.Series, time_unit: Literal["ns", "us", "ms"] = "us") -> dx.Series: dtype = native_to_narwhals_dtype( s, self._compliant_expr._version, Implementation.DASK ) diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 7bda88ee5..60086efa2 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -12,7 +12,12 @@ if TYPE_CHECKING: import dask.dataframe as dd - import dask_expr + + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx + import pandas as pd from narwhals._dask.dataframe import DaskLazyFrame @@ -43,7 +48,10 @@ def var( ]: from functools import partial - import dask_expr as dx + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx return partial(dx._groupby.GroupBy.var, ddof=ddof) @@ -55,7 +63,10 @@ def std( ]: from functools import partial - import dask_expr as dx + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx return partial(dx._groupby.GroupBy.std, ddof=ddof) @@ -127,7 +138,7 @@ def _from_native_frame(self, df: DaskLazyFrame) -> DaskLazyFrame: def agg_dask( df: DaskLazyFrame, grouped: Any, - exprs: Sequence[CompliantExpr[dask_expr.Series]], + exprs: Sequence[CompliantExpr[dx.Series]], keys: list[str], from_dataframe: Callable[[Any], DaskLazyFrame], ) -> DaskLazyFrame: @@ -178,9 +189,7 @@ def agg_dask( function_name = remove_prefix(expr._function_name, "col->") kwargs = ( - {"ddof": expr._kwargs.get("ddof", 1)} - if function_name in {"std", "var"} - else {} + {"ddof": 
expr._kwargs["ddof"]} if function_name in {"std", "var"} else {} ) agg_function = POLARS_TO_DASK_AGGREGATIONS.get(function_name, function_name) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 126afaae6..d8b2b7a9a 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -21,14 +21,17 @@ from narwhals.typing import CompliantNamespace if TYPE_CHECKING: - import dask_expr + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx from narwhals._dask.typing import IntoDaskExpr from narwhals.dtypes import DType from narwhals.utils import Version -class DaskNamespace(CompliantNamespace["dask_expr.Series"]): +class DaskNamespace(CompliantNamespace["dx.Series"]): @property def selectors(self) -> DaskSelectorNamespace: return DaskSelectorNamespace( @@ -40,7 +43,7 @@ def __init__(self, *, backend_version: tuple[int, ...], version: Version) -> Non self._version = version def all(self) -> DaskExpr: - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: return [df._native_frame[column_name] for column_name in df.columns] return DaskExpr( @@ -66,26 +69,30 @@ def nth(self, *column_indices: int) -> DaskExpr: ) def lit(self, value: Any, dtype: DType | None) -> DaskExpr: - def convert_if_dtype( - series: dask_expr.Series, dtype: DType | type[DType] - ) -> dask_expr.Series: - return ( - series.astype(narwhals_to_native_dtype(dtype, self._version)) - if dtype - else series - ) + import dask.dataframe as dd + import pandas as pd - return DaskExpr( - lambda df: [ - df._native_frame.assign(literal=value)["literal"].pipe( - convert_if_dtype, dtype + def func(df: DaskLazyFrame) -> list[dx.Series]: + return [ + dd.from_pandas( + pd.Series( + [value], + dtype=narwhals_to_native_dtype(dtype, self._version) + if dtype is not None + else None, + name="literal", + ), + npartitions=df._native_frame.npartitions, ) - ], + ] + + return DaskExpr( + func, depth=0, function_name="lit", root_names=None, output_names=["literal"], - returns_scalar=False, + returns_scalar=True, backend_version=self._backend_version, version=self._version, kwargs={}, @@ -95,7 +102,7 @@ def len(self) -> DaskExpr: import dask.dataframe as dd import pandas as pd - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: if not df.columns: return [ dd.from_pandas( @@ -121,7 +128,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def all_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [reduce(lambda x, y: x & y, series).rename(series[0].name)] @@ -140,7 +147,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def any_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [reduce(lambda x, y: x | y, series).rename(series[0].name)] @@ -159,7 +166,7 @@ def func(df: DaskLazyFrame) -> list[dask_expr.Series]: def sum_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: 
DaskLazyFrame) -> list[dx.Series]: series = [s.fillna(0) for _expr in parsed_exprs for s in _expr(df)] return [reduce(lambda x, y: x + y, series).rename(series[0].name)] @@ -235,7 +242,7 @@ def concat( def mean_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = (s.fillna(0) for _expr in parsed_exprs for s in _expr(df)) non_na = (1 - s.isna() for _expr in parsed_exprs for s in _expr(df)) return [ @@ -262,7 +269,7 @@ def min_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [dd.concat(series, axis=1).min(axis=1).rename(series[0].name)] @@ -284,7 +291,7 @@ def max_horizontal(self, *exprs: IntoDaskExpr) -> DaskExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = [s for _expr in parsed_exprs for s in _expr(df)] return [dd.concat(series, axis=1).max(axis=1).rename(series[0].name)] @@ -306,12 +313,7 @@ def when( *predicates: IntoDaskExpr, ) -> DaskWhen: plx = self.__class__(backend_version=self._backend_version, version=self._version) - if predicates: - condition = plx.all_horizontal(*predicates) - else: - msg = "at least one predicate needs to be provided" - raise TypeError(msg) - + condition = plx.all_horizontal(*predicates) return DaskWhen( condition, self._backend_version, returns_scalar=False, version=self._version ) @@ -328,7 +330,7 @@ def concat_str( *parse_into_exprs(*more_exprs, namespace=self), ] - def func(df: DaskLazyFrame) -> list[dask_expr.Series]: + def func(df: DaskLazyFrame) -> list[dx.Series]: series = (s.astype(str) for _expr in parsed_exprs for s in _expr(df)) null_mask = [s for _expr in parsed_exprs for s in _expr.is_null()(df)] @@ -390,12 +392,12 @@ def __init__( self._returns_scalar = returns_scalar self._version = version - def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: + def __call__(self, df: DaskLazyFrame) -> Sequence[dx.Series]: from narwhals._expression_parsing import parse_into_expr plx = df.__narwhals_namespace__() condition = parse_into_expr(self._condition, namespace=plx)(df)[0] - condition = cast("dask_expr.Series", condition) + condition = cast("dx.Series", condition) try: value_series = parse_into_expr(self._then_value, namespace=plx)(df)[0] except TypeError: @@ -403,7 +405,7 @@ def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: _df = condition.to_frame("a") _df["tmp"] = self._then_value value_series = _df["tmp"] - value_series = cast("dask_expr.Series", value_series) + value_series = cast("dx.Series", value_series) validate_comparand(condition, value_series) if self._otherwise_value is None: @@ -414,6 +416,9 @@ def __call__(self, df: DaskLazyFrame) -> Sequence[dask_expr.Series]: # `self._otherwise_value` is a scalar and can't be converted to an expression return [value_series.where(condition, self._otherwise_value)] otherwise_series = otherwise_expr(df)[0] + + if otherwise_expr._returns_scalar: # type: ignore[attr-defined] + return [value_series.where(condition, otherwise_series[0])] validate_comparand(condition, otherwise_series) return [value_series.where(condition, otherwise_series)] diff --git 
a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 2891d84ff..703e24860 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -8,7 +8,10 @@ from narwhals.utils import import_dtypes_module if TYPE_CHECKING: - import dask_expr + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame @@ -135,7 +138,7 @@ def call(df: DaskLazyFrame) -> list[Any]: def __or__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: if isinstance(other, DaskSelector): - def call(df: DaskLazyFrame) -> list[dask_expr.Series]: + def call(df: DaskLazyFrame) -> list[dx.Series]: lhs = self._call(df) rhs = other._call(df) return [*(x for x in lhs if x.name not in {x.name for x in rhs}), *rhs] diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index cb1232496..cd303d8ec 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -14,7 +14,11 @@ if TYPE_CHECKING: import dask.dataframe as dd - import dask_expr + + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr @@ -31,7 +35,8 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any: msg = "Multi-output expressions (e.g. `nw.all()` or `nw.col('a', 'b')`) not supported in this context" raise NotImplementedError(msg) result = results[0] - validate_comparand(df._native_frame, result) + if not obj._returns_scalar: + validate_comparand(df._native_frame, result) if obj._returns_scalar: # Return scalar, let Dask do its broadcasting return result[0] @@ -41,7 +46,7 @@ def maybe_evaluate(df: DaskLazyFrame, obj: Any) -> Any: def parse_exprs_and_named_exprs( df: DaskLazyFrame, *exprs: Any, **named_exprs: Any -) -> dict[str, dask_expr.Series]: +) -> dict[str, dx.Series]: results = {} for expr in exprs: if hasattr(expr, "__narwhals_expr__"): @@ -81,10 +86,13 @@ def add_row_index( ) -def validate_comparand(lhs: dask_expr.Series, rhs: dask_expr.Series) -> None: - import dask_expr +def validate_comparand(lhs: dx.Series, rhs: dx.Series) -> None: + try: + import dask.dataframe.dask_expr as dx + except ModuleNotFoundError: + import dask_expr as dx - if not dask_expr._expr.are_co_aligned(lhs._expr, rhs._expr): # pragma: no cover + if not dx._expr.are_co_aligned(lhs._expr, rhs._expr): # pragma: no cover # are_co_aligned is a method which cheaply checks if two Dask expressions # have the same index, and therefore don't require index alignment. 
# If someone only operates on a Dask DataFrame via expressions, then this @@ -135,6 +143,8 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> An return "category" if isinstance_or_issubclass(dtype, dtypes.Datetime): return "datetime64[us]" + if isinstance_or_issubclass(dtype, dtypes.Date): + return "date32[day][pyarrow]" if isinstance_or_issubclass(dtype, dtypes.Duration): return "timedelta64[ns]" if isinstance_or_issubclass(dtype, dtypes.List): # pragma: no cover @@ -151,11 +161,11 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> An raise AssertionError(msg) -def name_preserving_sum(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: +def name_preserving_sum(s1: dx.Series, s2: dx.Series) -> dx.Series: return (s1 + s2).rename(s1.name) -def name_preserving_div(s1: dask_expr.Series, s2: dask_expr.Series) -> dask_expr.Series: +def name_preserving_div(s1: dx.Series, s2: dx.Series) -> dx.Series: return (s1 / s2).rename(s1.name) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 339fca137..33cfc19d2 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -1,97 +1,74 @@ from __future__ import annotations -import re -from functools import lru_cache +from itertools import chain from typing import TYPE_CHECKING from typing import Any +from typing import Iterable +from typing import Literal +from typing import Sequence +from narwhals._duckdb.utils import native_to_narwhals_dtype +from narwhals._duckdb.utils import parse_exprs_and_named_exprs from narwhals.dependencies import get_duckdb -from narwhals.utils import import_dtypes_module +from narwhals.exceptions import ColumnNotFoundError +from narwhals.utils import Implementation +from narwhals.utils import Version +from narwhals.utils import flatten +from narwhals.utils import generate_temporary_column_name +from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType + import duckdb import pandas as pd import pyarrow as pa from typing_extensions import Self + from narwhals._duckdb.expr import DuckDBExpr + from narwhals._duckdb.group_by import DuckDBGroupBy + from narwhals._duckdb.namespace import DuckDBNamespace from narwhals._duckdb.series import DuckDBInterchangeSeries from narwhals.dtypes import DType - from narwhals.utils import Version - - -@lru_cache(maxsize=16) -def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType: - dtypes = import_dtypes_module(version) - if duckdb_dtype == "HUGEINT": - return dtypes.Int128() - if duckdb_dtype == "BIGINT": - return dtypes.Int64() - if duckdb_dtype == "INTEGER": - return dtypes.Int32() - if duckdb_dtype == "SMALLINT": - return dtypes.Int16() - if duckdb_dtype == "TINYINT": - return dtypes.Int8() - if duckdb_dtype == "UHUGEINT": - return dtypes.UInt128() - if duckdb_dtype == "UBIGINT": - return dtypes.UInt64() - if duckdb_dtype == "UINTEGER": - return dtypes.UInt32() - if duckdb_dtype == "USMALLINT": - return dtypes.UInt16() - if duckdb_dtype == "UTINYINT": - return dtypes.UInt8() - if duckdb_dtype == "DOUBLE": - return dtypes.Float64() - if duckdb_dtype == "FLOAT": - return dtypes.Float32() - if duckdb_dtype == "VARCHAR": - return dtypes.String() - if duckdb_dtype == "DATE": - return dtypes.Date() - if duckdb_dtype == "TIMESTAMP": - return dtypes.Datetime() - if duckdb_dtype == "BOOLEAN": - return dtypes.Boolean() - if duckdb_dtype 
== "INTERVAL": - return dtypes.Duration() - if duckdb_dtype.startswith("STRUCT"): - matchstruc_ = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype) - return dtypes.Struct( - [ - dtypes.Field( - matchstruc_[i][0], - native_to_narwhals_dtype(matchstruc_[i][1], version), - ) - for i in range(len(matchstruc_)) - ] - ) - if match_ := re.match(r"(.*)\[\]$", duckdb_dtype): - return dtypes.List(native_to_narwhals_dtype(match_.group(1), version)) - if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype): - return dtypes.Array( - native_to_narwhals_dtype(match_.group(1), version), - int(match_.group(2)), - ) - if duckdb_dtype.startswith("DECIMAL("): - return dtypes.Decimal() - return dtypes.Unknown() # pragma: no cover -class DuckDBInterchangeFrame: - def __init__(self, df: Any, version: Version) -> None: - self._native_frame = df +class DuckDBLazyFrame: + _implementation = Implementation.DUCKDB + + def __init__( + self, + df: duckdb.DuckDBPyRelation, + *, + backend_version: tuple[int, ...], + version: Version, + ) -> None: + self._native_frame: duckdb.DuckDBPyRelation = df self._version = version + self._backend_version = backend_version + validate_backend_version(self._implementation, self._backend_version) - def __narwhals_dataframe__(self) -> Any: + def __narwhals_dataframe__(self) -> Any: # pragma: no cover + # Keep around for backcompat. + if self._version is not Version.V1: + msg = "__narwhals_dataframe__ is not implemented for DuckDBLazyFrame" + raise AttributeError(msg) + return self + + def __narwhals_lazyframe__(self) -> Any: return self def __native_namespace__(self: Self) -> ModuleType: return get_duckdb() # type: ignore[no-any-return] + def __narwhals_namespace__(self) -> DuckDBNamespace: + from narwhals._duckdb.namespace import DuckDBNamespace + + return DuckDBNamespace( + backend_version=self._backend_version, version=self._version + ) + def __getitem__(self, item: str) -> DuckDBInterchangeSeries: from narwhals._duckdb.series import DuckDBInterchangeSeries @@ -99,42 +76,101 @@ def __getitem__(self, item: str) -> DuckDBInterchangeSeries: self._native_frame.select(item), version=self._version ) + def collect(self) -> Any: + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError as exc: # pragma: no cover + msg = "PyArrow>=11.0.0 is required to collect `LazyFrame` backed by DuckDcollect `LazyFrame` backed by DuckDB" + raise ModuleNotFoundError(msg) from exc + + from narwhals._arrow.dataframe import ArrowDataFrame + + return ArrowDataFrame( + native_dataframe=self._native_frame.arrow(), + backend_version=parse_version(pa.__version__), + version=self._version, + ) + + def head(self, n: int) -> Self: + return self._from_native_frame(self._native_frame.limit(n)) + def select( self: Self, *exprs: Any, **named_exprs: Any, ) -> Self: - if named_exprs or not all(isinstance(x, str) for x in exprs): # pragma: no cover - msg = ( - "`select`-ing not by name is not supported for DuckDB backend.\n\n" - "If you would like to see this kind of object better supported in " - "Narwhals, please open a feature request " - "at https://github.com/narwhals-dev/narwhals/issues." + new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs) + if not new_columns_map: + # TODO(marco): return empty relation with 0 columns? 
+ return self._from_native_frame(self._native_frame.limit(0)) + + if all(getattr(x, "_returns_scalar", False) for x in exprs) and all( + getattr(x, "_returns_scalar", False) for x in named_exprs.values() + ): + return self._from_native_frame( + self._native_frame.aggregate( + [val.alias(col) for col, val in new_columns_map.items()] + ) ) - raise NotImplementedError(msg) - return self._from_native_frame(self._native_frame.select(*exprs)) + return self._from_native_frame( + self._native_frame.select( + *(val.alias(col) for col, val in new_columns_map.items()) + ) + ) - def __getattr__(self, attr: str) -> Any: - if attr == "schema": - return { - column_name: native_to_narwhals_dtype(str(duckdb_dtype), self._version) - for column_name, duckdb_dtype in zip( - self._native_frame.columns, self._native_frame.types - ) - } - elif attr == "columns": - return self._native_frame.columns - - msg = ( # pragma: no cover - f"Attribute {attr} is not supported for metadata-only dataframes.\n\n" - "If you would like to see this kind of object better supported in " - "Narwhals, please open a feature request " - "at https://github.com/narwhals-dev/narwhals/issues." + def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 + columns_to_drop = parse_columns_to_drop( + compliant_frame=self, columns=columns, strict=strict ) - raise NotImplementedError(msg) # pragma: no cover + selection = (col for col in self.columns if col not in columns_to_drop) + return self._from_native_frame(self._native_frame.select(*selection)) + + def lazy(self) -> Self: + return self + + def with_columns( + self: Self, + *exprs: Any, + **named_exprs: Any, + ) -> Self: + from duckdb import ColumnExpression + + new_columns_map = parse_exprs_and_named_exprs(self, *exprs, **named_exprs) + result = [] + for col in self._native_frame.columns: + if col in new_columns_map: + result.append(new_columns_map.pop(col).alias(col)) + else: + result.append(ColumnExpression(col)) + for col, value in new_columns_map.items(): + result.append(value.alias(col)) + return self._from_native_frame(self._native_frame.select(*result)) + + def filter(self, *predicates: DuckDBExpr, **constraints: Any) -> Self: + plx = self.__narwhals_namespace__() + expr = plx.all_horizontal( + *chain(predicates, (plx.col(name) == v for name, v in constraints.items())) + ) + # `[0]` is safe as all_horizontal's expression only returns a single column + mask = expr._call(self)[0] + return self._from_native_frame(self._native_frame.filter(mask)) + + @property + def schema(self) -> dict[str, DType]: + return { + column_name: native_to_narwhals_dtype(str(duckdb_dtype), self._version) + for column_name, duckdb_dtype in zip( + self._native_frame.columns, self._native_frame.types + ) + } + + @property + def columns(self) -> list[str]: + return self._native_frame.columns # type: ignore[no-any-return] def to_pandas(self: Self) -> pd.DataFrame: + # only if version is v1, keep around for backcompat import pandas as pd # ignore-banned-import() if parse_version(pd.__version__) >= parse_version("1.0.0"): @@ -144,13 +180,90 @@ def to_pandas(self: Self) -> pd.DataFrame: raise NotImplementedError(msg) def to_arrow(self: Self) -> pa.Table: + # only if version is v1, keep around for backcompat return self._native_frame.arrow() def _change_version(self: Self, version: Version) -> Self: - return self.__class__(self._native_frame, version=version) + return self.__class__( + self._native_frame, version=version, backend_version=self._backend_version + ) def _from_native_frame(self: 
Self, df: Any) -> Self: - return self.__class__(df, version=self._version) + return self.__class__( + df, backend_version=self._backend_version, version=self._version + ) + + def group_by(self: Self, *keys: str, drop_null_keys: bool) -> DuckDBGroupBy: + from narwhals._duckdb.group_by import DuckDBGroupBy + + if drop_null_keys: + msg = "todo" + raise NotImplementedError(msg) + + return DuckDBGroupBy( + compliant_frame=self, keys=list(keys), drop_null_keys=drop_null_keys + ) + + def rename(self: Self, mapping: dict[str, str]) -> Self: + df = self._native_frame + selection = [ + f"{col} as {mapping[col]}" if col in mapping else col for col in df.columns + ] + return self._from_native_frame(df.select(", ".join(selection))) + + def join( + self: Self, + other: Self, + *, + how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", + left_on: str | list[str] | None, + right_on: str | list[str] | None, + suffix: str, + ) -> Self: + if isinstance(left_on, str): + left_on = [left_on] + if isinstance(right_on, str): + right_on = [right_on] + original_alias = self._native_frame.alias + + if how not in ("inner", "left", "semi", "cross"): + msg = "Only 'inner', 'left', 'semi' and 'cross' joins are implemented for DuckDB" + raise NotImplementedError(msg) + + if how == "cross": + if self._backend_version < (1, 1, 4): + msg = f"DuckDB>=1.1.4 is required for cross-join, found version: {self._backend_version}" + raise NotImplementedError(msg) + rel = self._native_frame.set_alias("lhs").cross( # pragma: no cover + other._native_frame.set_alias("rhs") + ) + else: + # help mypy + assert left_on is not None # noqa: S101 + assert right_on is not None # noqa: S101 + + conditions = [ + f'lhs."{left}" = rhs."{right}"' for left, right in zip(left_on, right_on) + ] + condition = " and ".join(conditions) + rel = self._native_frame.set_alias("lhs").join( + other._native_frame.set_alias("rhs"), condition=condition, how=how + ) + + if how in ("inner", "left", "cross"): + select = [f'lhs."{x}"' for x in self._native_frame.columns] + for col in other._native_frame.columns: + if col in self._native_frame.columns and ( + right_on is None or col not in right_on + ): + select.append(f'rhs."{col}" as "{col}{suffix}"') + elif right_on is None or col not in right_on: + select.append(col) + else: # semi + select = ["lhs.*"] + + res = rel.select(", ".join(select)).set_alias(original_alias) + return self._from_native_frame(res) def collect_schema(self) -> dict[str, DType]: return { @@ -159,3 +272,56 @@ def collect_schema(self) -> dict[str, DType]: self._native_frame.columns, self._native_frame.types ) } + + def unique(self, subset: Sequence[str] | None, keep: str) -> Self: + if subset is not None: + import duckdb + + rel = self._native_frame + # Sanitise input + if any(x not in rel.columns for x in subset): + msg = f"Columns {set(subset).difference(rel.columns)} not found in {rel.columns}."
+ raise ColumnNotFoundError(msg) + idx_name = f'"{generate_temporary_column_name(8, rel.columns)}"' + count_name = ( + f'"{generate_temporary_column_name(8, [*rel.columns, idx_name])}"' + ) + if keep == "none": + keep_condition = f"where {count_name}=1" + else: + keep_condition = f"where {idx_name}=1" + query = f""" + with cte as ( + select *, + row_number() over (partition by {",".join(subset)}) as {idx_name}, + count(*) over (partition by {",".join(subset)}) as {count_name} + from rel + ) + select * exclude ({idx_name}, {count_name}) from cte {keep_condition} + """ # noqa: S608 + return self._from_native_frame(duckdb.sql(query)) + return self._from_native_frame(self._native_frame.unique(", ".join(self.columns))) + + def sort( + self: Self, + by: str | Iterable[str], + *more_by: str, + descending: bool | Sequence[bool] = False, + nulls_last: bool = False, + ) -> Self: + flat_by = flatten([*flatten([by]), *more_by]) + if isinstance(descending, bool): + descending = [descending] * len(flat_by) + descending_str = ["desc" if x else "" for x in descending] + + result = self._native_frame.order( + ",".join( + ( + f'"{col}" {desc} nulls last' + if nulls_last + else f'"{col}" {desc} nulls first' + for col, desc in zip(flat_by, descending_str) + ) + ) + ) + return self._from_native_frame(result) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py new file mode 100644 index 000000000..e5e612085 --- /dev/null +++ b/narwhals/_duckdb/expr.py @@ -0,0 +1,766 @@ +from __future__ import annotations + +import functools +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Literal +from typing import NoReturn +from typing import Sequence + +from narwhals._duckdb.utils import binary_operation_returns_scalar +from narwhals._duckdb.utils import get_column_name +from narwhals._duckdb.utils import maybe_evaluate +from narwhals._duckdb.utils import narwhals_to_native_dtype +from narwhals._expression_parsing import infer_new_root_output_names +from narwhals.typing import CompliantExpr +from narwhals.utils import Implementation + +if TYPE_CHECKING: + import duckdb + from typing_extensions import Self + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.namespace import DuckDBNamespace + from narwhals.dtypes import DType + from narwhals.utils import Version + + +class DuckDBExpr(CompliantExpr["duckdb.Expression"]): + _implementation = Implementation.DUCKDB + + def __init__( + self, + call: Callable[[DuckDBLazyFrame], Sequence[duckdb.Expression]], + *, + depth: int, + function_name: str, + root_names: list[str] | None, + output_names: list[str] | None, + # Whether the expression is a length-1 Column resulting from + # a reduction, such as `nw.col('a').sum()` + returns_scalar: bool, + backend_version: tuple[int, ...], + version: Version, + kwargs: dict[str, Any], + ) -> None: + self._call = call + self._depth = depth + self._function_name = function_name + self._root_names = root_names + self._output_names = output_names + self._returns_scalar = returns_scalar + self._backend_version = backend_version + self._version = version + self._kwargs = kwargs + + def __call__(self, df: DuckDBLazyFrame) -> Sequence[duckdb.Expression]: + return self._call(df) + + def __narwhals_expr__(self) -> None: ... 
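To make the `_call` contract above concrete: a `DuckDBExpr` is essentially a function from a `DuckDBLazyFrame` to a list of native `duckdb.Expression` objects, which the frame then hands to `relation.select(...)` (or `relation.aggregate(...)` when the expression returns a scalar). A minimal sketch of that round-trip, using only the public duckdb expression API already imported throughout this diff (the example relation itself is illustrative):

```python
import duckdb
from duckdb import ColumnExpression, ConstantExpression, FunctionExpression

rel = duckdb.sql("SELECT * FROM range(3) t(a)")  # stand-in for `_native_frame`

# Roughly what an expression like abs(col("a") - 1) would emit from `_call`:
native_expr = FunctionExpression("abs", ColumnExpression("a") - ConstantExpression(1))

# `DuckDBLazyFrame.select` then aliases and selects the result:
print(rel.select(native_expr.alias("a")).fetchall())  # [(1,), (0,), (1,)]
```

When `returns_scalar` is set, the frame routes the same expression through `aggregate` instead of `select`, which is why `_from_call` keeps track of that flag.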
+ + def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover + # Unused, just for compatibility with PandasLikeExpr + from narwhals._duckdb.namespace import DuckDBNamespace + + return DuckDBNamespace( + backend_version=self._backend_version, version=self._version + ) + + @classmethod + def from_column_names( + cls: type[Self], + *column_names: str, + backend_version: tuple[int, ...], + version: Version, + ) -> Self: + def func(_: DuckDBLazyFrame) -> list[duckdb.Expression]: + from duckdb import ColumnExpression + + return [ColumnExpression(col_name) for col_name in column_names] + + return cls( + func, + depth=0, + function_name="col", + root_names=list(column_names), + output_names=list(column_names), + returns_scalar=False, + backend_version=backend_version, + version=version, + kwargs={}, + ) + + def _from_call( + self, + call: Callable[..., duckdb.Expression], + expr_name: str, + *, + returns_scalar: bool, + **kwargs: Any, + ) -> Self: + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + results = [] + inputs = self._call(df) + _kwargs = {key: maybe_evaluate(df, value) for key, value in kwargs.items()} + for _input in inputs: + input_col_name = get_column_name( + df, _input, returns_scalar=self._returns_scalar + ) + if self._returns_scalar: + # TODO(marco): once WindowExpression is supported, then + # we may need to call it with `over(1)` here, + # depending on the context? + pass + + column_result = call(_input, **_kwargs) + column_result = column_result.alias(input_col_name) + if returns_scalar: + # TODO(marco): once WindowExpression is supported, then + # we may need to call it with `over(1)` here, + # depending on the context? + pass + results.append(column_result) + return results + + root_names, output_names = infer_new_root_output_names(self, **kwargs) + + return self.__class__( + func, + depth=self._depth + 1, + function_name=f"{self._function_name}->{expr_name}", + root_names=root_names, + output_names=output_names, + returns_scalar=returns_scalar, + backend_version=self._backend_version, + version=self._version, + kwargs=kwargs, + ) + + def __and__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input & other, + "__and__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __or__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input | other, + "__or__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __add__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input + other, + "__add__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __truediv__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input / other, + "__truediv__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __floordiv__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__floordiv__(other), + "__floordiv__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __mod__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__mod__(other), + "__mod__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __sub__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input - other, + "__sub__", + other=other, + 
returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __mul__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input * other, + "__mul__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __pow__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input**other, + "__pow__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __lt__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input < other, + "__lt__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __gt__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input > other, + "__gt__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __le__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input <= other, + "__le__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __ge__(self, other: DuckDBExpr) -> Self: + return self._from_call( + lambda _input, other: _input >= other, + "__ge__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __eq__(self, other: DuckDBExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input == other, + "__eq__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __ne__(self, other: DuckDBExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input != other, + "__ne__", + other=other, + returns_scalar=binary_operation_returns_scalar(self, other), + ) + + def __invert__(self) -> Self: + return self._from_call( + lambda _input: ~_input, + "__invert__", + returns_scalar=self._returns_scalar, + ) + + def alias(self, name: str) -> Self: + def _alias(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + return [col.alias(name) for col in self._call(df)] + + # Define this one manually, so that we can + # override `output_names` and not increase depth + return self.__class__( + _alias, + depth=self._depth, + function_name=self._function_name, + root_names=self._root_names, + output_names=[name], + returns_scalar=self._returns_scalar, + backend_version=self._backend_version, + version=self._version, + kwargs={**self._kwargs, "name": name}, + ) + + def abs(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("abs", _input), + "abs", + returns_scalar=self._returns_scalar, + ) + + def mean(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("mean", _input), + "mean", + returns_scalar=True, + ) + + def skew(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("skewness", _input), + "skew", + returns_scalar=True, + ) + + def median(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("median", _input), + "median", + returns_scalar=True, + ) + + def all(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("bool_and", _input), + "all", + returns_scalar=True, + ) + + def any(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda 
_input: FunctionExpression("bool_or", _input), + "any", + returns_scalar=True, + ) + + def quantile( + self, + quantile: float, + interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"], + ) -> Self: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + if interpolation == "linear": + return FunctionExpression( + "quantile_cont", _input, ConstantExpression(quantile) + ) + msg = "Only linear interpolation methods are supported for DuckDB quantile." + raise NotImplementedError(msg) + + return self._from_call( + func, + "quantile", + returns_scalar=True, + ) + + def clip(self, lower_bound: Any, upper_bound: Any) -> Self: + from duckdb import FunctionExpression + + def func( + _input: duckdb.Expression, lower_bound: Any, upper_bound: Any + ) -> duckdb.Expression: + return FunctionExpression( + "greatest", FunctionExpression("least", _input, upper_bound), lower_bound + ) + + return self._from_call( + func, + "clip", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def is_between( + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], + ) -> Self: + def func( + _input: duckdb.Expression, lower_bound: Any, upper_bound: Any + ) -> duckdb.Expression: + if closed == "left": + return (_input >= lower_bound) & (_input < upper_bound) + elif closed == "right": + return (_input > lower_bound) & (_input <= upper_bound) + elif closed == "none": + return (_input > lower_bound) & (_input < upper_bound) + return (_input >= lower_bound) & (_input <= upper_bound) + + return self._from_call( + func, + "is_between", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def sum(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("sum", _input), "sum", returns_scalar=True + ) + + def n_unique(self) -> Self: + from duckdb import CaseExpression + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + # https://stackoverflow.com/a/79338887/4451315 + return FunctionExpression( + "array_unique", FunctionExpression("array_agg", _input) + ) + FunctionExpression( + "max", + CaseExpression( + condition=_input.isnotnull(), value=ConstantExpression(0) + ).otherwise(ConstantExpression(1)), + ) + + return self._from_call( + func, + "n_unique", + returns_scalar=True, + ) + + def count(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("count", _input), + "count", + returns_scalar=True, + ) + + def len(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("count"), "len", returns_scalar=True + ) + + def std(self, ddof: int) -> Self: + from duckdb import FunctionExpression + + if ddof == 1: + func = "stddev_samp" + elif ddof == 0: + func = "stddev_pop" + else: + msg = f"std with ddof {ddof} is not supported in DuckDB" + raise NotImplementedError(msg) + return self._from_call( + lambda _input: FunctionExpression(func, _input), "std", returns_scalar=True + ) + + def var(self, ddof: int) -> Self: + from duckdb import FunctionExpression + + if ddof == 1: + func = "var_samp" + elif ddof == 0: + func = "var_pop" + else: + msg = f"var with ddof {ddof} is not supported in DuckDB" + raise 
NotImplementedError(msg) + return self._from_call( + lambda _input: FunctionExpression(func, _input), "var", returns_scalar=True + ) + + def max(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("max", _input), "max", returns_scalar=True + ) + + def min(self) -> Self: + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression("min", _input), "min", returns_scalar=True + ) + + def is_null(self) -> Self: + return self._from_call( + lambda _input: _input.isnull(), "is_null", returns_scalar=self._returns_scalar + ) + + def is_in(self, other: Sequence[Any]) -> Self: + from duckdb import ConstantExpression + + return self._from_call( + lambda _input: functools.reduce( + lambda x, y: x | _input.isin(ConstantExpression(y)), + other[1:], + _input.isin(ConstantExpression(other[0])), + ), + "is_in", + returns_scalar=self._returns_scalar, + ) + + def round(self, decimals: int) -> Self: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._from_call( + lambda _input: FunctionExpression( + "round", _input, ConstantExpression(decimals) + ), + "round", + returns_scalar=self._returns_scalar, + ) + + def fill_null(self, value: Any, strategy: Any, limit: int | None) -> Self: + from duckdb import CoalesceOperator + from duckdb import ConstantExpression + + if strategy is not None: + msg = "todo" + raise NotImplementedError(msg) + + return self._from_call( + lambda _input: CoalesceOperator(_input, ConstantExpression(value)), + "fill_null", + returns_scalar=self._returns_scalar, + ) + + def cast( + self: Self, + dtype: DType | type[DType], + ) -> Self: + def func(_input: Any, dtype: DType | type[DType]) -> Any: + native_dtype = narwhals_to_native_dtype(dtype, self._version) + return _input.cast(native_dtype) + + return self._from_call( + func, + "cast", + dtype=dtype, + returns_scalar=self._returns_scalar, + ) + + @property + def str(self: Self) -> DuckDBExprStringNamespace: + return DuckDBExprStringNamespace(self) + + @property + def dt(self: Self) -> DuckDBExprDateTimeNamespace: + return DuckDBExprDateTimeNamespace(self) + + +class DuckDBExprStringNamespace: + def __init__(self, expr: DuckDBExpr) -> None: + self._compliant_expr = expr + + def starts_with(self, prefix: str) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "starts_with", _input, ConstantExpression(prefix) + ), + "starts_with", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def ends_with(self, suffix: str) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "ends_with", _input, ConstantExpression(suffix) + ), + "ends_with", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def contains(self, pattern: str, *, literal: bool) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + if literal: + return FunctionExpression("contains", _input, ConstantExpression(pattern)) + return FunctionExpression( + "regexp_matches", _input, ConstantExpression(pattern) + ) + + return self._compliant_expr._from_call( + func, "contains", returns_scalar=self._compliant_expr._returns_scalar + ) + + def slice(self, offset: int, length: 
int) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + def func(_input: duckdb.Expression) -> duckdb.Expression: + return FunctionExpression( + "array_slice", + _input, + ConstantExpression(offset + 1) + if offset >= 0 + else FunctionExpression("length", _input) + offset + 1, + FunctionExpression("length", _input) + if length is None + else ConstantExpression(length) + offset, + ) + + return self._compliant_expr._from_call( + func, "slice", returns_scalar=self._compliant_expr._returns_scalar + ) + + def to_lowercase(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("lower", _input), + "to_lowercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def to_uppercase(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("upper", _input), + "to_uppercase", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def strip_chars(self, characters: str | None) -> DuckDBExpr: + import string + + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "trim", + _input, + ConstantExpression( + string.whitespace if characters is None else characters + ), + ), + "strip_chars", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace_all( + self, pattern: str, value: str, *, literal: bool = False + ) -> DuckDBExpr: + from duckdb import ConstantExpression + from duckdb import FunctionExpression + + if literal is False: + msg = "`replace_all` for DuckDB currently only supports `literal=True`." + raise NotImplementedError(msg) + return self._compliant_expr._from_call( + lambda _input: FunctionExpression( + "replace", _input, ConstantExpression(pattern), ConstantExpression(value) + ), + "replace_all", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def replace(self, pattern: str, value: str, *, literal: bool, n: int) -> NoReturn: + msg = "`replace` is currently not supported for DuckDB" + raise NotImplementedError(msg) + + +class DuckDBExprDateTimeNamespace: + def __init__(self, expr: DuckDBExpr) -> None: + self._compliant_expr = expr + + def year(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("year", _input), + "year", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def month(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("month", _input), + "month", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def day(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("day", _input), + "day", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def hour(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("hour", _input), + "hour", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def minute(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("minute", _input), + "minute", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def 
second(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("second", _input), + "second", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def millisecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("millisecond", _input) + - FunctionExpression("second", _input) * 1_000, + "millisecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def microsecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("microsecond", _input) + - FunctionExpression("second", _input) * 1_000_000, + "microsecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) + + def nanosecond(self) -> DuckDBExpr: + from duckdb import FunctionExpression + + return self._compliant_expr._from_call( + lambda _input: FunctionExpression("nanosecond", _input) + - FunctionExpression("second", _input) * 1_000_000_000, + "nanosecond", + returns_scalar=self._compliant_expr._returns_scalar, + ) diff --git a/narwhals/_duckdb/group_by.py b/narwhals/_duckdb/group_by.py new file mode 100644 index 000000000..0b312ff03 --- /dev/null +++ b/narwhals/_duckdb/group_by.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +from copy import copy +from typing import TYPE_CHECKING + +from narwhals._expression_parsing import parse_into_exprs + +if TYPE_CHECKING: + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.typing import IntoDuckDBExpr + + +class DuckDBGroupBy: + def __init__( + self, + compliant_frame: DuckDBLazyFrame, + keys: list[str], + drop_null_keys: bool, # noqa: FBT001 + ) -> None: + self._compliant_frame = compliant_frame + self._keys = keys + + def agg( + self, + *aggs: IntoDuckDBExpr, + **named_aggs: IntoDuckDBExpr, + ) -> DuckDBLazyFrame: + exprs = parse_into_exprs( + *aggs, + namespace=self._compliant_frame.__narwhals_namespace__(), + **named_aggs, + ) + output_names: list[str] = copy(self._keys) + for expr in exprs: + if expr._output_names is None: # pragma: no cover + msg = ( + "Anonymous expressions are not supported in group_by.agg.\n" + "Instead of `nw.all()`, try using a named expression, such as " + "`nw.col('a', 'b')`\n" + ) + raise ValueError(msg) + + output_names.extend(expr._output_names) + + agg_columns = [ + *self._keys, + *(x for expr in exprs for x in expr(self._compliant_frame)), + ] + try: + return self._compliant_frame._from_native_frame( + self._compliant_frame._native_frame.aggregate( + agg_columns, group_expr=",".join(self._keys) + ) + ) + except ValueError as exc: # pragma: no cover + msg = "Failed to aggregate - does your aggregation function return a scalar?"
+ raise RuntimeError(msg) from exc diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py new file mode 100644 index 000000000..c91d11d3f --- /dev/null +++ b/narwhals/_duckdb/namespace.py @@ -0,0 +1,314 @@ +from __future__ import annotations + +import functools +import operator +from functools import reduce +from typing import TYPE_CHECKING +from typing import Any +from typing import Literal +from typing import Sequence +from typing import cast + +from narwhals._duckdb.expr import DuckDBExpr +from narwhals._duckdb.utils import narwhals_to_native_dtype +from narwhals._expression_parsing import combine_root_names +from narwhals._expression_parsing import parse_into_exprs +from narwhals._expression_parsing import reduce_output_names +from narwhals.typing import CompliantNamespace + +if TYPE_CHECKING: + import duckdb + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.typing import IntoDuckDBExpr + from narwhals.dtypes import DType + from narwhals.utils import Version + + +def get_column_name(df: DuckDBLazyFrame, column: duckdb.Expression) -> str: + return str(df._native_frame.select(column).columns[0]) + + +class DuckDBNamespace(CompliantNamespace["duckdb.Expression"]): + def __init__(self, *, backend_version: tuple[int, ...], version: Version) -> None: + self._backend_version = backend_version + self._version = version + + def all(self) -> DuckDBExpr: + def _all(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + from duckdb import ColumnExpression + + return [ColumnExpression(col_name) for col_name in df.columns] + + return DuckDBExpr( + call=_all, + depth=0, + function_name="all", + root_names=None, + output_names=None, + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + + def concat( + self, + items: Sequence[DuckDBLazyFrame], + *, + how: Literal["horizontal", "vertical", "diagonal"], + ) -> DuckDBLazyFrame: + if how == "horizontal": + msg = "horizontal concat not supported for duckdb. 
Please join instead" + raise TypeError(msg) + if how == "diagonal": + msg = "Not implemented yet" + raise NotImplementedError(msg) + first = items[0] + schema = first.schema + if how == "vertical" and not all(x.schema == schema for x in items[1:]): + msg = "inputs should all have the same schema" + raise TypeError(msg) + res = functools.reduce( + lambda x, y: x.union(y), (item._native_frame for item in items) + ) + return first._from_native_frame(res) + + def all_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [reduce(operator.and_, cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="all_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def any_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [reduce(operator.or_, cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="or_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def max_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + from duckdb import FunctionExpression + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [FunctionExpression("greatest", *cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="max_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def min_horizontal(self, *exprs: IntoDuckDBExpr) -> DuckDBExpr: + from duckdb import FunctionExpression + + parsed_exprs = parse_into_exprs(*exprs, namespace=self) + + def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: + cols = [c for _expr in parsed_exprs for c in _expr(df)] + col_name = get_column_name(df, cols[0]) + return [FunctionExpression("least", *cols).alias(col_name)] + + return DuckDBExpr( + call=func, + depth=max(x._depth for x in parsed_exprs) + 1, + function_name="min_horizontal", + root_names=combine_root_names(parsed_exprs), + output_names=reduce_output_names(parsed_exprs), + returns_scalar=False, + backend_version=self._backend_version, + version=self._version, + kwargs={"exprs": exprs}, + ) + + def when( + self, + *predicates: IntoDuckDBExpr, + ) -> DuckDBWhen: + plx = self.__class__(backend_version=self._backend_version, version=self._version) + condition = plx.all_horizontal(*predicates) + return DuckDBWhen( + condition, self._backend_version, 
returns_scalar=False, version=self._version + ) + + def col(self, *column_names: str) -> DuckDBExpr: + return DuckDBExpr.from_column_names( + *column_names, backend_version=self._backend_version, version=self._version + ) + + def lit(self, value: Any, dtype: DType | None) -> DuckDBExpr: + from duckdb import ConstantExpression + + def func(_df: DuckDBLazyFrame) -> list[duckdb.Expression]: + if dtype is not None: + return [ + ConstantExpression(value) + .cast(narwhals_to_native_dtype(dtype, version=self._version)) + .alias("literal") + ] + return [ConstantExpression(value).alias("literal")] + + return DuckDBExpr( + func, + depth=0, + function_name="lit", + root_names=None, + output_names=["literal"], + returns_scalar=True, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + + def len(self) -> DuckDBExpr: + def func(_df: DuckDBLazyFrame) -> list[duckdb.Expression]: + from duckdb import FunctionExpression + + return [FunctionExpression("count").alias("len")] + + return DuckDBExpr( + call=func, + depth=0, + function_name="len", + root_names=None, + output_names=["len"], + returns_scalar=True, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + + +class DuckDBWhen: + def __init__( + self, + condition: DuckDBExpr, + backend_version: tuple[int, ...], + then_value: Any = None, + otherwise_value: Any = None, + *, + returns_scalar: bool, + version: Version, + ) -> None: + self._backend_version = backend_version + self._condition = condition + self._then_value = then_value + self._otherwise_value = otherwise_value + self._returns_scalar = returns_scalar + self._version = version + + def __call__(self, df: DuckDBLazyFrame) -> Sequence[duckdb.Expression]: + from duckdb import CaseExpression + from duckdb import ConstantExpression + + from narwhals._expression_parsing import parse_into_expr + + plx = df.__narwhals_namespace__() + condition = parse_into_expr(self._condition, namespace=plx)(df)[0] + condition = cast("duckdb.Expression", condition) + + try: + value = parse_into_expr(self._then_value, namespace=plx)(df)[0] + except TypeError: + # `self._then_value` is a scalar and can't be converted to an expression + value = ConstantExpression(self._then_value) + value = cast("duckdb.Expression", value) + + if self._otherwise_value is None: + return [CaseExpression(condition=condition, value=value)] + try: + otherwise_expr = parse_into_expr(self._otherwise_value, namespace=plx) + except TypeError: + # `self._otherwise_value` is a scalar and can't be converted to an expression + return [ + CaseExpression(condition=condition, value=value).otherwise( + ConstantExpression(self._otherwise_value) + ) + ] + otherwise = otherwise_expr(df)[0] + return [CaseExpression(condition=condition, value=value).otherwise(otherwise)] + + def then(self, value: DuckDBExpr | Any) -> DuckDBThen: + self._then_value = value + + return DuckDBThen( + self, + depth=0, + function_name="whenthen", + root_names=None, + output_names=None, + returns_scalar=self._returns_scalar, + backend_version=self._backend_version, + version=self._version, + kwargs={"value": value}, + ) + + +class DuckDBThen(DuckDBExpr): + def __init__( + self, + call: DuckDBWhen, + *, + depth: int, + function_name: str, + root_names: list[str] | None, + output_names: list[str] | None, + returns_scalar: bool, + backend_version: tuple[int, ...], + version: Version, + kwargs: dict[str, Any], + ) -> None: + self._backend_version = backend_version + self._version = version + self._call = call + self._depth 
= depth + self._function_name = function_name + self._root_names = root_names + self._output_names = output_names + self._returns_scalar = returns_scalar + self._kwargs = kwargs + + def otherwise(self, value: DuckDBExpr | Any) -> DuckDBExpr: + # type ignore because we are setting the `_call` attribute to a + # callable object of type `DuckDBWhen`, base class has the attribute as + # only a `Callable` + self._call._otherwise_value = value # type: ignore[attr-defined] + self._function_name = "whenotherwise" + return self diff --git a/narwhals/_duckdb/series.py b/narwhals/_duckdb/series.py index dc7485e98..bec9e0e08 100644 --- a/narwhals/_duckdb/series.py +++ b/narwhals/_duckdb/series.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from typing import Any -from narwhals._duckdb.dataframe import native_to_narwhals_dtype +from narwhals._duckdb.utils import native_to_narwhals_dtype from narwhals.dependencies import get_duckdb if TYPE_CHECKING: diff --git a/narwhals/_duckdb/typing.py b/narwhals/_duckdb/typing.py new file mode 100644 index 000000000..65d1ba3a7 --- /dev/null +++ b/narwhals/_duckdb/typing.py @@ -0,0 +1,16 @@ +from __future__ import annotations # pragma: no cover + +from typing import TYPE_CHECKING # pragma: no cover +from typing import Union # pragma: no cover + +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + from narwhals._duckdb.expr import DuckDBExpr + + IntoDuckDBExpr: TypeAlias = Union[DuckDBExpr, str] diff --git a/narwhals/_duckdb/utils.py b/narwhals/_duckdb/utils.py new file mode 100644 index 000000000..62f126db9 --- /dev/null +++ b/narwhals/_duckdb/utils.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +import re +from functools import lru_cache +from typing import TYPE_CHECKING +from typing import Any +from typing import Sequence + +from narwhals.dtypes import DType +from narwhals.exceptions import InvalidIntoExprError +from narwhals.utils import import_dtypes_module +from narwhals.utils import isinstance_or_issubclass + +if TYPE_CHECKING: + import duckdb + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + from narwhals._duckdb.expr import DuckDBExpr + from narwhals._duckdb.typing import IntoDuckDBExpr + from narwhals.utils import Version + + +def get_column_name( + df: DuckDBLazyFrame, column: duckdb.Expression, *, returns_scalar: bool +) -> str: + if returns_scalar: + return str(df._native_frame.aggregate([column]).columns[0]) + return str(df._native_frame.select(column).columns[0]) + + +def maybe_evaluate(df: DuckDBLazyFrame, obj: Any) -> Any: + import duckdb + + from narwhals._duckdb.expr import DuckDBExpr + + if isinstance(obj, DuckDBExpr): + column_results = obj._call(df) + if len(column_results) != 1: # pragma: no cover + msg = "Multi-output expressions (e.g. 
`nw.all()` or `nw.col('a', 'b')`) not supported in this context" + raise NotImplementedError(msg) + column_result = column_results[0] + if obj._returns_scalar: + msg = "Reductions are not yet supported for DuckDB, at least until they implement duckdb.WindowExpression" + raise NotImplementedError(msg) + return column_result + if isinstance_or_issubclass(obj, DType): + return obj + return duckdb.ConstantExpression(obj) + + +def parse_exprs_and_named_exprs( + df: DuckDBLazyFrame, + *exprs: IntoDuckDBExpr, + **named_exprs: IntoDuckDBExpr, +) -> dict[str, duckdb.Expression]: + result_columns: dict[str, duckdb.Expression] = {} + for expr in exprs: + column_list = _columns_from_expr(df, expr) + if isinstance(expr, str): # pragma: no cover + output_names = [expr] + elif expr._output_names is None: + output_names = [ + get_column_name(df, col, returns_scalar=expr._returns_scalar) + for col in column_list + ] + else: + output_names = expr._output_names + result_columns.update(zip(output_names, column_list)) + for col_alias, expr in named_exprs.items(): + columns_list = _columns_from_expr(df, expr) + if len(columns_list) != 1: # pragma: no cover + msg = "Named expressions must return a single column" + raise AssertionError(msg) + result_columns[col_alias] = columns_list[0] + return result_columns + + +def _columns_from_expr( + df: DuckDBLazyFrame, expr: IntoDuckDBExpr +) -> Sequence[duckdb.Expression]: + if isinstance(expr, str): # pragma: no cover + from duckdb import ColumnExpression + + return [ColumnExpression(expr)] + elif hasattr(expr, "__narwhals_expr__"): + col_output_list = expr._call(df) + if expr._output_names is not None and ( + len(col_output_list) != len(expr._output_names) + ): # pragma: no cover + msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" + raise AssertionError(msg) + return col_output_list + else: + raise InvalidIntoExprError.from_invalid_type(type(expr)) + + +@lru_cache(maxsize=16) +def native_to_narwhals_dtype(duckdb_dtype: str, version: Version) -> DType: + dtypes = import_dtypes_module(version) + if duckdb_dtype == "HUGEINT": + return dtypes.Int128() + if duckdb_dtype == "BIGINT": + return dtypes.Int64() + if duckdb_dtype == "INTEGER": + return dtypes.Int32() + if duckdb_dtype == "SMALLINT": + return dtypes.Int16() + if duckdb_dtype == "TINYINT": + return dtypes.Int8() + if duckdb_dtype == "UHUGEINT": + return dtypes.UInt128() + if duckdb_dtype == "UBIGINT": + return dtypes.UInt64() + if duckdb_dtype == "UINTEGER": + return dtypes.UInt32() + if duckdb_dtype == "USMALLINT": + return dtypes.UInt16() + if duckdb_dtype == "UTINYINT": + return dtypes.UInt8() + if duckdb_dtype == "DOUBLE": + return dtypes.Float64() + if duckdb_dtype == "FLOAT": + return dtypes.Float32() + if duckdb_dtype == "VARCHAR": + return dtypes.String() + if duckdb_dtype == "DATE": + return dtypes.Date() + if duckdb_dtype == "TIMESTAMP": + return dtypes.Datetime() + if duckdb_dtype == "BOOLEAN": + return dtypes.Boolean() + if duckdb_dtype == "INTERVAL": + return dtypes.Duration() + if duckdb_dtype.startswith("STRUCT"): + matchstruc_ = re.findall(r"(\w+)\s+(\w+)", duckdb_dtype) + return dtypes.Struct( + [ + dtypes.Field( + matchstruc_[i][0], + native_to_narwhals_dtype(matchstruc_[i][1], version), + ) + for i in range(len(matchstruc_)) + ] + ) + if match_ := re.match(r"(.*)\[\]$", duckdb_dtype): + return dtypes.List(native_to_narwhals_dtype(match_.group(1), version)) + if match_ := re.match(r"(\w+)\[(\d+)\]", duckdb_dtype): + return dtypes.Array( 
native_to_narwhals_dtype(match_.group(1), version), + int(match_.group(2)), + ) + if duckdb_dtype.startswith("DECIMAL("): + return dtypes.Decimal() + return dtypes.Unknown() # pragma: no cover + + +def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> str: + dtypes = import_dtypes_module(version) + if isinstance_or_issubclass(dtype, dtypes.Float64): + return "DOUBLE" + if isinstance_or_issubclass(dtype, dtypes.Float32): + return "FLOAT" + if isinstance_or_issubclass(dtype, dtypes.Int64): + return "BIGINT" + if isinstance_or_issubclass(dtype, dtypes.Int32): + return "INT" + if isinstance_or_issubclass(dtype, dtypes.Int16): + return "SMALLINT" + if isinstance_or_issubclass(dtype, dtypes.Int8): + return "TINYINT" + if isinstance_or_issubclass(dtype, dtypes.UInt64): + return "UBIGINT" + if isinstance_or_issubclass(dtype, dtypes.UInt32): + return "UINT" + if isinstance_or_issubclass(dtype, dtypes.UInt16): # pragma: no cover + return "USMALLINT" + if isinstance_or_issubclass(dtype, dtypes.UInt8): # pragma: no cover + return "UTINYINT" + if isinstance_or_issubclass(dtype, dtypes.String): + return "VARCHAR" + if isinstance_or_issubclass(dtype, dtypes.Boolean): # pragma: no cover + return "BOOLEAN" + if isinstance_or_issubclass(dtype, dtypes.Categorical): + msg = "Categorical not supported by DuckDB" + raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Datetime): + _time_unit = getattr(dtype, "time_unit", "us") + _time_zone = getattr(dtype, "time_zone", None) + msg = "todo" + raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Duration): # pragma: no cover + _time_unit = getattr(dtype, "time_unit", "us") + msg = "todo" + raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Date): # pragma: no cover + return "DATE" + if isinstance_or_issubclass(dtype, dtypes.List): + msg = "todo" + raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Struct): # pragma: no cover + msg = "todo" + raise NotImplementedError(msg) + if isinstance_or_issubclass(dtype, dtypes.Array): # pragma: no cover + msg = "todo" + raise NotImplementedError(msg) + msg = f"Unknown dtype: {dtype}" # pragma: no cover + raise AssertionError(msg) + + +def binary_operation_returns_scalar(lhs: DuckDBExpr, rhs: DuckDBExpr | Any) -> bool: + # If `rhs` is a DuckDBExpr, we look at `_returns_scalar`. If it isn't, + # it means that it was a scalar (e.g. nw.col('a') + 1), and so we default + # to `True`. + return lhs._returns_scalar and getattr(rhs, "_returns_scalar", True) diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 4d51eb719..99bb3bb24 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -125,6 +125,38 @@ def parse_into_expr( raise InvalidIntoExprError.from_invalid_type(type(into_expr)) +def infer_new_root_output_names( + expr: CompliantExpr[Any], **kwargs: Any +) -> tuple[list[str] | None, list[str] | None]: + """Return new root and output names after chaining expressions. + + Try tracking root and output names by combining them from all expressions appearing in kwargs. + If any anonymous expression appears (e.g. nw.all()), then give up on tracking root names + and just set it to None. 
+ """ + root_names = copy(expr._root_names) + output_names = expr._output_names + for arg in list(kwargs.values()): + if root_names is not None and isinstance(arg, expr.__class__): + if arg._root_names is not None: + root_names.extend(arg._root_names) + else: + root_names = None + output_names = None + break + elif root_names is None: + output_names = None + break + + if not ( + (output_names is None and root_names is None) + or (output_names is not None and root_names is not None) + ): # pragma: no cover + msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" + raise AssertionError(msg) + return root_names, output_names + + @overload def reuse_series_implementation( expr: PandasLikeExprT, @@ -201,30 +233,8 @@ def func(df: CompliantDataFrame) -> Sequence[CompliantSeries]: raise AssertionError(msg) return out - # Try tracking root and output names by combining them from all - # expressions appearing in args and kwargs. If any anonymous - # expression appears (e.g. nw.all()), then give up on tracking root names - # and just set it to None. - root_names = copy(expr._root_names) - output_names = expr._output_names - for arg in list(kwargs.values()): - if root_names is not None and isinstance(arg, expr.__class__): - if arg._root_names is not None: - root_names.extend(arg._root_names) - else: - root_names = None - output_names = None - break - elif root_names is None: - output_names = None - break + root_names, output_names = infer_new_root_output_names(expr, **kwargs) - if not ( - (output_names is None and root_names is None) - or (output_names is not None and root_names is not None) - ): # pragma: no cover - msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" - raise AssertionError(msg) return plx._create_expr_from_callable( # type: ignore[return-value] func, # type: ignore[arg-type] depth=expr._depth + 1, diff --git a/narwhals/_ibis/dataframe.py b/narwhals/_ibis/dataframe.py index f62a31e8b..6fe8997a9 100644 --- a/narwhals/_ibis/dataframe.py +++ b/narwhals/_ibis/dataframe.py @@ -5,7 +5,9 @@ from typing import Any from narwhals.dependencies import get_ibis +from narwhals.utils import Implementation from narwhals.utils import import_dtypes_module +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -69,9 +71,15 @@ def native_to_narwhals_dtype(ibis_dtype: Any, version: Version) -> DType: class IbisInterchangeFrame: - def __init__(self, df: Any, version: Version) -> None: + _implementation = Implementation.IBIS + + def __init__( + self, df: Any, *, backend_version: tuple[int, ...], version: Version + ) -> None: self._native_frame = df self._version = version + self._backend_version = backend_version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_dataframe__(self) -> Any: return self @@ -125,10 +133,14 @@ def __getattr__(self, attr: str) -> Any: raise NotImplementedError(msg) def _change_version(self: Self, version: Version) -> Self: - return self.__class__(self._native_frame, version=version) + return self.__class__( + self._native_frame, version=version, backend_version=self._backend_version + ) def _from_native_frame(self: Self, df: Any) -> Self: - return self.__class__(df, version=self._version) + return self.__class__( + df, version=self._version, backend_version=self._backend_version + ) def collect_schema(self) -> dict[str, DType]: return { diff --git a/narwhals/_pandas_like/dataframe.py 
b/narwhals/_pandas_like/dataframe.py index c10aacec5..e11c02710 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -20,6 +20,7 @@ from narwhals._pandas_like.utils import select_columns_by_name from narwhals._pandas_like.utils import validate_dataframe_comparand from narwhals.dependencies import is_numpy_array +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import generate_temporary_column_name @@ -27,6 +28,7 @@ from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop from narwhals.utils import scale_bytes +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -59,6 +61,7 @@ def __init__( self._implementation = implementation self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def __narwhals_dataframe__(self) -> Self: return self @@ -692,6 +695,9 @@ def unique( # The param `maintain_order` is only here for compatibility with the Polars API # and has no effect on the output. mapped_keep = {"none": False, "any": "first"}.get(keep, keep) + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) return self._from_native_frame( self._native_frame.drop_duplicates(subset=subset, keep=mapped_keep) ) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index df451b492..c694b3420 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -34,6 +34,7 @@ # So, instead of using "cumcount" we use "cumsum" on notna() to get the same result "col->cum_count": "cumsum", "col->shift": "shift", + "col->rank": "rank", } @@ -179,68 +180,31 @@ def __lt__(self, other: PandasLikeExpr | Any) -> Self: def __and__(self, other: PandasLikeExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__and__", other=other) - def __rand__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__and__(self) # type: ignore[no-any-return] - def __or__(self, other: PandasLikeExpr | bool | Any) -> Self: return reuse_series_implementation(self, "__or__", other=other) - def __ror__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__or__(self) # type: ignore[no-any-return] - def __add__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__add__", other=other) - def __radd__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__add__(self) # type: ignore[no-any-return] - def __sub__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__sub__", other=other) - def __rsub__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__sub__(self) # type: ignore[no-any-return] - def __mul__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__mul__", other=other) - def __rmul__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mul__(self) # type: ignore[no-any-return] - def __truediv__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__truediv__", other=other) - def 
__rtruediv__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__truediv__(self) # type: ignore[no-any-return] - def __floordiv__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__floordiv__", other=other) - def __rfloordiv__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__floordiv__(self) # type: ignore[no-any-return] - def __pow__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__pow__", other=other) - def __rpow__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__pow__(self) # type: ignore[no-any-return] - def __mod__(self, other: PandasLikeExpr | Any) -> Self: return reuse_series_implementation(self, "__mod__", other=other) - def __rmod__(self, other: Any) -> Self: - other = self.__narwhals_namespace__().lit(other, dtype=None) - return other.__mod__(self) # type: ignore[no-any-return] - # Unary - def __invert__(self) -> Self: return reuse_series_implementation(self, "__invert__") @@ -298,7 +262,10 @@ def clip(self, lower_bound: Any, upper_bound: Any) -> Self: ) def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], ) -> Self: return reuse_series_implementation( self, @@ -420,7 +387,7 @@ def alias(self, name: str) -> Self: kwargs={**self._kwargs, "name": name}, ) - def over(self, keys: list[str]) -> Self: + def over(self: Self, keys: list[str]) -> Self: if self._function_name in MANY_TO_MANY_AGG_FUNCTIONS_TO_PANDAS_EQUIVALENT: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: @@ -448,9 +415,16 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: df = df.with_columns(~plx.col(*self._root_names).is_null()) if self._function_name == "col->shift": - kwargs = {"periods": self._kwargs.get("n", 1)} - else: - # Cumulative operation + kwargs = {"periods": self._kwargs["n"]} + elif self._function_name == "col->rank": + _method = self._kwargs.get("method", "average") + kwargs = { + "method": "first" if _method == "ordinal" else _method, + "ascending": not self._kwargs["descending"], + "na_option": "keep", + "pct": False, + } + else: # Cumulative operation kwargs = {"skipna": True} res_native = getattr( @@ -654,6 +628,16 @@ def rolling_std( ddof=ddof, ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + return reuse_series_implementation( + self, "rank", method=method, descending=descending + ) + @property def str(self: Self) -> PandasLikeExprStringNamespace: return PandasLikeExprStringNamespace(self) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 3741c7130..a1eca5b5d 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -209,14 +209,13 @@ def agg_pandas( # noqa: PLR0915 is_n_unique = function_name == "nunique" is_std = function_name == "std" is_var = function_name == "var" - ddof = expr._kwargs.get("ddof", 1) for root_name, output_name in zip(expr._root_names, expr._output_names): if is_n_unique: nunique_aggs[output_name] = root_name - elif is_std and ddof != 1: + elif is_std and (ddof := expr._kwargs["ddof"]) != 1: std_aggs[ddof][0].append(root_name) std_aggs[ddof][1].append(output_name) - elif is_var and ddof != 1: + elif is_var and (ddof := 
expr._kwargs["ddof"]) != 1: var_aggs[ddof][0].append(root_name) var_aggs[ddof][1].append(output_name) else: @@ -337,7 +336,7 @@ def agg_pandas( # noqa: PLR0915 "pandas API. If you can, please rewrite your query such that group-by aggregations " "are simple (e.g. mean, std, min, max, ...). \n\n" "Please see: " - "https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation.md/", + "https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/", UserWarning, stacklevel=find_stacklevel(), ) @@ -349,7 +348,7 @@ def func(df: Any) -> Any: results_keys = expr(from_dataframe(df)) if not all(len(x) == 1 for x in results_keys): msg = f"Aggregation '{expr._function_name}' failed to aggregate - does your aggregation function return a scalar? \ - \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation.md/" + \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/" raise ValueError(msg) for result_keys in results_keys: diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 7885d7de0..212c9c938 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -371,12 +371,7 @@ def when( plx = self.__class__( self._implementation, self._backend_version, version=self._version ) - if predicates: - condition = plx.all_horizontal(*predicates) - else: - msg = "at least one predicate needs to be provided" - raise TypeError(msg) - + condition = plx.all_horizontal(*predicates) return PandasWhen( condition, self._implementation, self._backend_version, version=self._version ) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index cf8972deb..35ec672e4 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -24,6 +24,7 @@ from narwhals.typing import CompliantSeries from narwhals.utils import Implementation from narwhals.utils import import_dtypes_module +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -94,6 +95,7 @@ def __init__( self._implementation = implementation self._backend_version = backend_version self._version = version + validate_backend_version(self._implementation, self._backend_version) def __native_namespace__(self: Self) -> ModuleType: if self._implementation in { @@ -261,7 +263,10 @@ def to_list(self) -> Any: return self._native_series.to_list() def is_between( - self, lower_bound: Any, upper_bound: Any, closed: str = "both" + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], ) -> PandasLikeSeries: ser = self._native_series _, lower_bound = broadcast_align_and_extract_native(self, lower_bound) @@ -298,13 +303,13 @@ def arg_true(self) -> PandasLikeSeries: def arg_min(self) -> int: ser = self._native_series if self._implementation is Implementation.PANDAS and self._backend_version < (1,): - return ser.values.argmin() # type: ignore[no-any-return] # noqa: PD011 + return ser.values.argmin() # type: ignore[no-any-return] return ser.argmin() # type: ignore[no-any-return] def arg_max(self) -> int: ser = self._native_series if self._implementation is Implementation.PANDAS and self._backend_version < (1,): - return ser.values.argmax() # type: ignore[no-any-return] # noqa: PD011 + return ser.values.argmax() # type: ignore[no-any-return] return ser.argmax() # type: ignore[no-any-return] # Binary comparisons @@ -1117,6 +1122,56 @@ def 
is_finite(self: Self) -> Self: s = self._native_series return self._from_native_series((s > float("-inf")) & (s < float("inf"))) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"], + *, + descending: bool, + ) -> Self: + pd_method = "first" if method == "ordinal" else method + native_series = self._native_series + dtypes = import_dtypes_module(self._version) + if ( + self._implementation is Implementation.PANDAS + and self._backend_version < (3,) + and self.dtype + in { + dtypes.Int64, + dtypes.Int32, + dtypes.Int16, + dtypes.Int8, + dtypes.UInt64, + dtypes.UInt32, + dtypes.UInt16, + dtypes.UInt8, + } + and (null_mask := native_series.isna()).any() + ): + # crazy workaround for the case of `na_option="keep"` and nullable + # integer dtypes. This should be supported in pandas > 3.0 + # https://github.com/pandas-dev/pandas/issues/56976 + ranked_series = ( + native_series.to_frame() + .assign(**{f"{native_series.name}_is_null": null_mask}) + .groupby(f"{native_series.name}_is_null") + .rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + )[native_series.name] + ) + + else: + ranked_series = native_series.rank( + method=pd_method, + na_option="keep", + ascending=not descending, + pct=False, + ) + + return self._from_native_series(ranked_series) + @property def str(self) -> PandasLikeSeriesStringNamespace: return PandasLikeSeriesStringNamespace(self) diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index 655e60773..08d490581 100644 --- a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -118,7 +118,7 @@ def broadcast_align_and_extract_native( lhs_index = lhs._native_series.index if isinstance(rhs, PandasLikeDataFrame): - return NotImplemented + return NotImplemented # type: ignore[no-any-return] if isinstance(rhs, PandasLikeSeries): rhs_index = rhs._native_series.index @@ -637,10 +637,11 @@ def narwhals_to_native_dtype( # noqa: PLR0915 else f"timedelta64[{du_time_unit}]" ) if isinstance_or_issubclass(dtype, dtypes.Date): - if dtype_backend == "pyarrow-nullable": - return "date32[pyarrow]" - msg = "Date dtype only supported for pyarrow-backed data types in pandas" - raise NotImplementedError(msg) + try: + import pyarrow as pa # ignore-banned-import + except ModuleNotFoundError: # pragma: no cover + msg = "PyArrow>=11.0.0 is required for `Date` dtype." 
+            raise ModuleNotFoundError(msg) from None
+ return "date32[pyarrow]" if isinstance_or_issubclass(dtype, dtypes.Enum): msg = "Converting to Enum is not (yet) supported" raise NotImplementedError(msg) diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 760b5f4b6..d5e115284 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -15,6 +15,7 @@ from narwhals.utils import Implementation from narwhals.utils import is_sequence_but_not_str from narwhals.utils import parse_columns_to_drop +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -45,6 +46,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.POLARS self._version = version + validate_backend_version(self._implementation, self._backend_version) def __repr__(self: Self) -> str: # pragma: no cover return "PolarsDataFrame" @@ -343,6 +345,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.POLARS self._version = version + validate_backend_version(self._implementation, self._backend_version) def __repr__(self: Self) -> str: # pragma: no cover return "PolarsLazyFrame" diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 230ce37d5..0e4240010 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -186,27 +186,15 @@ def __or__(self: Self, other: PolarsExpr | bool | Any) -> Self: def __add__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__add__(extract_native(other))) - def __radd__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__radd__(extract_native(other))) - def __sub__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__sub__(extract_native(other))) - def __rsub__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__rsub__(extract_native(other))) - def __mul__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__mul__(extract_native(other))) - def __rmul__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__rmul__(extract_native(other))) - def __pow__(self: Self, other: Any) -> Self: return self._from_native_expr(self._native_expr.__pow__(extract_native(other))) - def __rpow__(self: Self, other: Any) -> Self: - return self._from_native_expr(self._native_expr.__rpow__(extract_native(other))) - def __invert__(self: Self) -> Self: return self._from_native_expr(self._native_expr.__invert__()) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 30cd90fd5..33572db7c 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -10,6 +10,7 @@ from narwhals._polars.utils import narwhals_to_native_dtype from narwhals._polars.utils import native_to_narwhals_dtype from narwhals.utils import Implementation +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from types import ModuleType @@ -38,6 +39,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.POLARS self._version = version + validate_backend_version(self._implementation, self._backend_version) def __repr__(self: Self) -> str: # pragma: no cover return "PolarsSeries" diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index f4a779c23..101d5ad24 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -1,5 +1,6 @@ from __future__ import annotations +from 
itertools import chain from typing import TYPE_CHECKING from typing import Any from typing import Iterable @@ -8,10 +9,12 @@ from narwhals._spark_like.utils import native_to_narwhals_dtype from narwhals._spark_like.utils import parse_exprs_and_named_exprs +from narwhals.exceptions import ColumnNotFoundError from narwhals.utils import Implementation from narwhals.utils import flatten from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version +from narwhals.utils import validate_backend_version if TYPE_CHECKING: from pyspark.sql import DataFrame @@ -37,6 +40,7 @@ def __init__( self._backend_version = backend_version self._implementation = Implementation.PYSPARK self._version = version + validate_backend_version(self._implementation, self._backend_version) def __native_namespace__(self) -> Any: # pragma: no cover if self._implementation is Implementation.PYSPARK: @@ -104,9 +108,11 @@ def select( new_columns_list = [col.alias(col_name) for col_name, col in new_columns.items()] return self._from_native_frame(self._native_frame.select(*new_columns_list)) - def filter(self, *predicates: SparkLikeExpr) -> Self: + def filter(self, *predicates: SparkLikeExpr, **constraints: Any) -> Self: plx = self.__narwhals_namespace__() - expr = plx.all_horizontal(*predicates) + expr = plx.all_horizontal( + *chain(predicates, (plx.col(name) == v for name, v in constraints.items())) + ) # `[0]` is safe as all_horizontal's expression only returns a single column condition = expr._call(self)[0] spark_df = self._native_frame.where(condition) @@ -201,6 +207,11 @@ def unique( if keep != "any": msg = "`LazyFrame.unique` with PySpark backend only supports `keep='any'`." raise ValueError(msg) + + if subset is not None and any(x not in self.columns for x in subset): + msg = f"Column(s) {subset} not found in {self.columns}" + raise ColumnNotFoundError(msg) + subset = [subset] if isinstance(subset, str) else subset return self._from_native_frame(self._native_frame.dropDuplicates(subset=subset)) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index d33e9e3c7..32139cf01 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -1,11 +1,12 @@ from __future__ import annotations -from copy import copy from typing import TYPE_CHECKING from typing import Any from typing import Callable +from typing import Literal from typing import Sequence +from narwhals._expression_parsing import infer_new_root_output_names from narwhals._spark_like.utils import get_column_name from narwhals._spark_like.utils import maybe_evaluate from narwhals.typing import CompliantExpr @@ -106,30 +107,7 @@ def func(df: SparkLikeLazyFrame) -> list[Column]: results.append(column_result) return results - # Try tracking root and output names by combining them from all - # expressions appearing in args and kwargs. If any anonymous - # expression appears (e.g. nw.all()), then give up on tracking root names - # and just set it to None. 
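+        # Name tracking is shared with `reuse_series_implementation` via this helper.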
- root_names = copy(self._root_names) - output_names = self._output_names - for arg in list(kwargs.values()): - if root_names is not None and isinstance(arg, self.__class__): - if arg._root_names is not None: - root_names.extend(arg._root_names) - else: # pragma: no cover - root_names = None - output_names = None - break - elif root_names is None: - output_names = None - break - - if not ( - (output_names is None and root_names is None) - or (output_names is not None and root_names is not None) - ): # pragma: no cover - msg = "Safety assertion failed, please report a bug to https://github.com/narwhals-dev/narwhals/issues" - raise AssertionError(msg) + root_names, output_names = infer_new_root_output_names(self, **kwargs) return self.__class__( func, @@ -143,9 +121,25 @@ def func(df: SparkLikeLazyFrame) -> list[Column]: kwargs=kwargs, ) + def __eq__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input.__eq__(other), + "__eq__", + other=other, + returns_scalar=False, + ) + + def __ne__(self, other: SparkLikeExpr) -> Self: # type: ignore[override] + return self._from_call( + lambda _input, other: _input.__ne__(other), + "__ne__", + other=other, + returns_scalar=False, + ) + def __add__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input + other, + lambda _input, other: _input.__add__(other), "__add__", other=other, returns_scalar=False, @@ -153,7 +147,7 @@ def __add__(self, other: SparkLikeExpr) -> Self: def __sub__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input - other, + lambda _input, other: _input.__sub__(other), "__sub__", other=other, returns_scalar=False, @@ -161,16 +155,50 @@ def __sub__(self, other: SparkLikeExpr) -> Self: def __mul__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input * other, + lambda _input, other: _input.__mul__(other), "__mul__", other=other, returns_scalar=False, ) - def __lt__(self, other: SparkLikeExpr) -> Self: + def __truediv__(self, other: SparkLikeExpr) -> Self: return self._from_call( - lambda _input, other: _input < other, - "__lt__", + lambda _input, other: _input.__truediv__(other), + "__truediv__", + other=other, + returns_scalar=False, + ) + + def __floordiv__(self, other: SparkLikeExpr) -> Self: + def _floordiv(_input: Column, other: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + return F.floor(_input / other) + + return self._from_call( + _floordiv, "__floordiv__", other=other, returns_scalar=False + ) + + def __pow__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__pow__(other), + "__pow__", + other=other, + returns_scalar=False, + ) + + def __mod__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__mod__(other), + "__mod__", + other=other, + returns_scalar=False, + ) + + def __ge__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__ge__(other), + "__ge__", other=other, returns_scalar=False, ) @@ -183,6 +211,50 @@ def __gt__(self, other: SparkLikeExpr) -> Self: returns_scalar=False, ) + def __le__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__le__(other), + "__le__", + other=other, + returns_scalar=False, + ) + + def __lt__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__lt__(other), + "__lt__", + 
other=other, + returns_scalar=False, + ) + + def __and__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__and__(other), + "__and__", + other=other, + returns_scalar=False, + ) + + def __or__(self, other: SparkLikeExpr) -> Self: + return self._from_call( + lambda _input, other: _input.__or__(other), + "__or__", + other=other, + returns_scalar=False, + ) + + def __invert__(self) -> Self: + return self._from_call( + lambda _input: _input.__invert__(), + "__invert__", + returns_scalar=self._returns_scalar, + ) + + def abs(self) -> Self: + from pyspark.sql import functions as F # noqa: N812 + + return self._from_call(F.abs, "abs", returns_scalar=self._returns_scalar) + def alias(self, name: str) -> Self: def _alias(df: SparkLikeLazyFrame) -> list[Column]: return [col.alias(name) for col in self._call(df)] @@ -212,36 +284,37 @@ def any(self) -> Self: return self._from_call(F.bool_or, "any", returns_scalar=True) def count(self) -> Self: - def _count(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 - - return F.count(_input) + from pyspark.sql import functions as F # noqa: N812 - return self._from_call(_count, "count", returns_scalar=True) + return self._from_call(F.count, "count", returns_scalar=True) def max(self) -> Self: - def _max(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 - - return F.max(_input) + from pyspark.sql import functions as F # noqa: N812 - return self._from_call(_max, "max", returns_scalar=True) + return self._from_call(F.max, "max", returns_scalar=True) def mean(self) -> Self: - def _mean(_input: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + return self._from_call(F.mean, "mean", returns_scalar=True) + + def median(self) -> Self: + def _median(_input: Column) -> Column: + import pyspark # ignore-banned-import from pyspark.sql import functions as F # noqa: N812 - return F.mean(_input) + if parse_version(pyspark.__version__) < (3, 4): + # Use percentile_approx with default accuracy parameter (10000) + return F.percentile_approx(_input.cast("double"), 0.5) - return self._from_call(_mean, "mean", returns_scalar=True) + return F.median(_input) - def min(self) -> Self: - def _min(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 + return self._from_call(_median, "median", returns_scalar=True) - return F.min(_input) + def min(self) -> Self: + from pyspark.sql import functions as F # noqa: N812 - return self._from_call(_min, "min", returns_scalar=True) + return self._from_call(F.min, "min", returns_scalar=True) def null_count(self) -> Self: def _null_count(_input: Column) -> Column: @@ -252,12 +325,9 @@ def _null_count(_input: Column) -> Column: return self._from_call(_null_count, "null_count", returns_scalar=True) def sum(self) -> Self: - def _sum(_input: Column) -> Column: - from pyspark.sql import functions as F # noqa: N812 - - return F.sum(_input) + from pyspark.sql import functions as F # noqa: N812 - return self._from_call(_sum, "sum", returns_scalar=True) + return self._from_call(F.sum, "sum", returns_scalar=True) def std(self: Self, ddof: int) -> Self: from functools import partial @@ -266,12 +336,7 @@ def std(self: Self, ddof: int) -> Self: from narwhals._spark_like.utils import _std - func = partial( - _std, - ddof=ddof, - backend_version=self._backend_version, - np_version=parse_version(np.__version__), - ) + func = partial(_std, ddof=ddof, np_version=parse_version(np.__version__)) return 
self._from_call(func, "std", returns_scalar=True, ddof=ddof) @@ -282,11 +347,136 @@ def var(self: Self, ddof: int) -> Self: from narwhals._spark_like.utils import _var - func = partial( - _var, - ddof=ddof, - backend_version=self._backend_version, - np_version=parse_version(np.__version__), - ) + func = partial(_var, ddof=ddof, np_version=parse_version(np.__version__)) return self._from_call(func, "var", returns_scalar=True, ddof=ddof) + + def clip( + self, + lower_bound: Any | None = None, + upper_bound: Any | None = None, + ) -> Self: + def _clip(_input: Column, lower_bound: Any, upper_bound: Any) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + result = _input + if lower_bound is not None: + # Convert lower_bound to a literal Column + result = F.when(result < lower_bound, F.lit(lower_bound)).otherwise( + result + ) + if upper_bound is not None: + # Convert upper_bound to a literal Column + result = F.when(result > upper_bound, F.lit(upper_bound)).otherwise( + result + ) + return result + + return self._from_call( + _clip, + "clip", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def is_between( + self, + lower_bound: Any, + upper_bound: Any, + closed: Literal["left", "right", "none", "both"], + ) -> Self: + def _is_between(_input: Column, lower_bound: Any, upper_bound: Any) -> Column: + if closed == "both": + return (_input >= lower_bound) & (_input <= upper_bound) + if closed == "none": + return (_input > lower_bound) & (_input < upper_bound) + if closed == "left": + return (_input >= lower_bound) & (_input < upper_bound) + return (_input > lower_bound) & (_input <= upper_bound) + + return self._from_call( + _is_between, + "is_between", + lower_bound=lower_bound, + upper_bound=upper_bound, + returns_scalar=self._returns_scalar, + ) + + def is_duplicated(self) -> Self: + def _is_duplicated(_input: Column) -> Column: + from pyspark.sql import Window + from pyspark.sql import functions as F # noqa: N812 + + # Create a window spec that treats each value separately. 
+ return F.count("*").over(Window.partitionBy(_input)) > 1 + + return self._from_call( + _is_duplicated, "is_duplicated", returns_scalar=self._returns_scalar + ) + + def is_finite(self) -> Self: + def _is_finite(_input: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + # A value is finite if it's not NaN, not NULL, and not infinite + return ( + ~F.isnan(_input) + & ~F.isnull(_input) + & (_input != float("inf")) + & (_input != float("-inf")) + ) + + return self._from_call( + _is_finite, "is_finite", returns_scalar=self._returns_scalar + ) + + def is_in(self, values: Sequence[Any]) -> Self: + def _is_in(_input: Column, values: Sequence[Any]) -> Column: + return _input.isin(values) + + return self._from_call( + _is_in, + "is_in", + values=values, + returns_scalar=self._returns_scalar, + ) + + def is_unique(self) -> Self: + def _is_unique(_input: Column) -> Column: + from pyspark.sql import Window + from pyspark.sql import functions as F # noqa: N812 + + # Create a window spec that treats each value separately + return F.count("*").over(Window.partitionBy(_input)) == 1 + + return self._from_call( + _is_unique, "is_unique", returns_scalar=self._returns_scalar + ) + + def len(self) -> Self: + def _len(_input: Column) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + # Use count(*) to count all rows including nulls + return F.count("*") + + return self._from_call(_len, "len", returns_scalar=True) + + def round(self, decimals: int) -> Self: + def _round(_input: Column, decimals: int) -> Column: + from pyspark.sql import functions as F # noqa: N812 + + return F.round(_input, decimals) + + return self._from_call( + _round, + "round", + decimals=decimals, + returns_scalar=self._returns_scalar, + ) + + def skew(self) -> Self: + from pyspark.sql import functions as F # noqa: N812 + + return self._from_call(F.skewness, "skew", returns_scalar=True) diff --git a/narwhals/_spark_like/group_by.py b/narwhals/_spark_like/group_by.py index d53237b59..cbcf87692 100644 --- a/narwhals/_spark_like/group_by.py +++ b/narwhals/_spark_like/group_by.py @@ -79,21 +79,26 @@ def _from_native_frame(self, df: SparkLikeLazyFrame) -> SparkLikeLazyFrame: ) -def get_spark_function( - function_name: str, backend_version: tuple[int, ...], **kwargs: Any -) -> Column: +def get_spark_function(function_name: str, **kwargs: Any) -> Column: + from pyspark.sql import functions as F # noqa: N812 + if function_name in {"std", "var"}: import numpy as np # ignore-banned-import return partial( _std if function_name == "std" else _var, - ddof=kwargs.get("ddof", 1), - backend_version=backend_version, + ddof=kwargs["ddof"], np_version=parse_version(np.__version__), ) - from pyspark.sql import functions as F # noqa: N812 + elif function_name == "len": + # Use count(*) to count all rows including nulls + def _count(*_args: Any, **_kwargs: Any) -> Column: + return F.count("*") + + return _count - return getattr(F, function_name) + else: + return getattr(F, function_name) def agg_pyspark( @@ -127,9 +132,7 @@ def agg_pyspark( function_name = POLARS_TO_PYSPARK_AGGREGATIONS.get( expr._function_name, expr._function_name ) - agg_func = get_spark_function( - function_name, backend_version=expr._backend_version, **expr._kwargs - ) + agg_func = get_spark_function(function_name, **expr._kwargs) simple_aggregations.update( {output_name: agg_func(keys[0]) for output_name in expr._output_names} ) @@ -143,12 +146,7 @@ def agg_pyspark( raise AssertionError(msg) function_name = remove_prefix(expr._function_name, 
"col->") - pyspark_function = POLARS_TO_PYSPARK_AGGREGATIONS.get( - function_name, function_name - ) - agg_func = get_spark_function( - pyspark_function, backend_version=expr._backend_version, **expr._kwargs - ) + agg_func = get_spark_function(function_name, **expr._kwargs) simple_aggregations.update( { @@ -162,6 +160,6 @@ def agg_pyspark( result_simple = grouped.agg(*agg_columns) except ValueError as exc: # pragma: no cover msg = "Failed to aggregated - does your aggregation function return a scalar? \ - \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation.md/" + \n\n Please see: https://narwhals-dev.github.io/narwhals/pandas_like_concepts/improve_group_by_operation/" raise RuntimeError(msg) from exc return from_dataframe(result_simple) diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index d34867b00..56cc4d271 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -16,6 +16,7 @@ from narwhals._spark_like.dataframe import SparkLikeLazyFrame from narwhals._spark_like.typing import IntoSparkLikeExpr + from narwhals.dtypes import DType from narwhals.utils import Version @@ -67,6 +68,28 @@ def col(self, *column_names: str) -> SparkLikeExpr: *column_names, backend_version=self._backend_version, version=self._version ) + def lit(self, value: object, dtype: DType | None) -> SparkLikeExpr: + if dtype is not None: + msg = "todo" + raise NotImplementedError(msg) + + def _lit(_: SparkLikeLazyFrame) -> list[Column]: + import pyspark.sql.functions as F # noqa: N812 + + return [F.lit(value).alias("literal")] + + return SparkLikeExpr( # type: ignore[abstract] + call=_lit, + depth=0, + function_name="lit", + root_names=None, + output_names=["literal"], + returns_scalar=True, + backend_version=self._backend_version, + version=self._version, + kwargs={}, + ) + def sum_horizontal(self, *exprs: IntoSparkLikeExpr) -> SparkLikeExpr: parsed_exprs = parse_into_exprs(*exprs, namespace=self) diff --git a/narwhals/_spark_like/utils.py b/narwhals/_spark_like/utils.py index a3c77033c..fb3a3f3c4 100644 --- a/narwhals/_spark_like/utils.py +++ b/narwhals/_spark_like/utils.py @@ -120,13 +120,8 @@ def maybe_evaluate(df: SparkLikeLazyFrame, obj: Any) -> Any: return obj -def _std( - _input: Column | str, - ddof: int, - backend_version: tuple[int, ...], - np_version: tuple[int, ...], -) -> Column: - if backend_version < (3, 5) or np_version > (2, 0): +def _std(_input: Column | str, ddof: int, np_version: tuple[int, ...]) -> Column: + if np_version > (2, 0): from pyspark.sql import functions as F # noqa: N812 if ddof == 1: @@ -142,13 +137,8 @@ def _std( return stddev(input_col, ddof=ddof) -def _var( - _input: Column | str, - ddof: int, - backend_version: tuple[int, ...], - np_version: tuple[int, ...], -) -> Column: - if backend_version < (3, 5) or np_version > (2, 0): +def _var(_input: Column | str, ddof: int, np_version: tuple[int, ...]) -> Column: + if np_version > (2, 0): from pyspark.sql import functions as F # noqa: N812 if ddof == 1: diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index fcf8648dc..dd786ef3d 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -5015,7 +5015,7 @@ def join_asof( ... datetime(2016, 5, 25, 13, 30, 0, 23), ... datetime(2016, 5, 25, 13, 30, 0, 38), ... datetime(2016, 5, 25, 13, 30, 0, 48), - ... datetime(2016, 5, 25, 13, 30, 0, 48), + ... datetime(2016, 5, 25, 13, 30, 0, 49), ... datetime(2016, 5, 25, 13, 30, 0, 48), ... ], ... 
"ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], @@ -5036,7 +5036,7 @@ def join_asof( ... df = nw.from_native(df_native) ... other = nw.from_native(other_native) ... return ( - ... df.sort("datetime") + ... df.sort("datetime", "ticker") ... .join_asof(other, on="datetime", by="ticker") ... .sort("datetime", "ticker") ... .collect() @@ -5056,15 +5056,15 @@ def join_asof( │ 2016-05-25 13:30:00.000038 ┆ MSFT ┆ 51.95 ┆ 155 ┆ 51.97 ┆ 51.98 │ │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.77 ┆ 100 ┆ 720.5 ┆ 720.93 │ - │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ + │ 2016-05-25 13:30:00.000049 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ └────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ >>> agnostic_join_asof_datetime_by_ticker(trades_dask, quotes_dask) datetime ticker price quantity bid ask 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 0 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 - 2 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN - 1 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 - 3 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 1 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.000049 GOOG 720.92 100 720.50 720.93 """ return super().join_asof( other, diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 0c5d11720..43904a0ba 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -87,16 +87,16 @@ def get_duckdb() -> Any: return sys.modules.get("duckdb", None) -def get_dask_expr() -> Any: - """Get dask_expr module (if already imported - else return None).""" - return sys.modules.get("dask_expr", None) - - def get_ibis() -> Any: """Get ibis module (if already imported - else return None).""" return sys.modules.get("ibis", None) +def get_dask_expr() -> Any: + """Get dask_expr module (if already imported - else return None).""" + return sys.modules.get("dask_expr", None) + + def get_pyspark() -> Any: # pragma: no cover """Get pyspark module (if already imported - else return None).""" return sys.modules.get("pyspark", None) diff --git a/narwhals/expr.py b/narwhals/expr.py index 6e3cacb02..807a7f04b 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -19,14 +19,19 @@ from typing_extensions import Self from narwhals.dtypes import DType + from narwhals.typing import CompliantExpr + from narwhals.typing import CompliantNamespace + from narwhals.typing import CompliantSeriesT_co from narwhals.typing import IntoExpr -def extract_compliant(expr: Expr, other: Any) -> Any: +def extract_compliant( + plx: CompliantNamespace[CompliantSeriesT_co], other: Any +) -> CompliantExpr[CompliantSeriesT_co] | CompliantSeriesT_co | Any: from narwhals.series import Series if isinstance(other, Expr): - return other._to_compliant_expr(expr) + return other._to_compliant_expr(plx) if isinstance(other, Series): return other._compliant_series return other @@ -58,23 +63,27 @@ def alias(self, name: str) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [4, 5]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [4, 5]}) + >>> + >>> data = {"a": [1, 2], "b": [4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def 
my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_alias(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select((nw.col("b") + 10).alias("c")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_alias`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_alias(df_pd) c 0 14 1 15 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_alias(df_pl) shape: (2, 1) ┌─────┐ │ c │ @@ -84,7 +93,8 @@ def alias(self, name: str) -> Self: │ 14 │ │ 15 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_alias(df_pa) pyarrow.Table c: int64 ---- @@ -105,11 +115,12 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se A new expression. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -117,19 +128,21 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se Lets define a library-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_pipe(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").pipe(lambda x: x + 1)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_pipe`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_pipe(df_pd) a 0 2 1 3 2 4 3 5 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_pipe(df_pl) shape: (4, 1) ┌─────┐ │ a │ @@ -141,7 +154,8 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se │ 4 │ │ 5 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_pipe(df_pa) pyarrow.Table a: int64 ---- @@ -164,27 +178,29 @@ def cast(self: Self, dtype: DType | type[DType]) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> from datetime import date - >>> df_pd = pd.DataFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}) - >>> df_pl = pl.DataFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}) - >>> df_pa = pa.table({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]}) + >>> + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cast(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("foo").cast(nw.Float32), nw.col("bar").cast(nw.UInt8) ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cast`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_cast(df_pd) foo bar 0 1.0 6 1 2.0 7 2 3.0 8 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_cast(df_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -195,7 +211,7 @@ def cast(self: Self, dtype: DType | type[DType]) -> Self: │ 2.0 ┆ 7 │ │ 3.0 ┆ 8 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_cast(df_pa) pyarrow.Table foo: float bar: uint8 @@ -227,11 +243,12 @@ def __and__(self, other: Any) -> Self: ) def __rand__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rand__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__and__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __or__(self, other: Any) -> Self: return self.__class__( @@ -239,11 +256,12 @@ def __or__(self, other: Any) -> Self: ) def __ror__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__ror__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__or__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __add__(self, other: Any) -> Self: return self.__class__( @@ -253,11 +271,12 @@ def __add__(self, other: Any) -> Self: ) def __radd__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__radd__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__add__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __sub__(self, other: Any) -> Self: return self.__class__( @@ -267,11 +286,12 @@ def __sub__(self, other: Any) -> Self: ) def __rsub__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rsub__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__sub__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __truediv__(self, other: Any) -> Self: return self.__class__( @@ -281,11 +301,12 @@ def __truediv__(self, other: Any) -> Self: ) def __rtruediv__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rtruediv__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__truediv__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __mul__(self, other: Any) -> Self: return self.__class__( @@ -295,11 +316,12 @@ def __mul__(self, other: Any) -> Self: ) def __rmul__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rmul__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__mul__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __le__(self, other: Any) -> Self: return self.__class__( @@ -329,11 +351,12 @@ def __pow__(self, other: 
Any) -> Self: ) def __rpow__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rpow__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__pow__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __floordiv__(self, other: Any) -> Self: return self.__class__( @@ -343,11 +366,12 @@ def __floordiv__(self, other: Any) -> Self: ) def __rfloordiv__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rfloordiv__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__floordiv__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) def __mod__(self, other: Any) -> Self: return self.__class__( @@ -357,11 +381,12 @@ def __mod__(self, other: Any) -> Self: ) def __rmod__(self, other: Any) -> Self: - return self.__class__( - lambda plx: self._to_compliant_expr(plx).__rmod__( - extract_compliant(plx, other) + def func(plx: CompliantNamespace[Any]) -> CompliantExpr[Any]: + return plx.lit(extract_compliant(plx, other), dtype=None).__mod__( + extract_compliant(plx, self) ) - ) + + return self.__class__(func) # --- unary --- def __invert__(self) -> Self: @@ -379,22 +404,26 @@ def any(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pl = pl.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pa = pa.table({"a": [True, False], "b": [True, True]}) + >>> + >>> data = {"a": [True, False], "b": [True, True]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_any(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").any()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_any`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_any(df_pd) a b 0 True True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_any(df_pl) shape: (1, 2) ┌──────┬──────┐ │ a ┆ b │ @@ -403,7 +432,8 @@ def any(self) -> Self: ╞══════╪══════╡ │ true ┆ true │ └──────┴──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_any(df_pa) pyarrow.Table a: bool b: bool @@ -425,22 +455,26 @@ def all(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pl = pl.DataFrame({"a": [True, False], "b": [True, True]}) - >>> df_pa = pa.table({"a": [True, False], "b": [True, True]}) + >>> + >>> data = {"a": [True, False], "b": [True, True]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").all()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all(df_pd) a b 0 False True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_all(df_pl) shape: (1, 2) ┌───────┬──────┐ │ a ┆ b │ @@ -449,7 +483,8 @@ def all(self) -> Self: ╞═══════╪══════╡ │ false ┆ true │ └───────┴──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_all(df_pa) pyarrow.Table a: bool b: bool @@ -516,27 +551,28 @@ def ewm_mean( >>> import polars as pl >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_ewm_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").ewm_mean(com=1, ignore_nulls=False) ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass either pandas or Polars to `agnostic_ewm_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_ewm_mean(df_pd) a 0 1.000000 1 1.666667 2 2.428571 - >>> my_library_agnostic_function(df_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_ewm_mean(df_pl) # doctest: +NORMALIZE_WHITESPACE shape: (3, 1) ┌──────────┐ │ a │ @@ -572,22 +608,26 @@ def mean(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [-1, 0, 1], "b": [2, 4, 6]}) - >>> df_pl = pl.DataFrame({"a": [-1, 0, 1], "b": [2, 4, 6]}) - >>> df_pa = pa.table({"a": [-1, 0, 1], "b": [2, 4, 6]}) + >>> + >>> data = {"a": [-1, 0, 1], "b": [2, 4, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").mean()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean(df_pd) a b 0 0.0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_mean(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -596,7 +636,8 @@ def mean(self) -> Self: ╞═════╪═════╡ │ 0.0 ┆ 4.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_mean(df_pa) pyarrow.Table a: double b: double @@ -621,22 +662,26 @@ def median(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]}) - >>> df_pl = pl.DataFrame({"a": [1, 8, 3], "b": [4, 5, 2]}) - >>> df_pa = pa.table({"a": [1, 8, 3], "b": [4, 5, 2]}) + >>> + >>> data = {"a": [1, 8, 3], "b": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: ... 
df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").median()).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_median(df_pd) a b 0 3.0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_median(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -645,7 +690,8 @@ def median(self) -> Self: ╞═════╪═════╡ │ 3.0 ┆ 4.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_median(df_pa) pyarrow.Table a: double b: double @@ -660,7 +706,7 @@ def std(self, *, ddof: int = 1) -> Self: Arguments: ddof: "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof, - where N represents the number of elements. By default ddof is 1. + where N represents the number of elements. By default ddof is 1. Returns: A new expression. @@ -671,22 +717,25 @@ def std(self, *, ddof: int = 1) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pl = pl.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pa = pa.table({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) + >>> + >>> data = {"a": [20, 25, 60], "b": [1.5, 1, -1.4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_std(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").std(ddof=0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_std`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_std(df_pd) a b 0 17.79513 1.265789 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_std(df_pl) shape: (1, 2) ┌──────────┬──────────┐ │ a ┆ b │ @@ -695,7 +744,7 @@ def std(self, *, ddof: int = 1) -> Self: ╞══════════╪══════════╡ │ 17.79513 ┆ 1.265789 │ └──────────┴──────────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_std(df_pa) pyarrow.Table a: double b: double @@ -722,9 +771,11 @@ def var(self, *, ddof: int = 1) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pl = pl.DataFrame({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) - >>> df_pa = pa.table({"a": [20, 25, 60], "b": [1.5, 1, -1.4]}) + >>> + >>> data = {"a": [20, 25, 60], "b": [1.5, 1, -1.4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -732,11 +783,13 @@ def var(self, *, ddof: int = 1) -> Self: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").var(ddof=0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_var`: >>> agnostic_var(df_pd) a b 0 316.666667 1.602222 + >>> agnostic_var(df_pl) shape: (1, 2) ┌────────────┬──────────┐ @@ -746,6 +799,7 @@ def var(self, *, ddof: int = 1) -> Self: ╞════════════╪══════════╡ │ 316.666667 ┆ 1.602222 │ └────────────┴──────────┘ + >>> agnostic_var(df_pa) pyarrow.Table a: double @@ -782,6 +836,7 @@ def map_batches( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -789,7 +844,7 @@ def map_batches( Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_map_batches(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a", "b").map_batches( @@ -797,14 +852,15 @@ def map_batches( ... ) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_map_batches`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_map_batches(df_pd) a b 0 2.0 5.0 1 3.0 6.0 2 4.0 7.0 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_map_batches(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -815,7 +871,7 @@ def map_batches( │ 3.0 ┆ 6.0 │ │ 4.0 ┆ 7.0 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_map_batches(df_pa) pyarrow.Table a: double b: double @@ -840,22 +896,27 @@ def skew(self: Self) -> Self: >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw - >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]}) - >>> df_pa = pa.Table.from_pandas(df_pd) + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3, 4, 5], "b": [1, 1, 2, 10, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> @nw.narwhalify - ... def func(df): - ... return df.select(nw.col("a", "b").skew()) + >>> def agnostic_skew(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... 
return df.select(nw.col("a", "b").skew()).to_native() - We can then pass pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_skew`: - >>> func(df_pd) + >>> agnostic_skew(df_pd) a b 0 0.0 1.472427 - >>> func(df_pl) + + >>> agnostic_skew(df_pl) shape: (1, 2) ┌─────┬──────────┐ │ a ┆ b │ @@ -864,7 +925,8 @@ def skew(self: Self) -> Self: ╞═════╪══════════╡ │ 0.0 ┆ 1.472427 │ └─────┴──────────┘ - >>> func(df_pa) + + >>> agnostic_skew(df_pa) pyarrow.Table a: double b: double @@ -886,22 +948,25 @@ def sum(self) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [5, 10], "b": [50, 100]}) - >>> df_pl = pl.DataFrame({"a": [5, 10], "b": [50, 100]}) - >>> df_pa = pa.table({"a": [5, 10], "b": [50, 100]}) + >>> + >>> data = {"a": [5, 10], "b": [50, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").sum()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum(df_pd) a b 0 15 150 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_sum(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -910,7 +975,7 @@ def sum(self) -> Expr: ╞═════╪═════╡ │ 15 ┆ 150 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_sum(df_pa) pyarrow.Table a: int64 b: int64 @@ -932,22 +997,26 @@ def min(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [4, 3]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [4, 3]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [4, 3]}) + >>> + >>> data = {"a": [1, 2], "b": [4, 3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.min("a", "b")).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_min`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_min(df_pd) a b 0 1 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_min(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -956,7 +1025,8 @@ def min(self) -> Self: ╞═════╪═════╡ │ 1 ┆ 3 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_min(df_pa) pyarrow.Table a: int64 b: int64 @@ -978,22 +1048,26 @@ def max(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [10, 20], "b": [50, 100]}) - >>> df_pl = pl.DataFrame({"a": [10, 20], "b": [50, 100]}) - >>> df_pa = pa.table({"a": [10, 20], "b": [50, 100]}) + >>> + >>> data = {"a": [10, 20], "b": [50, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.max("a", "b")).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_max`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_max(df_pd) a b 0 20 100 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_max(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1002,7 +1076,8 @@ def max(self) -> Self: ╞═════╪═════╡ │ 20 ┆ 100 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_max(df_pa) pyarrow.Table a: int64 b: int64 @@ -1024,9 +1099,11 @@ def arg_min(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pl = pl.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pa = pa.table({"a": [10, 20], "b": [150, 100]}) + >>> + >>> data = {"a": [10, 20], "b": [150, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1036,11 +1113,13 @@ def arg_min(self) -> Self: ... nw.col("a", "b").arg_min().name.suffix("_arg_min") ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_min`: >>> agnostic_arg_min(df_pd) a_arg_min b_arg_min 0 0 1 + >>> agnostic_arg_min(df_pl) shape: (1, 2) ┌───────────┬───────────┐ │ a_arg_min ┆ b_arg_min │ │ --- ┆ --- │ │ u32 ┆ u32 │ ╞═══════════╪═══════════╡ │ 0 ┆ 1 │ └───────────┴───────────┘ @@ -1050,6 +1129,7 @@ def arg_min(self) -> Self: + >>> agnostic_arg_min(df_pa) pyarrow.Table a_arg_min: int64 @@ -1072,9 +1152,11 @@ def arg_max(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pl = pl.DataFrame({"a": [10, 20], "b": [150, 100]}) - >>> df_pa = pa.table({"a": [10, 20], "b": [150, 100]}) + >>> + >>> data = {"a": [10, 20], "b": [150, 100]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1084,11 +1166,13 @@ def arg_max(self) -> Self: ... nw.col("a", "b").arg_max().name.suffix("_arg_max") ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_max`: >>> agnostic_arg_max(df_pd) a_arg_max b_arg_max 0 1 0 + >>> agnostic_arg_max(df_pl) shape: (1, 2) ┌───────────┬───────────┐ │ a_arg_max ┆ b_arg_max │ │ --- ┆ --- │ │ u32 ┆ u32 │ ╞═══════════╪═══════════╡ │ 1 ┆ 0 │ └───────────┴───────────┘ @@ -1098,6 +1182,7 @@ def arg_max(self) -> Self: + >>> agnostic_arg_max(df_pa) pyarrow.Table a_arg_max: int64 @@ -1120,22 +1205,26 @@ def count(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [None, 4, 4]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [None, 4, 4]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [None, 4, 4]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_count(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
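`arg_min` and `arg_max` compose in a single `select`, reusing the `name.suffix` pattern from the docstrings above; a minimal sketch with a pandas backend:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": [10, 20], "b": [150, 100]}))
    out = df.select(
        nw.col("a", "b").arg_min().name.suffix("_arg_min"),
        nw.col("a", "b").arg_max().name.suffix("_arg_max"),
    ).to_native()
    # row positions: a_arg_min=0, b_arg_min=1, a_arg_max=1, b_arg_max=0
    print(out)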
return df.select(nw.all().count()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_count`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_count(df_pd) a b 0 3 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_count(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1144,7 +1233,8 @@ def count(self) -> Self: ╞═════╪═════╡ │ 3 ┆ 2 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_count(df_pa) pyarrow.Table a: int64 b: int64 @@ -1166,22 +1256,25 @@ def n_unique(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]}) - >>> df_pa = pa.table({"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]}) + >>> + >>> data = {"a": [1, 2, 3, 4, 5], "b": [1, 1, 3, 3, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_n_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").n_unique()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_n_unique`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_n_unique(df_pd) a b 0 5 3 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_n_unique(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1190,7 +1283,7 @@ def n_unique(self) -> Self: ╞═════╪═════╡ │ 5 ┆ 3 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_n_unique(df_pa) pyarrow.Table a: int64 b: int64 @@ -1217,24 +1310,28 @@ def unique(self, *, maintain_order: bool = False) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").unique(maintain_order=True)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_unique`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_unique(df_pd) a b 0 1 2 1 3 4 2 5 6 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_unique(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1245,7 +1342,8 @@ def unique(self, *, maintain_order: bool = False) -> Self: │ 3 ┆ 4 │ │ 5 ┆ 6 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_unique(df_pa) pyarrow.Table a: int64 b: int64 @@ -1269,6 +1367,7 @@ def abs(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, -2], "b": [-3, 4]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -1276,17 +1375,19 @@ def abs(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_abs(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a", "b").abs()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_abs`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_abs(df_pd) a b 0 1 3 1 2 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_abs(df_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1296,7 +1397,8 @@ def abs(self) -> Self: │ 1 ┆ 3 │ │ 2 ┆ 4 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_abs(df_pa) pyarrow.Table a: int64 b: int64 @@ -1321,26 +1423,29 @@ def cum_sum(self: Self, *, reverse: bool = False) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5], "b": [2, 4, 4, 6, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a", "b").cum_sum()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_cum_sum(df_pd) a b 0 1 2 1 2 6 2 5 10 3 10 16 4 15 22 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_cum_sum(df_pl) shape: (5, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1353,7 +1458,7 @@ def cum_sum(self: Self, *, reverse: bool = False) -> Self: │ 10 ┆ 16 │ │ 15 ┆ 22 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_cum_sum(df_pa) pyarrow.Table a: int64 b: int64 @@ -1381,31 +1486,35 @@ def diff(self) -> Self: nw.col("a").diff().fill_null(0).cast(nw.Int64) Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_diff(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(a_diff=nw.col("a").diff()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_diff`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_diff(df_pd) a_diff 0 NaN 1 0.0 2 2.0 3 2.0 4 0.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_diff(df_pl) shape: (5, 1) ┌────────┐ │ a_diff │ @@ -1418,7 +1527,8 @@ def diff(self) -> Self: │ 2 │ │ 0 │ └────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_diff(df_pa) pyarrow.Table a_diff: int64 ---- @@ -1445,31 +1555,35 @@ def shift(self, n: int) -> Self: nw.col("a").shift(1).fill_null(0).cast(nw.Int64) Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 1, 3, 5, 5]}) - >>> df_pa = pa.table({"a": [1, 1, 3, 5, 5]}) + >>> + >>> data = {"a": [1, 1, 3, 5, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_shift(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
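The note in the `diff` docstring about keeping integer dtypes can be exercised directly; a sketch with a pandas backend and classic dtypes:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": [1, 1, 3, 5, 5]}))
    # without fill_null + cast, the leading null would force an upcast to float
    out = df.select(
        a_diff=nw.col("a").diff().fill_null(0).cast(nw.Int64)
    ).to_native()
    print(out)  # a_diff: 0, 0, 2, 2, 0, with integer dtype preserved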
return df.select(a_shift=nw.col("a").shift(n=1)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_shift`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_shift(df_pd) a_shift 0 NaN 1 1.0 2 1.0 3 3.0 4 5.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_shift(df_pl) shape: (5, 1) ┌─────────┐ │ a_shift │ @@ -1482,7 +1596,8 @@ def shift(self, n: int) -> Self: │ 3 │ │ 5 │ └─────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_shift(df_pa) pyarrow.Table a_shift: int64 ---- @@ -1514,18 +1629,20 @@ def replace_strict( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [3, 0, 1, 2]}) - >>> df_pl = pl.DataFrame({"a": [3, 0, 1, 2]}) - >>> df_pa = pa.table({"a": [3, 0, 1, 2]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [3, 0, 1, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define dataframe-agnostic functions: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_replace_strict(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... b=nw.col("a").replace_strict( @@ -1535,15 +1652,17 @@ def replace_strict( ... ) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_replace_strict`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_replace_strict(df_pd) a b 0 3 three 1 0 zero 2 1 one 3 2 two - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_replace_strict(df_pl) shape: (4, 2) ┌─────┬───────┐ │ a ┆ b │ @@ -1555,7 +1674,8 @@ def replace_strict( │ 1 ┆ one │ │ 2 ┆ two │ └─────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_replace_strict(df_pa) pyarrow.Table a: int64 b: string @@ -1588,35 +1708,38 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [5, None, 1, 2]}) - >>> df_pl = pl.DataFrame({"a": [5, None, 1, 2]}) - >>> df_pa = pa.table({"a": [5, None, 1, 2]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [5, None, 1, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define dataframe-agnostic functions: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sort(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").sort()).to_native() - >>> def func_descend(df): - ... df = nw.from_native(df) - ... df = df.select(nw.col("a").sort(descending=True)) - ... return nw.to_native(df) + >>> def agnostic_sort_descending(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... 
return df.select(nw.col("a").sort(descending=True)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sort` and `agnostic_sort_descending`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sort(df_pd) a 1 NaN 2 1.0 3 2.0 0 5.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sort(df_pl) shape: (4, 1) ┌──────┐ │ a │ @@ -1628,19 +1751,21 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: │ 2 │ │ 5 │ └──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sort(df_pa) pyarrow.Table a: int64 ---- a: [[null,1,2,5]] - >>> func_descend(df_pd) + >>> agnostic_sort_descending(df_pd) a 1 NaN 0 5.0 3 2.0 2 1.0 - >>> func_descend(df_pl) + + >>> agnostic_sort_descending(df_pl) shape: (4, 1) ┌──────┐ │ a │ @@ -1652,7 +1777,8 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: │ 2 │ │ 1 │ └──────┘ - >>> func_descend(df_pa) + + >>> agnostic_sort_descending(df_pa) pyarrow.Table a: int64 ---- @@ -1666,10 +1792,10 @@ def sort(self, *, descending: bool = False, nulls_last: bool = False) -> Self: # --- transform --- def is_between( - self, + self: Self, lower_bound: Any | IntoExpr, upper_bound: Any | IntoExpr, - closed: str = "both", + closed: Literal["left", "right", "none", "both"] = "both", ) -> Self: """Check if this expression is between the given lower and upper bounds. @@ -1687,26 +1813,30 @@ def is_between( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3, 4, 5]}) - >>> df_pa = pa.table({"a": [1, 2, 3, 4, 5]}) + >>> + >>> data = {"a": [1, 2, 3, 4, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_between(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").is_between(2, 4, "right")).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_between`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_between(df_pd) a 0 False 1 False 2 True 3 True 4 False - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_between(df_pl) shape: (5, 1) ┌───────┐ │ a │ @@ -1719,7 +1849,8 @@ def is_between( │ true │ │ false │ └───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_between(df_pa) pyarrow.Table a: bool ---- @@ -1748,26 +1879,29 @@ def is_in(self, other: Any) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 9, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 9, 10]}) - >>> df_pa = pa.table({"a": [1, 2, 9, 10]}) + >>> + >>> data = {"a": [1, 2, 9, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_in(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
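The new `Literal` annotation on `closed` spells out the four interval variants; a sketch comparing them side by side on a pandas backend:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": [1, 2, 3, 4, 5]}))
    out = df.select(
        both=nw.col("a").is_between(2, 4, "both"),
        left=nw.col("a").is_between(2, 4, "left"),
        right=nw.col("a").is_between(2, 4, "right"),
        none=nw.col("a").is_between(2, 4, "none"),
    ).to_native()
    # the value 2 is True under "both"/"left" but False under "right"/"none"
    print(out)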
return df.with_columns(b=nw.col("a").is_in([1, 2])).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_in`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_in(df_pd) a b 0 1 True 1 2 True 2 9 False 3 10 False - >>> my_library_agnostic_function(df_pl) + >>> agnostic_is_in(df_pl) shape: (4, 2) ┌─────┬───────┐ │ a ┆ b │ @@ -1779,7 +1913,8 @@ def is_in(self, other: Any) -> Self: │ 9 ┆ false │ │ 10 ┆ false │ └─────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_in(df_pa) pyarrow.Table a: int64 b: bool @@ -1788,8 +1923,11 @@ def is_in(self, other: Any) -> Self: b: [[true,true,false,false]] """ if isinstance(other, Iterable) and not isinstance(other, (str, bytes)): - other = extract_compliant(self, other) - return self.__class__(lambda plx: self._to_compliant_expr(plx).is_in(other)) + return self.__class__( + lambda plx: self._to_compliant_expr(plx).is_in( + extract_compliant(plx, other) + ) + ) else: msg = "Narwhals `is_in` doesn't accept expressions as an argument, as opposed to Polars. You should provide an iterable instead." raise NotImplementedError(msg) @@ -1804,32 +1942,36 @@ def filter(self, *predicates: Any) -> Self: A new expression. Examples: - >>> import polars as pl >>> import pandas as pd + >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]}) - >>> df_pl = pl.DataFrame({"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]}) - >>> df_pa = pa.table({"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]}) + >>> + >>> data = {"a": [2, 3, 4, 5, 6, 7], "b": [10, 11, 12, 13, 14, 15]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").filter(nw.col("a") > 4), ... nw.col("b").filter(nw.col("b") < 13), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_filter`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_filter(df_pd) a b 3 5 10 4 6 11 5 7 12 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_filter(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -1840,7 +1982,8 @@ def filter(self, *predicates: Any) -> Self: │ 6 ┆ 11 │ │ 7 ┆ 12 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_filter(df_pa) pyarrow.Table a: int64 b: int64 @@ -1871,13 +2014,19 @@ def is_null(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> df_pd = pd.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, float("nan"), 3.0, 5.0]} - ... ) - >>> df_pl = pl.DataFrame( - ... {"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, None, 3.0, 5.0]} + ... { + ... "a": [2, 4, None, 3, 5], + ... "b": [2.0, 4.0, float("nan"), 3.0, 5.0], + ... } ... ) - >>> df_pa = pa.table({"a": [2, 4, None, 3, 5], "b": [2.0, 4.0, None, 3.0, 5.0]}) + >>> data = { + ... "a": [2, 4, None, 3, 5], + ... "b": [2.0, 4.0, None, 3.0, 5.0], + ... 
} + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -1887,7 +2036,8 @@ def is_null(self) -> Self: ... a_is_null=nw.col("a").is_null(), b_is_null=nw.col("b").is_null() ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_is_null`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_null`: >>> agnostic_is_null(df_pd) a b a_is_null b_is_null @@ -1942,6 +2092,7 @@ def is_nan(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"orig": [0.0, None, 2.0]} >>> df_pd = pd.DataFrame(data).astype({"orig": "Float64"}) >>> df_pl = pl.DataFrame(data) @@ -1956,7 +2107,8 @@ def is_nan(self) -> Self: ... divided_is_nan=(nw.col("orig") / nw.col("orig")).is_nan(), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_self_div_is_nan`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_self_div_is_nan`: >>> print(agnostic_self_div_is_nan(df_pd)) orig divided divided_is_nan @@ -1985,7 +2137,6 @@ def is_nan(self) -> Self: orig: [[0,null,2]] divided: [[nan,null,1]] divided_is_nan: [[true,null,false]] - """ return self.__class__(lambda plx: self._to_compliant_expr(plx).is_nan()) @@ -2001,6 +2152,7 @@ def arg_true(self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, None, None, 2]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2008,17 +2160,19 @@ def arg_true(self) -> Self: We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_arg_true(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").is_null().arg_true()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_arg_true`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_arg_true(df_pd) a 1 1 2 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_arg_true(df_pl) shape: (2, 1) ┌─────┐ │ a │ │ --- │ │ u32 │ ╞═════╡ │ 1 │ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_arg_true(df_pa) pyarrow.Table a: int64 ---- @@ -2063,24 +2218,19 @@ def fill_null( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> df_pd = pd.DataFrame( ... { ... "a": [2, 4, None, None, 3, 5], ... "b": [2.0, 4.0, float("nan"), float("nan"), 3.0, 5.0], ... } ... ) - >>> df_pl = pl.DataFrame( - ... { - ... "a": [2, 4, None, None, 3, 5], - ... "b": [2.0, 4.0, None, None, 3.0, 5.0], - ... } - ... ) - >>> df_pa = pa.table( - ... { - ... "a": [2, 4, None, None, 3, 5], - ... "b": [2.0, 4.0, None, None, 3.0, 5.0], - ... } - ... ) + >>> data = { + ... "a": [2, 4, None, None, 3, 5], + ... "b": [2.0, 4.0, None, None, 3.0, 5.0], + ... } + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: @@ -2088,7 +2238,8 @@ def fill_null( ... df = nw.from_native(df_native) ... 
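`is_null` and `is_nan` differ exactly where the docstrings warn: backends without a true null value, such as pandas with classic float dtypes, conflate NaN and missing. A sketch of the distinction:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": [1.0, float("nan"), 3.0]}))
    out = df.select(
        a_is_null=nw.col("a").is_null(),
        a_is_nan=nw.col("a").is_nan(),
    ).to_native()
    # with classic float64 pandas dtypes both columns flag row 1;
    # Polars and pyarrow-backed dtypes keep null and NaN distinct
    print(out)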
return df.with_columns(nw.col("a", "b").fill_null(0)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_fill_null`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_fill_null`: >>> agnostic_fill_null(df_pd) a b @@ -2196,12 +2347,12 @@ def drop_nulls(self) -> Self: for reference. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> df_pd = pd.DataFrame({"a": [2.0, 4.0, float("nan"), 3.0, None, 5.0]}) >>> df_pl = pl.DataFrame({"a": [2.0, 4.0, None, 3.0, None, 5.0]}) >>> df_pa = pa.table({"a": [2.0, 4.0, None, 3.0, None, 5.0]}) @@ -2212,7 +2363,8 @@ def drop_nulls(self) -> Self: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").drop_nulls()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_drop_nulls`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_drop_nulls`: >>> agnostic_drop_nulls(df_pd) a 0 2.0 1 4.0 3 3.0 5 5.0 + >>> agnostic_drop_nulls(df_pl) shape: (4, 1) ┌─────┐ │ a │ │ --- │ │ f64 │ ╞═════╡ │ 2.0 │ │ 4.0 │ │ 3.0 │ │ 5.0 │ └─────┘ + >>> agnostic_drop_nulls(df_pa) pyarrow.Table a: double @@ -2261,31 +2415,35 @@ def sample( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> df_pd = pd.DataFrame({"a": [1, 2, 3]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3]}) - >>> df_pa = pa.table({"a": [1, 2, 3]}) + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sample(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").sample(fraction=1.0, with_replacement=True) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_sample`: - >>> my_library_agnostic_function(df_pd) # doctest: +SKIP + >>> agnostic_sample(df_pd) # doctest: +SKIP a 2 3 0 1 2 3 - >>> my_library_agnostic_function(df_pl) # doctest: +SKIP + + >>> agnostic_sample(df_pl) # doctest: +SKIP shape: (3, 1) ┌─────┐ │ a │ │ --- │ │ i64 │ ╞═════╡ │ 2 │ │ 3 │ │ 3 │ └─────┘ @@ -2296,7 +2454,8 @@ def sample( - >>> my_library_agnostic_function(df_pa) # doctest: +SKIP + + >>> agnostic_sample(df_pa) # doctest: +SKIP pyarrow.Table a: int64 ---- a: [[1,3,3]] @@ -2320,11 +2479,12 @@ def over(self, *keys: str | Iterable[str]) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [1, 1, 2]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2338,13 +2498,15 @@ def over(self, *keys: str | Iterable[str]) -> Self: ... a_min_per_group=nw.col("a").min().over("b") ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_min_over_b`: >>> agnostic_min_over_b(df_pd) a b a_min_per_group 0 1 1 1 1 2 1 1 2 3 2 3 + >>> agnostic_min_over_b(df_pl) shape: (3, 3) ┌─────┬─────┬─────────────────┐ @@ -2356,6 +2518,7 @@ def over(self, *keys: str | Iterable[str]) -> Self: │ 2 ┆ 1 ┆ 1 │ │ 3 ┆ 2 ┆ 3 │ └─────┴─────┴─────────────────┘ + >>> agnostic_min_over_b(df_pa) pyarrow.Table a: int64 @@ -2378,6 +2541,7 @@ def over(self, *keys: str | Iterable[str]) -> Self: 0 1 1 1 1 2 1 3 2 3 2 3 + >>> agnostic_cum_sum(df_pl) shape: (3, 3) ┌─────┬─────┬─────┐ @@ -2401,11 +2565,12 @@ def is_duplicated(self) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2413,19 +2578,21 @@ def is_duplicated(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_duplicated(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all().is_duplicated()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_duplicated`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_duplicated(df_pd) a b 0 True True 1 False True 2 False False 3 True False - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_duplicated(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ @@ -2437,7 +2604,8 @@ def is_duplicated(self) -> Self: │ false ┆ false │ │ true ┆ false │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_duplicated(df_pa) pyarrow.Table a: bool b: bool @@ -2454,11 +2622,12 @@ def is_unique(self) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2466,19 +2635,21 @@ def is_unique(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
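`over` broadcasts an aggregation back across its group, so it pairs naturally with `with_columns`; a sketch extending the `agnostic_min_over_b` example on a pandas backend:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": [1, 2, 3], "b": [1, 1, 2]}))
    out = df.with_columns(
        a_min_per_group=nw.col("a").min().over("b"),
        a_max_per_group=nw.col("a").max().over("b"),
    ).to_native()
    print(out)  # group b=1 gets min 1 / max 2; group b=2 gets 3 / 3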
return df.select(nw.all().is_unique()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_unique`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_unique(df_pd) a b 0 False False 1 True False 2 True True 3 False True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_unique(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ │ --- ┆ --- │ │ bool ┆ bool │ ╞═══════╪═══════╡ │ false ┆ false │ │ true ┆ false │ │ true ┆ true │ │ false ┆ true │ └───────┴───────┘ @@ -2490,7 +2661,8 @@ def is_unique(self) -> Self: - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_unique(df_pa) pyarrow.Table a: bool b: bool @@ -2512,11 +2684,12 @@ def null_count(self) -> Self: for reference. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, None, 1], "b": ["a", None, "b", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2528,11 +2701,13 @@ def null_count(self) -> Self: ... df = nw.from_native(df_native) ... return df.select(nw.all().null_count()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_null_count`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_null_count`: >>> agnostic_null_count(df_pd) a b 0 1 2 + >>> agnostic_null_count(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ u32 ┆ u32 │ ╞═════╪═════╡ │ 1 ┆ 2 │ └─────┴─────┘ + >>> agnostic_null_count(df_pa) pyarrow.Table a: int64 @@ -2559,11 +2735,12 @@ def is_first_distinct(self) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2571,19 +2748,21 @@ def is_first_distinct(self) -> Self: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_first_distinct(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all().is_first_distinct()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_first_distinct`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_first_distinct(df_pd) a b 0 True True 1 True False 2 True True 3 False True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_first_distinct(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ │ --- ┆ --- │ │ bool ┆ bool │ ╞═══════╪═══════╡ │ true ┆ true │ │ true ┆ false │ │ true ┆ true │ │ false ┆ true │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_first_distinct(df_pa) pyarrow.Table a: bool b: bool @@ -2614,31 +2794,34 @@ def is_last_distinct(self) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [1, 2, 3, 1], "b": ["a", "a", "b", "c"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_last_distinct(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all().is_last_distinct()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_last_distinct`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_is_last_distinct(df_pd) a b 0 False False 1 True True 2 True True 3 True True - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_is_last_distinct(df_pl) shape: (4, 2) ┌───────┬───────┐ │ a ┆ b │ @@ -2650,7 +2833,8 @@ def is_last_distinct(self) -> Self: │ true ┆ true │ │ true ┆ true │ └───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_is_last_distinct(df_pa) pyarrow.Table a: bool b: bool @@ -2682,11 +2866,12 @@ def quantile( native 'dask' - method. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": list(range(50)), "b": list(range(50, 100))} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2694,19 +2879,20 @@ def quantile( Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_quantile(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a", "b").quantile(0.5, interpolation="linear") ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_quantile`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_quantile(df_pd) a b 0 24.5 74.5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_quantile(df_pl) shape: (1, 2) ┌──────┬──────┐ │ a ┆ b │ @@ -2715,7 +2901,8 @@ def quantile( ╞══════╪══════╡ │ 24.5 ┆ 74.5 │ └──────┴──────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_quantile(df_pa) pyarrow.Table a: double b: double @@ -2737,11 +2924,12 @@ def head(self, n: int = 10) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": list(range(10))} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2749,18 +2937,20 @@ def head(self, n: int = 10) -> Self: Let's define a dataframe-agnostic function that returns the first 3 rows: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_head(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
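With `interpolation="linear"`, `quantile(0.5, ...)` should agree with `median`; a quick cross-check on a pandas backend:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": list(range(50))}))
    out = df.select(
        q=nw.col("a").quantile(0.5, interpolation="linear"),
        m=nw.col("a").median(),
    ).to_native()
    print(out)  # both columns should read 24.5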
return df.select(nw.col("a").head(3)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_head`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_head(df_pd) a 0 0 1 1 2 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_head(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2771,7 +2961,8 @@ def head(self, n: int = 10) -> Self: │ 1 │ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_head(df_pa) pyarrow.Table a: int64 ---- @@ -2789,11 +2980,12 @@ def tail(self, n: int = 10) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": list(range(10))} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2801,18 +2993,20 @@ def tail(self, n: int = 10) -> Self: Let's define a dataframe-agnostic function that returns the last 3 rows: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_tail(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").tail(3)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_tail`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_tail(df_pd) a 7 7 8 8 9 9 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_tail(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2823,7 +3017,8 @@ def tail(self, n: int = 10) -> Self: │ 8 │ │ 9 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_tail(df_pa) pyarrow.Table a: int64 ---- @@ -2850,11 +3045,12 @@ def round(self, decimals: int = 0) -> Self: Polars and Arrow round away from 0 (e.g. -0.5 to -1.0, 0.5 to 1.0, 1.5 to 2.0, 2.5 to 3.0, etc..). Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.12345, 2.56789, 3.901234]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2862,18 +3058,20 @@ def round(self, decimals: int = 0) -> Self: Let's define a dataframe-agnostic function that rounds to the first decimal: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_round(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").round(1)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_round`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_round(df_pd) a 0 1.1 1 2.6 2 3.9 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_round(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2884,7 +3082,8 @@ def round(self, decimals: int = 0) -> Self: │ 2.6 │ │ 3.9 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_round(df_pa) pyarrow.Table a: double ---- @@ -2901,31 +3100,35 @@ def len(self) -> Self: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": ["x", "y", "z"], "b": [1, 2, 1]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) >>> df_pa = pa.table(data) - Let's define a dataframe-agnostic function that computes the len over different values of "b" column: + Let's define a dataframe-agnostic function that computes the len over + different values of "b" column: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").filter(nw.col("b") == 1).len().alias("a1"), ... nw.col("a").filter(nw.col("b") == 2).len().alias("a2"), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_len`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_len(df_pd) a1 a2 0 2 1 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_len(df_pl) shape: (1, 2) ┌─────┬─────┐ │ a1 ┆ a2 │ @@ -2934,7 +3137,8 @@ def len(self) -> Self: ╞═════╪═════╡ │ 2 ┆ 1 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_len(df_pa) pyarrow.Table a1: int64 a2: int64 @@ -2955,11 +3159,12 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -2968,17 +3173,19 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: Let's define a dataframe-agnostic function in which gather every 2 rows, starting from a offset of 1: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_gather_every(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").gather_every(n=2, offset=1)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_gather_every`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_gather_every(df_pd) a 1 2 3 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_gather_every(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -2988,7 +3195,8 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: │ 2 │ │ 4 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_gather_every(df_pa) pyarrow.Table a: int64 ---- @@ -3020,29 +3228,31 @@ def clip( >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - - >>> s = [1, 2, 3] - >>> df_pd = pd.DataFrame({"s": s}) - >>> df_pl = pl.DataFrame({"s": s}) - >>> df_pa = pa.table({"s": s}) + >>> + >>> data = {"a": [1, 2, 3]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def func_lower(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_clip_lower(df_native: IntoFrameT) -> IntoFrameT: ... 
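`gather_every` with complementary offsets splits a column into its even- and odd-positioned rows, which makes the `n`/`offset` interplay easy to see; a sketch on a pandas backend:

    import pandas as pd
    import narwhals as nw

    df = nw.from_native(pd.DataFrame({"a": [1, 2, 3, 4]}))
    out = df.select(
        nw.col("a").gather_every(n=2, offset=0).alias("even_rows"),
        nw.col("a").gather_every(n=2, offset=1).alias("odd_rows"),
    ).to_native()
    print(out)  # even_rows: 1, 3; odd_rows: 2, 4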
df = nw.from_native(df_native) - ... return df.select(nw.col("s").clip(2)).to_native() + ... return df.select(nw.col("a").clip(2)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func_lower`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_clip_lower`: - >>> func_lower(df_pd) - s + >>> agnostic_clip_lower(df_pd) + a 0 2 1 2 2 3 - >>> func_lower(df_pl) + + >>> agnostic_clip_lower(df_pl) shape: (3, 1) ┌─────┐ - │ s │ + │ a │ │ --- │ │ i64 │ ╞═════╡ @@ -3050,29 +3260,32 @@ def clip( │ 2 │ │ 2 │ │ 3 │ └─────┘ - >>> func_lower(df_pa) + + >>> agnostic_clip_lower(df_pa) pyarrow.Table - s: int64 + a: int64 ---- - s: [[2,2,3]] + a: [[2,2,3]] We define another library agnostic function: - >>> def func_upper(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_clip_upper(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(nw.col("s").clip(upper_bound=2)).to_native() + ... return df.select(nw.col("a").clip(upper_bound=2)).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func_upper`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_clip_upper`: - >>> func_upper(df_pd) - s + >>> agnostic_clip_upper(df_pd) + a 0 1 1 2 2 2 - >>> func_upper(df_pl) + + >>> agnostic_clip_upper(df_pl) shape: (3, 1) ┌─────┐ - │ s │ + │ a │ │ --- │ │ i64 │ ╞═════╡ @@ -3080,39 +3293,42 @@ def clip( │ 1 │ │ 2 │ │ 2 │ └─────┘ - >>> func_upper(df_pa) + + >>> agnostic_clip_upper(df_pa) pyarrow.Table - s: int64 + a: int64 ---- - s: [[1,2,2]] + a: [[1,2,2]] We can have both at the same time - >>> s = [-1, 1, -3, 3, -5, 5] - >>> df_pd = pd.DataFrame({"s": s}) - >>> df_pl = pl.DataFrame({"s": s}) - >>> df_pa = pa.table({"s": s}) + >>> data = {"a": [-1, 1, -3, 3, -5, 5]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_clip(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(nw.col("s").clip(-1, 3)).to_native() + ... return df.select(nw.col("a").clip(-1, 3)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_clip`: - >>> my_library_agnostic_function(df_pd) - s + >>> agnostic_clip(df_pd) + a 0 -1 1 1 2 -1 3 3 4 -1 5 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_clip(df_pl) shape: (6, 1) ┌─────┐ - │ s │ + │ a │ │ --- │ │ i64 │ ╞═════╡ @@ -3123,11 +3339,12 @@ def clip( │ -1 │ │ 1 │ │ -1 │ │ 3 │ │ -1 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_clip(df_pa) pyarrow.Table - s: int64 + a: int64 ---- - s: [[-1,1,-1,3,-1,3]] + a: [[-1,1,-1,3,-1,3]] """ return self.__class__( lambda plx: self._to_compliant_expr(plx).clip( @@ -3150,7 +3367,7 @@ def mode(self: Self) -> Self: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - + >>> >>> data = { ... "a": [1, 1, 2, 3], ... "b": [1, 1, 2, 2], ... } >>> df_pd = pd.DataFrame(data) @@ -3161,17 +3378,18 @@ def mode(self: Self) -> Self: We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mode(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a").mode()).sort("a").to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_mode`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mode(df_pd) a 0 1 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_mode(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -3181,7 +3399,7 @@ def mode(self: Self) -> Self: │ 1 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_mode(df_pa) pyarrow.Table a: int64 ---- @@ -3201,28 +3419,34 @@ def is_finite(self: Self) -> Self: Expression of `Boolean` data type. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [float("nan"), float("inf"), 2.0, None]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_is_finite(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").is_finite()).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_is_finite`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_is_finite(df_pd) a 0 False 1 False 2 True 3 False - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_is_finite(df_pl) shape: (4, 1) ┌───────┐ │ a │ @@ -3235,7 +3459,7 @@ def is_finite(self: Self) -> Self: │ null │ └───────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_is_finite(df_pa) pyarrow.Table a: bool ---- @@ -3253,32 +3477,37 @@ def cum_count(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": ["x", "k", None, "d"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_count(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_count().alias("cum_count"), ... nw.col("a").cum_count(reverse=True).alias("cum_count_reverse"), ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_count`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_count(df_pd) a cum_count cum_count_reverse 0 x 1 3 1 k 2 2 2 None 2 1 3 d 3 1 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_count(df_pl) shape: (4, 3) ┌──────┬───────────┬───────────────────┐ │ a ┆ cum_count ┆ cum_count_reverse │ @@ -3291,7 +3520,7 @@ def cum_count(self: Self, *, reverse: bool = False) -> Self: │ d ┆ 3 ┆ 1 │ └──────┴───────────┴───────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_count(df_pa) pyarrow.Table a: string cum_count: uint32 @@ -3315,32 +3544,37 @@ def cum_min(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [3, 1, None, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_min().alias("cum_min"), ... nw.col("a").cum_min(reverse=True).alias("cum_min_reverse"), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_min`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_min(df_pd) a cum_min cum_min_reverse 0 3.0 3.0 1.0 1 1.0 1.0 1.0 2 NaN NaN NaN 3 2.0 1.0 2.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_min(df_pl) shape: (4, 3) ┌──────┬─────────┬─────────────────┐ │ a ┆ cum_min ┆ cum_min_reverse │ @@ -3353,7 +3587,7 @@ def cum_min(self: Self, *, reverse: bool = False) -> Self: │ 2 ┆ 1 ┆ 2 │ └──────┴─────────┴─────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_min(df_pa) pyarrow.Table a: int64 cum_min: int64 @@ -3377,32 +3611,37 @@ def cum_max(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 3, None, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_max().alias("cum_max"), ... nw.col("a").cum_max(reverse=True).alias("cum_max_reverse"), ... 
).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_max`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_max(df_pd) a cum_max cum_max_reverse 0 1.0 1.0 3.0 1 3.0 3.0 3.0 2 NaN NaN NaN 3 2.0 3.0 2.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_max(df_pl) shape: (4, 3) ┌──────┬─────────┬─────────────────┐ │ a ┆ cum_max ┆ cum_max_reverse │ @@ -3415,7 +3654,7 @@ def cum_max(self: Self, *, reverse: bool = False) -> Self: │ 2 ┆ 3 ┆ 2 │ └──────┴─────────┴─────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_max(df_pa) pyarrow.Table a: int64 cum_max: int64 @@ -3439,32 +3678,37 @@ def cum_prod(self: Self, *, reverse: bool = False) -> Self: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 3, None, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cum_prod(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("a").cum_prod().alias("cum_prod"), ... nw.col("a").cum_prod(reverse=True).alias("cum_prod_reverse"), ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_cum_prod`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_cum_prod(df_pd) a cum_prod cum_prod_reverse 0 1.0 1.0 6.0 1 3.0 3.0 6.0 2 NaN NaN NaN 3 2.0 6.0 2.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_cum_prod(df_pl) shape: (4, 3) ┌──────┬──────────┬──────────────────┐ │ a ┆ cum_prod ┆ cum_prod_reverse │ @@ -3477,7 +3721,7 @@ def cum_prod(self: Self, *, reverse: bool = False) -> Self: │ 2 ┆ 6 ┆ 2 │ └──────┴──────────┴──────────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_cum_prod(df_pa) pyarrow.Table a: int64 cum_prod: int64 @@ -3523,11 +3767,12 @@ def rolling_sum( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3541,7 +3786,8 @@ def rolling_sum( ... b=nw.col("a").rolling_sum(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_sum`: >>> agnostic_rolling_sum(df_pd) a b @@ -3563,7 +3809,7 @@ def rolling_sum( │ 4.0 ┆ 6.0 │ └──────┴─────┘ - >>> agnostic_rolling_sum(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_sum(df_pa) pyarrow.Table a: double b: double @@ -3615,11 +3861,12 @@ def rolling_mean( A new expression.
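The cumulative docstrings being rewritten here (`cum_count`, `cum_min`, `cum_max`, `cum_prod`) all follow one pattern; a minimal combined sketch, using only the API surface this diff documents (`agnostic_cumulatives` is a hypothetical name, not part of the library):

# Sketch, not part of the diff: combines the cumulative expressions
# documented above in a single dataframe-agnostic function.
import pandas as pd
import narwhals as nw
from narwhals.typing import IntoFrameT

def agnostic_cumulatives(df_native: IntoFrameT) -> IntoFrameT:
    df = nw.from_native(df_native)
    return df.with_columns(
        nw.col("a").cum_count().alias("cum_count"),
        nw.col("a").cum_min().alias("cum_min"),
        nw.col("a").cum_max().alias("cum_max"),
        nw.col("a").cum_prod().alias("cum_prod"),
    ).to_native()

print(agnostic_cumulatives(pd.DataFrame({"a": [1, 3, None, 2]})))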
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3633,7 +3880,8 @@ def rolling_mean( ... b=nw.col("a").rolling_mean(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_mean`: >>> agnostic_rolling_mean(df_pd) a b @@ -3655,7 +3903,7 @@ def rolling_mean( │ 4.0 ┆ 3.0 │ └──────┴─────┘ - >>> agnostic_rolling_mean(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_mean(df_pa) pyarrow.Table a: double b: double @@ -3709,11 +3957,12 @@ def rolling_var( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3727,7 +3976,8 @@ def rolling_var( ... b=nw.col("a").rolling_var(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_var`: >>> agnostic_rolling_var(df_pd) a b @@ -3749,7 +3999,7 @@ def rolling_var( │ 4.0 ┆ 2.0 │ └──────┴──────┘ - >>> agnostic_rolling_var(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_var(df_pa) pyarrow.Table a: double b: double @@ -3801,11 +4051,12 @@ def rolling_std( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1.0, 2.0, None, 4.0]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -3819,7 +4070,8 @@ def rolling_std( ... b=nw.col("a").rolling_std(window_size=3, min_periods=1) ... ).to_native() - We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_rolling_std`: >>> agnostic_rolling_std(df_pd) a b @@ -3841,7 +4093,7 @@ def rolling_std( │ 4.0 ┆ 1.414214 │ └──────┴──────────┘ - >>> agnostic_rolling_std(df_pa) # doctest:+ELLIPSIS + >>> agnostic_rolling_std(df_pa) pyarrow.Table a: double b: double @@ -3862,6 +4114,103 @@ def rolling_std( ) ) + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """Assign ranks to data, dealing with ties appropriately. + + Notes: + The resulting dtype may differ between backends. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) 
+ - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Returns: + A new expression with rank data. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> + >>> data = {"a": [3, 6, 1, 1, 6]} + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> def agnostic_dense_rank(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... result = df.with_columns(rnk=nw.col("a").rank(method="dense")) + ... return result.to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dense_rank`: + + >>> agnostic_dense_rank(pd.DataFrame(data)) + a rnk + 0 3 2.0 + 1 6 3.0 + 2 1 1.0 + 3 1 1.0 + 4 6 3.0 + + >>> agnostic_dense_rank(pl.DataFrame(data)) + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ rnk │ + │ --- ┆ --- │ + │ i64 ┆ u32 │ + ╞═════╪═════╡ + │ 3 ┆ 2 │ + │ 6 ┆ 3 │ + │ 1 ┆ 1 │ + │ 1 ┆ 1 │ + │ 6 ┆ 3 │ + └─────┴─────┘ + + >>> agnostic_dense_rank(pa.table(data)) + pyarrow.Table + a: int64 + rnk: uint64 + ---- + a: [[3,6,1,1,6]] + rnk: [[2,3,1,1,3]] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} + if method not in supported_rank_methods: + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + raise ValueError(msg) + + return self.__class__( + lambda plx: self._to_compliant_expr(plx).rank( + method=method, descending=descending + ) + ) + @property def str(self: Self) -> ExprStringNamespace[Self]: return ExprStringNamespace(self) @@ -3901,8 +4250,10 @@ def get_categories(self: Self) -> ExprT: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", "mango"]} >>> df_pd = pd.DataFrame(data, dtype="category") >>> df_pl = pl.DataFrame(data, schema={"fruits": pl.Categorical}) @@ -3910,17 +4261,19 @@ def get_categories(self: Self) -> ExprT: We define a dataframe-agnostic function to get unique categories from column 'fruits': - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_cat_get_categories(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
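The new `rank` method validates `method` before building the expression, so a typo fails fast with a `ValueError` instead of reaching the backend. A small sketch of the tie-breaking options, assuming only the `rank` signature shown in this diff:

# Sketch: contrasts tie-breaking methods on the docstring's own data.
import polars as pl
import narwhals as nw

df = nw.from_native(pl.DataFrame({"a": [3, 6, 1, 1, 6]}))
print(
    df.with_columns(
        dense=nw.col("a").rank(method="dense"),      # ties share a rank, no gaps
        ordinal=nw.col("a").rank(method="ordinal"),  # every row gets a distinct rank
        desc=nw.col("a").rank(method="dense", descending=True),
    ).to_native()
)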
return df.select(nw.col("fruits").cat.get_categories()).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas or Polars to + `agnostic_cat_get_categories`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_cat_get_categories(df_pd) fruits 0 apple 1 mango - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_cat_get_categories(df_pl) shape: (2, 1) ┌────────┐ │ fruits │ @@ -3949,23 +4302,27 @@ def len_chars(self: Self) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"words": ["foo", "Café", "345", "東京", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_len_chars(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... words_len=nw.col("words").str.len_chars() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_len_chars`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_len_chars(df_pd) words words_len 0 foo 3.0 1 Café 4.0 @@ -3973,7 +4330,7 @@ def len_chars(self: Self) -> ExprT: 3 東京 2.0 4 None NaN - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_len_chars(df_pl) shape: (5, 2) ┌───────┬───────────┐ │ words ┆ words_len │ @@ -3986,6 +4343,14 @@ def len_chars(self: Self) -> ExprT: │ 東京 ┆ 2 │ │ null ┆ null │ └───────┴───────────┘ + + >>> agnostic_str_len_chars(df_pa) + pyarrow.Table + words: string + words_len: int32 + ---- + words: [["foo","Café","345","東京",null]] + words_len: [[3,4,3,2,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.len_chars() @@ -4008,27 +4373,31 @@ def replace( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"foo": ["123abc", "abc abc123"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_replace(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... df = df.with_columns(replaced=nw.col("foo").str.replace("abc", "")) ... 
return df.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_replace`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_replace(df_pd) foo replaced 0 123abc 123 1 abc abc123 abc123 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_replace(df_pl) shape: (2, 2) ┌────────────┬──────────┐ │ foo ┆ replaced │ @@ -4038,6 +4407,14 @@ def replace( │ 123abc ┆ 123 │ │ abc abc123 ┆ abc123 │ └────────────┴──────────┘ + + >>> agnostic_str_replace(df_pa) + pyarrow.Table + foo: string + replaced: string + ---- + foo: [["123abc","abc abc123"]] + replaced: [["123"," abc123"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.replace( @@ -4061,27 +4438,31 @@ def replace_all( Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"foo": ["123abc", "abc abc123"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_replace_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... df = df.with_columns(replaced=nw.col("foo").str.replace_all("abc", "")) ... return df.to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_replace_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_replace_all(df_pd) foo replaced 0 123abc 123 1 abc abc123 123 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_replace_all(df_pl) shape: (2, 2) ┌────────────┬──────────┐ │ foo ┆ replaced │ @@ -4091,6 +4472,14 @@ def replace_all( │ 123abc ┆ 123 │ │ abc abc123 ┆ 123 │ └────────────┴──────────┘ + + >>> agnostic_str_replace_all(df_pa) + pyarrow.Table + foo: string + replaced: string + ---- + foo: [["123abc","abc abc123"]] + replaced: [["123"," 123"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.replace_all( @@ -4102,34 +4491,44 @@ def strip_chars(self: Self, characters: str | None = None) -> ExprT: r"""Remove leading and trailing characters. Arguments: - characters: The set of characters to be removed. All combinations of this set of characters will be stripped from the start and end of the string. If set to None (default), all leading and trailing whitespace is removed instead. + characters: The set of characters to be removed. All combinations of this + set of characters will be stripped from the start and end of the string. + If set to None (default), all leading and trailing whitespace is removed + instead. Returns: A new expression. Examples: + >>> from typing import Any >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrame - >>> from typing import Any + >>> >>> data = {"fruits": ["apple", "\nmango"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrame) -> dict[str, Any]: + >>> def agnostic_str_strip_chars(df_native: IntoFrame) -> dict[str, Any]: ... df = nw.from_native(df_native) ... df = df.with_columns(stripped=nw.col("fruits").str.strip_chars()) ... 
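As the doctest outputs above show, `str.replace` substitutes only the first match by default while `str.replace_all` substitutes every match; a side-by-side sketch:

# Sketch: "abc abc123" keeps its second "abc" under replace,
# but loses both under replace_all.
import pandas as pd
import narwhals as nw

df = nw.from_native(pd.DataFrame({"foo": ["123abc", "abc abc123"]}))
print(
    df.with_columns(
        first_only=nw.col("foo").str.replace("abc", ""),
        every=nw.col("foo").str.replace_all("abc", ""),
    ).to_native()
)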
return df.to_dict(as_series=False) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_strip_chars`: + + >>> agnostic_str_strip_chars(df_pd) + {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_strip_chars(df_pl) {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_strip_chars(df_pa) {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']} """ return self._expr.__class__( @@ -4148,29 +4547,33 @@ def starts_with(self: Self, prefix: str) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_starts_with(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... has_prefix=nw.col("fruits").str.starts_with("app") ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_starts_with`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_starts_with(df_pd) fruits has_prefix 0 apple True 1 mango False 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_starts_with(df_pl) shape: (3, 2) ┌────────┬────────────┐ │ fruits ┆ has_prefix │ @@ -4181,6 +4584,14 @@ def starts_with(self: Self, prefix: str) -> ExprT: │ mango ┆ false │ │ null ┆ null │ └────────┴────────────┘ + + >>> agnostic_str_starts_with(df_pa) + pyarrow.Table + fruits: string + has_prefix: bool + ---- + fruits: [["apple","mango",null]] + has_prefix: [[true,false,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.starts_with(prefix) @@ -4198,29 +4609,33 @@ def ends_with(self: Self, suffix: str) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_ends_with(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... has_suffix=nw.col("fruits").str.ends_with("ngo") ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_ends_with`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_ends_with(df_pd) fruits has_suffix 0 apple False 1 mango True 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_ends_with(df_pl) shape: (3, 2) ┌────────┬────────────┐ │ fruits ┆ has_suffix │ @@ -4231,6 +4646,14 @@ def ends_with(self: Self, suffix: str) -> ExprT: │ mango ┆ true │ │ null ┆ null │ └────────┴────────────┘ + + >>> agnostic_str_ends_with(df_pa) + pyarrow.Table + fruits: string + has_suffix: bool + ---- + fruits: [["apple","mango",null]] + has_suffix: [[false,true,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.ends_with(suffix) @@ -4253,6 +4676,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"pets": ["cat", "dog", "rabbit and parrot", "dove", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -4260,7 +4684,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: We define a dataframe-agnostic function: - >>> def agnostic_contains(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_contains(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... default_match=nw.col("pets").str.contains("parrot|Dove"), @@ -4270,9 +4694,10 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: ... ), ... ).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_contains`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_contains`: - >>> agnostic_contains(df_pd) + >>> agnostic_str_contains(df_pd) pets default_match case_insensitive_match literal_match 0 cat False False False 1 dog False False False @@ -4280,7 +4705,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: 3 dove False True False 4 None None None None - >>> agnostic_contains(df_pl) + >>> agnostic_str_contains(df_pl) shape: (5, 4) ┌───────────────────┬───────────────┬────────────────────────┬───────────────┐ │ pets ┆ default_match ┆ case_insensitive_match ┆ literal_match │ @@ -4294,7 +4719,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: │ null ┆ null ┆ null ┆ null │ └───────────────────┴───────────────┴────────────────────────┴───────────────┘ - >>> agnostic_contains(df_pa) + >>> agnostic_str_contains(df_pa) pyarrow.Table pets: string default_match: bool @@ -4326,30 +4751,34 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"s": ["pear", None, "papaya", "dragonfruit"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_slice(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... s_sliced=nw.col("s").str.slice(4, length=3) ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_slice`: - >>> my_library_agnostic_function(df_pd) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_str_slice(df_pd) # doctest: +NORMALIZE_WHITESPACE s s_sliced 0 pear 1 None None 2 papaya ya 3 dragonfruit onf - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_slice(df_pl) shape: (4, 2) ┌─────────────┬──────────┐ │ s ┆ s_sliced │ @@ -4362,20 +4791,28 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT: │ dragonfruit ┆ onf │ └─────────────┴──────────┘ + >>> agnostic_str_slice(df_pa) + pyarrow.Table + s: string + s_sliced: string + ---- + s: [["pear",null,"papaya","dragonfruit"]] + s_sliced: [["",null,"ya","onf"]] + Using negative indexes: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_slice_negative(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(s_sliced=nw.col("s").str.slice(-3)).to_native() - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_slice_negative(df_pd) s s_sliced 0 pear ear 1 None None 2 papaya aya 3 dragonfruit uit - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_slice_negative(df_pl) shape: (4, 2) ┌─────────────┬──────────┐ │ s ┆ s_sliced │ @@ -4387,6 +4824,14 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT: │ papaya ┆ aya │ │ dragonfruit ┆ uit │ └─────────────┴──────────┘ + + >>> agnostic_str_slice_negative(df_pa) + pyarrow.Table + s: string + s_sliced: string + ---- + s: [["pear",null,"papaya","dragonfruit"]] + s_sliced: [["ear",null,"aya","uit"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.slice( @@ -4409,30 +4854,34 @@ def head(self: Self, n: int = 5) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_head(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... lyrics_head=nw.col("lyrics").str.head() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_head`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_head(df_pd) lyrics lyrics_head 0 Atatata Atata 1 taata taata 2 taatatata taata 3 zukkyun zukky - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_head(df_pl) shape: (4, 2) ┌───────────┬─────────────┐ │ lyrics ┆ lyrics_head │ @@ -4444,6 +4893,14 @@ def head(self: Self, n: int = 5) -> ExprT: │ taatatata ┆ taata │ │ zukkyun ┆ zukky │ └───────────┴─────────────┘ + + >>> agnostic_str_head(df_pa) + pyarrow.Table + lyrics: string + lyrics_head: string + ---- + lyrics: [["Atatata","taata","taatatata","zukkyun"]] + lyrics_head: [["Atata","taata","taata","zukky"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.slice(0, n) @@ -4464,30 +4921,34 @@ def tail(self: Self, n: int = 5) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"lyrics": ["Atatata", "taata", "taatatata", "zukkyun"]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_tail(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... lyrics_tail=nw.col("lyrics").str.tail() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_tail`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_tail(df_pd) lyrics lyrics_tail 0 Atatata atata 1 taata taata 2 taatatata atata 3 zukkyun kkyun - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_tail(df_pl) shape: (4, 2) ┌───────────┬─────────────┐ │ lyrics ┆ lyrics_tail │ @@ -4499,6 +4960,14 @@ def tail(self: Self, n: int = 5) -> ExprT: │ taatatata ┆ atata │ │ zukkyun ┆ kkyun │ └───────────┴─────────────┘ + + >>> agnostic_str_tail(df_pa) + pyarrow.Table + lyrics: string + lyrics_tail: string + ---- + lyrics: [["Atatata","taata","taatatata","zukkyun"]] + lyrics_tail: [["atata","taata","atata","kkyun"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.slice( @@ -4532,6 +5001,7 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = ["2020-01-01", "2020-01-02"] >>> df_pd = pd.DataFrame({"a": data}) >>> df_pl = pl.DataFrame({"a": data}) @@ -4539,19 +5009,21 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_to_datetime(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").str.to_datetime(format="%Y-%m-%d") ... 
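Per the implementation above, `str.head(n)` is simply `str.slice(0, n)`, and `str.tail(n)` appears to reduce to a negative-offset slice (the truncated hunk suggests this but does not show it in full); a sketch checking that the pairs agree under that assumption:

# Sketch: head/tail expressed through slice should produce identical columns.
import polars as pl
import narwhals as nw

df = nw.from_native(pl.DataFrame({"s": ["dragonfruit", "papaya"]}))
print(
    df.with_columns(
        head=nw.col("s").str.head(3),
        head_via_slice=nw.col("s").str.slice(0, length=3),
        tail=nw.col("s").str.tail(3),
        tail_via_slice=nw.col("s").str.slice(-3),
    ).to_native()
)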
).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_datetime`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_to_datetime(df_pd) a 0 2020-01-01 1 2020-01-02 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_str_to_datetime(df_pl) shape: (2, 1) ┌─────────────────────┐ │ a │ @@ -4561,7 +5033,8 @@ def to_datetime(self: Self, format: str | None = None) -> ExprT: # noqa: A002 │ 2020-01-01 00:00:00 │ │ 2020-01-02 00:00:00 │ └─────────────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_str_to_datetime(df_pa) pyarrow.Table a: timestamp[us] ---- @@ -4585,29 +5058,33 @@ def to_uppercase(self: Self) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["apple", "mango", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_to_uppercase(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... upper_col=nw.col("fruits").str.to_uppercase() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_uppercase`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_to_uppercase(df_pd) fruits upper_col 0 apple APPLE 1 mango MANGO 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_to_uppercase(df_pl) shape: (3, 2) ┌────────┬───────────┐ │ fruits ┆ upper_col │ @@ -4619,6 +5096,13 @@ def to_uppercase(self: Self) -> ExprT: │ null ┆ null │ └────────┴───────────┘ + >>> agnostic_str_to_uppercase(df_pa) + pyarrow.Table + fruits: string + upper_col: string + ---- + fruits: [["apple","mango",null]] + upper_col: [["APPLE","MANGO",null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.to_uppercase() @@ -4633,29 +5117,33 @@ def to_lowercase(self: Self) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"fruits": ["APPLE", "MANGO", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_str_to_lowercase(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... lower_col=nw.col("fruits").str.to_lowercase() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_str_to_lowercase`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_str_to_lowercase(df_pd) fruits lower_col 0 APPLE apple 1 MANGO mango 2 None None - >>> my_library_agnostic_function(df_pl) + >>> agnostic_str_to_lowercase(df_pl) shape: (3, 2) ┌────────┬───────────┐ │ fruits ┆ lower_col │ @@ -4666,6 +5154,14 @@ def to_lowercase(self: Self) -> ExprT: │ MANGO ┆ mango │ │ null ┆ null │ └────────┴───────────┘ + + >>> agnostic_str_to_lowercase(df_pa) + pyarrow.Table + fruits: string + lower_col: string + ---- + fruits: [["APPLE","MANGO",null]] + lower_col: [["apple","mango",null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.to_lowercase() @@ -4686,29 +5182,33 @@ def date(self: Self) -> ExprT: NotImplementedError: If pandas default backend is being used. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]} >>> df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a library agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_date(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a").dt.date()).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_date`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_date(df_pd) a 0 2012-01-07 1 2023-03-10 - >>> my_library_agnostic_function(df_pl) # docetst + >>> agnostic_dt_date(df_pl) shape: (2, 1) ┌────────────┐ │ a │ @@ -4718,6 +5218,12 @@ def date(self: Self) -> ExprT: │ 2012-01-07 │ │ 2023-03-10 │ └────────────┘ + + >>> agnostic_dt_date(df_pa) + pyarrow.Table + a: date32[day] + ---- + a: [[2012-01-07,2023-03-10]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.date() @@ -4732,11 +5238,13 @@ def year(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 6, 1), @@ -4746,23 +5254,26 @@ def year(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_year(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("datetime").dt.year().alias("year") ... 
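The `dt.date` docstring above is the one example that cannot use a plain pandas frame: with the default NumPy backend it raises `NotImplementedError`, which is why the doctest converts to a PyArrow-backed frame first. Isolated as a runnable sketch (requires pandas with pyarrow installed):

# Sketch: dt.date needs a non-default pandas backend, per the Raises note above.
from datetime import datetime
import pandas as pd
import narwhals as nw

data = {"a": [datetime(2012, 1, 7, 10, 20), datetime(2023, 3, 10, 11, 32)]}
df_pd = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow")
print(nw.from_native(df_pd).select(nw.col("a").dt.date()).to_native())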
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_year`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_year(df_pd) datetime year 0 1978-06-01 1978 1 2024-12-13 2024 2 2065-01-01 2065 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_year(df_pl) shape: (3, 2) ┌─────────────────────┬──────┐ │ datetime ┆ year │ @@ -4773,6 +5284,14 @@ def year(self: Self) -> ExprT: │ 2024-12-13 00:00:00 ┆ 2024 │ │ 2065-01-01 00:00:00 ┆ 2065 │ └─────────────────────┴──────┘ + + >>> agnostic_dt_year(df_pa) + pyarrow.Table + datetime: timestamp[us] + year: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + year: [[1978,2024,2065]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.year() @@ -4787,11 +5306,13 @@ def month(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 6, 1), @@ -4801,34 +5322,44 @@ def month(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_month(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.year().alias("year"), ... nw.col("datetime").dt.month().alias("month"), ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_month`: - >>> my_library_agnostic_function(df_pd) - datetime year month - 0 1978-06-01 1978 6 - 1 2024-12-13 2024 12 - 2 2065-01-01 2065 1 - >>> my_library_agnostic_function(df_pl) - shape: (3, 3) - ┌─────────────────────┬──────┬───────┐ - │ datetime ┆ year ┆ month │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i32 ┆ i8 │ - ╞═════════════════════╪══════╪═══════╡ - │ 1978-06-01 00:00:00 ┆ 1978 ┆ 6 │ - │ 2024-12-13 00:00:00 ┆ 2024 ┆ 12 │ - │ 2065-01-01 00:00:00 ┆ 2065 ┆ 1 │ - └─────────────────────┴──────┴───────┘ + >>> agnostic_dt_month(df_pd) + datetime month + 0 1978-06-01 6 + 1 2024-12-13 12 + 2 2065-01-01 1 + + >>> agnostic_dt_month(df_pl) + shape: (3, 2) + ┌─────────────────────┬───────┐ + │ datetime ┆ month │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪═══════╡ + │ 1978-06-01 00:00:00 ┆ 6 │ + │ 2024-12-13 00:00:00 ┆ 12 │ + │ 2065-01-01 00:00:00 ┆ 1 │ + └─────────────────────┴───────┘ + + >>> agnostic_dt_month(df_pa) + pyarrow.Table + datetime: timestamp[us] + month: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + month: [[6,12,1]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.month() @@ -4843,11 +5374,13 @@ def day(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... 
datetime(1978, 6, 1), @@ -4857,35 +5390,44 @@ def day(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_day(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.year().alias("year"), - ... nw.col("datetime").dt.month().alias("month"), ... nw.col("datetime").dt.day().alias("day"), ... ).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime year month day - 0 1978-06-01 1978 6 1 - 1 2024-12-13 2024 12 13 - 2 2065-01-01 2065 1 1 - >>> my_library_agnostic_function(df_pl) - shape: (3, 4) - ┌─────────────────────┬──────┬───────┬─────┐ - │ datetime ┆ year ┆ month ┆ day │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i32 ┆ i8 ┆ i8 │ - ╞═════════════════════╪══════╪═══════╪═════╡ - │ 1978-06-01 00:00:00 ┆ 1978 ┆ 6 ┆ 1 │ - │ 2024-12-13 00:00:00 ┆ 2024 ┆ 12 ┆ 13 │ - │ 2065-01-01 00:00:00 ┆ 2065 ┆ 1 ┆ 1 │ - └─────────────────────┴──────┴───────┴─────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_day`: + + >>> agnostic_dt_day(df_pd) + datetime day + 0 1978-06-01 1 + 1 2024-12-13 13 + 2 2065-01-01 1 + + >>> agnostic_dt_day(df_pl) + shape: (3, 2) + ┌─────────────────────┬─────┐ + │ datetime ┆ day │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪═════╡ + │ 1978-06-01 00:00:00 ┆ 1 │ + │ 2024-12-13 00:00:00 ┆ 13 │ + │ 2065-01-01 00:00:00 ┆ 1 │ + └─────────────────────┴─────┘ + + >>> agnostic_dt_day(df_pa) + pyarrow.Table + datetime: timestamp[us] + day: int64 + ---- + datetime: [[1978-06-01 00:00:00.000000,2024-12-13 00:00:00.000000,2065-01-01 00:00:00.000000]] + day: [[1,13,1]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.day() @@ -4900,11 +5442,13 @@ def hour(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1), @@ -4914,23 +5458,26 @@ def hour(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_hour(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("datetime").dt.hour().alias("hour") ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_hour`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_hour(df_pd) datetime hour 0 1978-01-01 01:00:00 1 1 2024-10-13 05:00:00 5 2 2065-01-01 10:00:00 10 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_hour(df_pl) shape: (3, 2) ┌─────────────────────┬──────┐ │ datetime ┆ hour │ @@ -4941,6 +5488,14 @@ def hour(self: Self) -> ExprT: │ 2024-10-13 05:00:00 ┆ 5 │ │ 2065-01-01 10:00:00 ┆ 10 │ └─────────────────────┴──────┘ + + >>> agnostic_dt_hour(df_pa) + pyarrow.Table + datetime: timestamp[us] + hour: int64 + ---- + datetime: [[1978-01-01 01:00:00.000000,2024-10-13 05:00:00.000000,2065-01-01 10:00:00.000000]] + hour: [[1,5,10]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.hour() @@ -4955,11 +5510,13 @@ def minute(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1), @@ -4969,34 +5526,44 @@ def minute(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_minute(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), ... nw.col("datetime").dt.minute().alias("minute"), ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_minute`: - >>> my_library_agnostic_function(df_pd) - datetime hour minute - 0 1978-01-01 01:01:00 1 1 - 1 2024-10-13 05:30:00 5 30 - 2 2065-01-01 10:20:00 10 20 - >>> my_library_agnostic_function(df_pl) - shape: (3, 3) - ┌─────────────────────┬──────┬────────┐ - │ datetime ┆ hour ┆ minute │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 │ - ╞═════════════════════╪══════╪════════╡ - │ 1978-01-01 01:01:00 ┆ 1 ┆ 1 │ - │ 2024-10-13 05:30:00 ┆ 5 ┆ 30 │ - │ 2065-01-01 10:20:00 ┆ 10 ┆ 20 │ - └─────────────────────┴──────┴────────┘ + >>> agnostic_dt_minute(df_pd) + datetime minute + 0 1978-01-01 01:01:00 1 + 1 2024-10-13 05:30:00 30 + 2 2065-01-01 10:20:00 20 + + >>> agnostic_dt_minute(df_pl) + shape: (3, 2) + ┌─────────────────────┬────────┐ + │ datetime ┆ minute │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪════════╡ + │ 1978-01-01 01:01:00 ┆ 1 │ + │ 2024-10-13 05:30:00 ┆ 30 │ + │ 2065-01-01 10:20:00 ┆ 20 │ + └─────────────────────┴────────┘ + + >>> agnostic_dt_minute(df_pa) + pyarrow.Table + datetime: timestamp[us] + minute: int64 + ---- + datetime: [[1978-01-01 01:01:00.000000,2024-10-13 05:30:00.000000,2065-01-01 10:20:00.000000]] + minute: [[1,30,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.minute() @@ -5009,11 +5576,13 @@ def second(self: Self) -> ExprT: A new expression. 
Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1), @@ -5023,35 +5592,44 @@ def second(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_second(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), ... nw.col("datetime").dt.second().alias("second"), ... ).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second - 0 1978-01-01 01:01:01 1 1 1 - 1 2024-10-13 05:30:14 5 30 14 - 2 2065-01-01 10:20:30 10 20 30 - >>> my_library_agnostic_function(df_pl) - shape: (3, 4) - ┌─────────────────────┬──────┬────────┬────────┐ - │ datetime ┆ hour ┆ minute ┆ second │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 │ - ╞═════════════════════╪══════╪════════╪════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 │ - │ 2024-10-13 05:30:14 ┆ 5 ┆ 30 ┆ 14 │ - │ 2065-01-01 10:20:30 ┆ 10 ┆ 20 ┆ 30 │ - └─────────────────────┴──────┴────────┴────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_second`: + + >>> agnostic_dt_second(df_pd) + datetime second + 0 1978-01-01 01:01:01 1 + 1 2024-10-13 05:30:14 14 + 2 2065-01-01 10:20:30 30 + + >>> agnostic_dt_second(df_pl) + shape: (3, 2) + ┌─────────────────────┬────────┐ + │ datetime ┆ second │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i8 │ + ╞═════════════════════╪════════╡ + │ 1978-01-01 01:01:01 ┆ 1 │ + │ 2024-10-13 05:30:14 ┆ 14 │ + │ 2065-01-01 10:20:30 ┆ 30 │ + └─────────────────────┴────────┘ + + >>> agnostic_dt_second(df_pa) + pyarrow.Table + datetime: timestamp[us] + second: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.000000,2065-01-01 10:20:30.000000]] + second: [[1,14,30]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.second() @@ -5064,11 +5642,13 @@ def millisecond(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1, 0), @@ -5078,36 +5658,44 @@ def millisecond(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_millisecond(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), - ... nw.col("datetime").dt.second().alias("second"), ... nw.col("datetime").dt.millisecond().alias("millisecond"), ... 
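The pre-diff examples combined several `dt` accessors in one call; the rewritten docstrings show each accessor on its own, but they still compose freely in a single `with_columns`, e.g.:

# Sketch: the component accessors documented above, applied together.
from datetime import datetime
import polars as pl
import narwhals as nw

df = nw.from_native(pl.DataFrame({"ts": [datetime(2024, 10, 13, 5, 30, 14)]}))
print(
    df.with_columns(
        year=nw.col("ts").dt.year(),
        month=nw.col("ts").dt.month(),
        day=nw.col("ts").dt.day(),
        hour=nw.col("ts").dt.hour(),
        minute=nw.col("ts").dt.minute(),
        second=nw.col("ts").dt.second(),
    ).to_native()
)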
).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second millisecond - 0 1978-01-01 01:01:01.000 1 1 1 0 - 1 2024-10-13 05:30:14.505 5 30 14 505 - 2 2065-01-01 10:20:30.067 10 20 30 67 - >>> my_library_agnostic_function(df_pl) - shape: (3, 5) - ┌─────────────────────────┬──────┬────────┬────────┬─────────────┐ - │ datetime ┆ hour ┆ minute ┆ second ┆ millisecond │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ - ╞═════════════════════════╪══════╪════════╪════════╪═════════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2024-10-13 05:30:14.505 ┆ 5 ┆ 30 ┆ 14 ┆ 505 │ - │ 2065-01-01 10:20:30.067 ┆ 10 ┆ 20 ┆ 30 ┆ 67 │ - └─────────────────────────┴──────┴────────┴────────┴─────────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_millisecond`: + + >>> agnostic_dt_millisecond(df_pd) + datetime millisecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.505 505 + 2 2065-01-01 10:20:30.067 67 + + >>> agnostic_dt_millisecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬─────────────┐ + │ datetime ┆ millisecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 505 │ + │ 2065-01-01 10:20:30.067 ┆ 67 │ + └─────────────────────────┴─────────────┘ + + >>> agnostic_dt_millisecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + millisecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] + millisecond: [[0,505,67]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.millisecond() @@ -5120,11 +5708,13 @@ def microsecond(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1, 0), @@ -5134,36 +5724,44 @@ def microsecond(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_microsecond(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), - ... nw.col("datetime").dt.second().alias("second"), ... nw.col("datetime").dt.microsecond().alias("microsecond"), ... 
).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second microsecond - 0 1978-01-01 01:01:01.000 1 1 1 0 - 1 2024-10-13 05:30:14.505 5 30 14 505000 - 2 2065-01-01 10:20:30.067 10 20 30 67000 - >>> my_library_agnostic_function(df_pl) - shape: (3, 5) - ┌─────────────────────────┬──────┬────────┬────────┬─────────────┐ - │ datetime ┆ hour ┆ minute ┆ second ┆ microsecond │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ - ╞═════════════════════════╪══════╪════════╪════════╪═════════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2024-10-13 05:30:14.505 ┆ 5 ┆ 30 ┆ 14 ┆ 505000 │ - │ 2065-01-01 10:20:30.067 ┆ 10 ┆ 20 ┆ 30 ┆ 67000 │ - └─────────────────────────┴──────┴────────┴────────┴─────────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_microsecond`: + + >>> agnostic_dt_microsecond(df_pd) + datetime microsecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.505 505000 + 2 2065-01-01 10:20:30.067 67000 + + >>> agnostic_dt_microsecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬─────────────┐ + │ datetime ┆ microsecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪═════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.505 ┆ 505000 │ + │ 2065-01-01 10:20:30.067 ┆ 67000 │ + └─────────────────────────┴─────────────┘ + + >>> agnostic_dt_microsecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + microsecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.505000,2065-01-01 10:20:30.067000]] + microsecond: [[0,505000,67000]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.microsecond() @@ -5176,11 +5774,13 @@ def nanosecond(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "datetime": [ ... datetime(1978, 1, 1, 1, 1, 1, 0), @@ -5190,36 +5790,44 @@ def nanosecond(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_nanosecond(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( - ... nw.col("datetime").dt.hour().alias("hour"), - ... nw.col("datetime").dt.minute().alias("minute"), - ... nw.col("datetime").dt.second().alias("second"), ... nw.col("datetime").dt.nanosecond().alias("nanosecond"), ... 
).to_native() - We can then pass either pandas or Polars to `func`: - - >>> my_library_agnostic_function(df_pd) - datetime hour minute second nanosecond - 0 1978-01-01 01:01:01.000 1 1 1 0 - 1 2024-10-13 05:30:14.500 5 30 14 500000000 - 2 2065-01-01 10:20:30.060 10 20 30 60000000 - >>> my_library_agnostic_function(df_pl) - shape: (3, 5) - ┌─────────────────────────┬──────┬────────┬────────┬────────────┐ - │ datetime ┆ hour ┆ minute ┆ second ┆ nanosecond │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ i8 ┆ i8 ┆ i8 ┆ i32 │ - ╞═════════════════════════╪══════╪════════╪════════╪════════════╡ - │ 1978-01-01 01:01:01 ┆ 1 ┆ 1 ┆ 1 ┆ 0 │ - │ 2024-10-13 05:30:14.500 ┆ 5 ┆ 30 ┆ 14 ┆ 500000000 │ - │ 2065-01-01 10:20:30.060 ┆ 10 ┆ 20 ┆ 30 ┆ 60000000 │ - └─────────────────────────┴──────┴────────┴────────┴────────────┘ + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_nanosecond`: + + >>> agnostic_dt_nanosecond(df_pd) + datetime nanosecond + 0 1978-01-01 01:01:01.000 0 + 1 2024-10-13 05:30:14.500 500000000 + 2 2065-01-01 10:20:30.060 60000000 + + >>> agnostic_dt_nanosecond(df_pl) + shape: (3, 2) + ┌─────────────────────────┬────────────┐ + │ datetime ┆ nanosecond │ + │ --- ┆ --- │ + │ datetime[μs] ┆ i32 │ + ╞═════════════════════════╪════════════╡ + │ 1978-01-01 01:01:01 ┆ 0 │ + │ 2024-10-13 05:30:14.500 ┆ 500000000 │ + │ 2065-01-01 10:20:30.060 ┆ 60000000 │ + └─────────────────────────┴────────────┘ + + >>> agnostic_dt_nanosecond(df_pa) + pyarrow.Table + datetime: timestamp[us] + nanosecond: int64 + ---- + datetime: [[1978-01-01 01:01:01.000000,2024-10-13 05:30:14.500000,2065-01-01 10:20:30.060000]] + nanosecond: [[0,500000000,60000000]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.nanosecond() @@ -5232,30 +5840,35 @@ def ordinal_day(self: Self) -> ExprT: A new expression. Examples: + >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl - >>> from datetime import datetime + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_ordinal_day(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_ordinal_day=nw.col("a").dt.ordinal_day() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_ordinal_day`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_ordinal_day(df_pd) a a_ordinal_day 0 2020-01-01 1 1 2020-08-03 216 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_ordinal_day(df_pl) shape: (2, 2) ┌─────────────────────┬───────────────┐ │ a ┆ a_ordinal_day │ │ --- ┆ --- │ @@ -5265,6 +5878,14 @@ def ordinal_day(self: Self) -> ExprT: │ 2020-01-01 00:00:00 ┆ 1 │ │ 2020-08-03 00:00:00 ┆ 216 │ └─────────────────────┴───────────────┘ + + >>> agnostic_dt_ordinal_day(df_pa) + pyarrow.Table + a: timestamp[us] + a_ordinal_day: int64 + ---- + a: [[2020-01-01 00:00:00.000000,2020-08-03 00:00:00.000000]] + a_ordinal_day: [[1,216]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.ordinal_day() ) @@ -5276,7 +5897,6 @@ def weekday(self: Self) -> ExprT: Returns: Returns the ISO weekday number where monday = 1 and sunday = 7 - Examples: >>> from datetime import datetime >>> import pandas as pd @@ -5284,6 +5904,7 @@ def weekday(self: Self) -> ExprT: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) @@ -5291,17 +5912,19 @@ def weekday(self: Self) -> ExprT: We define a dataframe-agnostic function: - >>> def agnostic_weekday(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_weekday(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(a_weekday=nw.col("a").dt.weekday()).to_native() - We can then pass either pandas, Polars, PyArrow, and other supported libraries to `agnostic_weekday`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_weekday`: - >>> agnostic_weekday(df_pd) + >>> agnostic_dt_weekday(df_pd) a a_weekday 0 2020-01-01 3 1 2020-08-03 1 - >>> agnostic_weekday(df_pl) + + >>> agnostic_dt_weekday(df_pl) shape: (2, 2) ┌─────────────────────┬───────────┐ │ a ┆ a_weekday │ │ --- ┆ --- │ │ datetime[μs] ┆ i8 │ ╞═════════════════════╪═══════════╡ │ 2020-01-01 00:00:00 ┆ 3 │ │ 2020-08-03 00:00:00 ┆ 1 │ └─────────────────────┴───────────┘ - >>> agnostic_weekday(df_pa) + + >>> agnostic_dt_weekday(df_pa) pyarrow.Table a: timestamp[us] a_weekday: int64 @@ -5335,30 +5959,35 @@ def total_minutes(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [timedelta(minutes=10), timedelta(minutes=20, seconds=40)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_minutes(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_minutes=nw.col("a").dt.total_minutes() ...
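Per the docstrings above, `dt.weekday` follows the ISO convention (Monday = 1, Sunday = 7) and `dt.ordinal_day` counts days from January 1; 2020-01-01 was a Wednesday, hence weekday 3 and ordinal day 1 in the outputs. A combined sketch on the PyArrow backend:

# Sketch: ISO weekday and ordinal day side by side.
from datetime import datetime
import pyarrow as pa
import narwhals as nw

tbl = pa.table({"a": [datetime(2020, 1, 1), datetime(2020, 8, 3)]})
print(
    nw.from_native(tbl)
    .with_columns(
        a_weekday=nw.col("a").dt.weekday(),
        a_ordinal_day=nw.col("a").dt.ordinal_day(),
    )
    .to_native()
)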
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_minutes`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_minutes(df_pd) a a_total_minutes 0 0 days 00:10:00 10 1 0 days 00:20:40 20 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_minutes(df_pl) shape: (2, 2) ┌──────────────┬─────────────────┐ │ a ┆ a_total_minutes │ @@ -5368,6 +5997,14 @@ def total_minutes(self: Self) -> ExprT: │ 10m ┆ 10 │ │ 20m 40s ┆ 20 │ └──────────────┴─────────────────┘ + + >>> agnostic_dt_total_minutes(df_pa) + pyarrow.Table + a: duration[us] + a_total_minutes: int64 + ---- + a: [[600000000,1240000000]] + a_total_minutes: [[10,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_minutes() @@ -5385,30 +6022,35 @@ def total_seconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [timedelta(seconds=10), timedelta(seconds=20, milliseconds=40)]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_seconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_seconds=nw.col("a").dt.total_seconds() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_seconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_seconds(df_pd) a a_total_seconds 0 0 days 00:00:10 10 1 0 days 00:00:20.040000 20 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_seconds(df_pl) shape: (2, 2) ┌──────────────┬─────────────────┐ │ a ┆ a_total_seconds │ @@ -5418,6 +6060,14 @@ def total_seconds(self: Self) -> ExprT: │ 10s ┆ 10 │ │ 20s 40ms ┆ 20 │ └──────────────┴─────────────────┘ + + >>> agnostic_dt_total_seconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_seconds: int64 + ---- + a: [[10000000,20040000]] + a_total_seconds: [[10,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_seconds() @@ -5435,11 +6085,13 @@ def total_milliseconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... timedelta(milliseconds=10), @@ -5448,22 +6100,25 @@ def total_milliseconds(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_milliseconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_milliseconds=nw.col("a").dt.total_milliseconds() ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_milliseconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_milliseconds(df_pd) a a_total_milliseconds 0 0 days 00:00:00.010000 10 1 0 days 00:00:00.020040 20 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_milliseconds(df_pl) shape: (2, 2) ┌──────────────┬──────────────────────┐ │ a ┆ a_total_milliseconds │ @@ -5473,6 +6128,14 @@ def total_milliseconds(self: Self) -> ExprT: │ 10ms ┆ 10 │ │ 20040µs ┆ 20 │ └──────────────┴──────────────────────┘ + + >>> agnostic_dt_total_milliseconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_milliseconds: int64 + ---- + a: [[10000,20040]] + a_total_milliseconds: [[10,20]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_milliseconds() @@ -5490,11 +6153,13 @@ def total_microseconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... timedelta(microseconds=10), @@ -5503,22 +6168,25 @@ def total_microseconds(self: Self) -> ExprT: ... } >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_microseconds(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... a_total_microseconds=nw.col("a").dt.total_microseconds() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_microseconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_microseconds(df_pd) a a_total_microseconds 0 0 days 00:00:00.000010 10 1 0 days 00:00:00.001200 1200 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_microseconds(df_pl) shape: (2, 2) ┌──────────────┬──────────────────────┐ │ a ┆ a_total_microseconds │ @@ -5528,6 +6196,14 @@ def total_microseconds(self: Self) -> ExprT: │ 10µs ┆ 10 │ │ 1200µs ┆ 1200 │ └──────────────┴──────────────────────┘ + + >>> agnostic_dt_total_microseconds(df_pa) + pyarrow.Table + a: duration[us] + a_total_microseconds: int64 + ---- + a: [[10,1200]] + a_total_microseconds: [[10,1200]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.total_microseconds() @@ -5545,11 +6221,12 @@ def total_nanoseconds(self: Self) -> ExprT: consider using `fill_null()` and `cast` in this case. Examples: + >>> from datetime import timedelta >>> import pandas as pd >>> import polars as pl - >>> from datetime import timedelta >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = ["2024-01-01 00:00:00.000000001", "2024-01-01 00:00:00.000000002"] >>> df_pd = pd.DataFrame({"a": pd.to_datetime(data)}) >>> df_pl = pl.DataFrame({"a": data}).with_columns( @@ -5558,19 +6235,21 @@ def total_nanoseconds(self: Self) -> ExprT: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_total_nanoseconds(df_native: IntoFrameT) -> IntoFrameT: ... 
df = nw.from_native(df_native) ... return df.with_columns( ... a_diff_total_nanoseconds=nw.col("a").diff().dt.total_nanoseconds() ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_total_nanoseconds`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_total_nanoseconds(df_pd) a a_diff_total_nanoseconds 0 2024-01-01 00:00:00.000000001 NaN 1 2024-01-01 00:00:00.000000002 1.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_total_nanoseconds(df_pl) shape: (2, 2) ┌───────────────────────────────┬──────────────────────────┐ │ a ┆ a_diff_total_nanoseconds │ @@ -5629,33 +6308,39 @@ def to_string(self: Self, format: str) -> ExprT: # noqa: A002 >>> from datetime import datetime >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> data = [ - ... datetime(2020, 3, 1), - ... datetime(2020, 4, 1), - ... datetime(2020, 5, 1), - ... ] - >>> df_pd = pd.DataFrame({"a": data}) - >>> df_pl = pl.DataFrame({"a": data}) + >>> + >>> data = { + ... "a": [ + ... datetime(2020, 3, 1), + ... datetime(2020, 4, 1), + ... datetime(2020, 5, 1), + ... ] + ... } + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_to_string(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").dt.to_string("%Y/%m/%d %H:%M:%S") ... ).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_to_string`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_to_string(df_pd) a 0 2020/03/01 00:00:00 1 2020/04/01 00:00:00 2 2020/05/01 00:00:00 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_dt_to_string(df_pl) shape: (3, 1) ┌─────────────────────┐ │ a │ @@ -5666,6 +6351,12 @@ def to_string(self: Self, format: str) -> ExprT: # noqa: A002 │ 2020/04/01 00:00:00 │ │ 2020/05/01 00:00:00 │ └─────────────────────┘ + + >>> agnostic_dt_to_string(df_pa) + pyarrow.Table + a: string + ---- + a: [["2020/03/01 00:00:00.000000","2020/04/01 00:00:00.000000","2020/05/01 00:00:00.000000"]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).dt.to_string(format) @@ -5682,11 +6373,12 @@ def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: Examples: >>> from datetime import datetime, timezone - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... datetime(2024, 1, 1, tzinfo=timezone.utc), @@ -5699,19 +6391,21 @@ def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_replace_time_zone(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").dt.replace_time_zone("Asia/Kathmandu") ... 
).to_native() - We can then pass pandas / PyArrow / Polars / any other supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_replace_time_zone`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_replace_time_zone(df_pd) a 0 2024-01-01 00:00:00+05:45 1 2024-01-02 00:00:00+05:45 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_replace_time_zone(df_pl) shape: (2, 1) ┌──────────────────────────────┐ │ a │ @@ -5721,7 +6415,8 @@ def replace_time_zone(self: Self, time_zone: str | None) -> ExprT: │ 2024-01-01 00:00:00 +0545 │ │ 2024-01-02 00:00:00 +0545 │ └──────────────────────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_dt_replace_time_zone(df_pa) pyarrow.Table a: timestamp[us, tz=Asia/Kathmandu] ---- @@ -5745,11 +6440,12 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: Examples: >>> from datetime import datetime, timezone - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [ ... datetime(2024, 1, 1, tzinfo=timezone.utc), @@ -5762,19 +6458,21 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_convert_time_zone(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.col("a").dt.convert_time_zone("Asia/Kathmandu") ... ).to_native() - We can then pass pandas / PyArrow / Polars / any other supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_convert_time_zone`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_convert_time_zone(df_pd) a 0 2024-01-01 05:45:00+05:45 1 2024-01-02 05:45:00+05:45 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_convert_time_zone(df_pl) shape: (2, 1) ┌──────────────────────────────┐ │ a │ @@ -5784,7 +6482,8 @@ def convert_time_zone(self: Self, time_zone: str) -> ExprT: │ 2024-01-01 05:45:00 +0545 │ │ 2024-01-02 05:45:00 +0545 │ └──────────────────────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_dt_convert_time_zone(df_pa) pyarrow.Table a: timestamp[us, tz=Asia/Kathmandu] ---- @@ -5809,11 +6508,12 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: Examples: >>> from datetime import date - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"date": [date(2001, 1, 1), None, date(2001, 1, 3)]} >>> df_pd = pd.DataFrame(data, dtype="datetime64[ns]") >>> df_pl = pl.DataFrame(data) @@ -5821,21 +6521,23 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_dt_timestamp(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.col("date").dt.timestamp().alias("timestamp_us"), ... nw.col("date").dt.timestamp("ms").alias("timestamp_ms"), ... 
).to_native() - We can then pass pandas / PyArrow / Polars / any other supported library: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dt_timestamp`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_dt_timestamp(df_pd) date timestamp_us timestamp_ms 0 2001-01-01 9.783072e+14 9.783072e+11 1 NaT NaN NaN 2 2001-01-03 9.784800e+14 9.784800e+11 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_dt_timestamp(df_pl) shape: (3, 3) ┌────────────┬─────────────────┬──────────────┐ │ date ┆ timestamp_us ┆ timestamp_ms │ @@ -5846,7 +6548,8 @@ def timestamp(self: Self, time_unit: Literal["ns", "us", "ms"] = "us") -> ExprT: │ null ┆ null ┆ null │ │ 2001-01-03 ┆ 978480000000000 ┆ 978480000000 │ └────────────┴─────────────────┴──────────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_dt_timestamp(df_pa) pyarrow.Table date: date32[day] timestamp_us: int64 @@ -5883,27 +6586,33 @@ def keep(self: Self) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_keep(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select( - ... nw.col("foo").alias("alias_for_foo").name.keep() - ... ).to_native() + ... return df.select(nw.col("foo").alias("alias_for_foo").name.keep()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_keep`: - We can then pass either pandas or Polars to `func`: + >>> agnostic_name_keep(df_pd) + ['foo'] + + >>> agnostic_name_keep(df_pl) + ['foo'] - >>> my_library_agnostic_function(df_pd).columns - Index(['foo'], dtype='object') - >>> my_library_agnostic_function(df_pl).columns + >>> agnostic_name_keep(df_pa) ['foo'] """ return self._expr.__class__( @@ -5925,26 +6634,34 @@ def map(self: Self, function: Callable[[str], str]) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: >>> renaming_func = lambda s: s[::-1] # reverse column name - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_map(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.map(renaming_func)).to_native() + ... 
return df.select(nw.col("foo", "BAR").name.map(renaming_func)).columns - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_map`: - >>> my_library_agnostic_function(df_pd).columns - Index(['oof', 'RAB'], dtype='object') - >>> my_library_agnostic_function(df_pl).columns + >>> agnostic_name_map(df_pd) + ['oof', 'RAB'] + + >>> agnostic_name_map(df_pl) + ['oof', 'RAB'] + + >>> agnostic_name_map(df_pa) ['oof', 'RAB'] """ return self._expr.__class__( @@ -5966,26 +6683,33 @@ def prefix(self: Self, prefix: str) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def add_colname_prefix(df_native: IntoFrameT, prefix: str) -> IntoFrameT: + >>> def agnostic_name_prefix(df_native: IntoFrame, prefix: str) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.prefix(prefix)).to_native() + ... return df.select(nw.col("foo", "BAR").name.prefix(prefix)).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_prefix`: - We can then pass either pandas or Polars to `func`: + >>> agnostic_name_prefix(df_pd, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] - >>> add_colname_prefix(df_pd, "with_prefix_").columns - Index(['with_prefix_foo', 'with_prefix_BAR'], dtype='object') + >>> agnostic_name_prefix(df_pl, "with_prefix_") + ['with_prefix_foo', 'with_prefix_BAR'] - >>> add_colname_prefix(df_pl, "with_prefix_").columns + >>> agnostic_name_prefix(df_pa, "with_prefix_") ['with_prefix_foo', 'with_prefix_BAR'] """ return self._expr.__class__( @@ -6007,25 +6731,33 @@ def suffix(self: Self, suffix: str) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def add_colname_suffix(df_native: IntoFrameT, suffix: str) -> IntoFrameT: + >>> def agnostic_name_suffix(df_native: IntoFrame, suffix: str) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.suffix(suffix)).to_native() + ... 
return df.select(nw.col("foo", "BAR").name.suffix(suffix)).columns - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_suffix`: - >>> add_colname_suffix(df_pd, "_with_suffix").columns - Index(['foo_with_suffix', 'BAR_with_suffix'], dtype='object') - >>> add_colname_suffix(df_pl, "_with_suffix").columns + >>> agnostic_name_suffix(df_pd, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] + + >>> agnostic_name_suffix(df_pl, "_with_suffix") + ['foo_with_suffix', 'BAR_with_suffix'] + + >>> agnostic_name_suffix(df_pa, "_with_suffix") ['foo_with_suffix', 'BAR_with_suffix'] """ return self._expr.__class__( @@ -6044,25 +6776,33 @@ def to_lowercase(self: Self) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def to_lower(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_to_lowercase(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.to_lowercase()).to_native() + ... return df.select(nw.col("foo", "BAR").name.to_lowercase()).columns - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_to_lowercase`: + + >>> agnostic_name_to_lowercase(df_pd) + ['foo', 'bar'] - >>> to_lower(df_pd).columns - Index(['foo', 'bar'], dtype='object') - >>> to_lower(df_pl).columns + >>> agnostic_name_to_lowercase(df_pl) + ['foo', 'bar'] + + >>> agnostic_name_to_lowercase(df_pa) ['foo', 'bar'] """ return self._expr.__class__( @@ -6081,24 +6821,33 @@ def to_uppercase(self: Self) -> ExprT: expression in a chain. Only one name operation per expression will work. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> >>> data = {"foo": [1, 2], "BAR": [4, 5]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def to_upper(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_name_to_uppercase(df_native: IntoFrame) -> list[str]: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo", "BAR").name.to_uppercase()).to_native() + ... return df.select(nw.col("foo", "BAR").name.to_uppercase()).columns + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_name_to_uppercase`: + + >>> agnostic_name_to_uppercase(df_pd) + ['FOO', 'BAR'] + + >>> agnostic_name_to_uppercase(df_pl) + ['FOO', 'BAR'] - We can then pass either pandas or Polars to `func`: - >>> to_upper(df_pd).columns - Index(['FOO', 'BAR'], dtype='object') - >>> to_upper(df_pl).columns + >>> agnostic_name_to_uppercase(df_pa) ['FOO', 'BAR'] """ return self._expr.__class__( @@ -6119,11 +6868,12 @@ def len(self: Self) -> ExprT: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [[1, 2], [3, 4, None], None, []]} Let's define a dataframe-agnostic function: @@ -6185,23 +6935,27 @@ def col(*names: str | Iterable[str]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [3, 4]}) + >>> + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_col(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.col("a") * nw.col("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_col`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_col(df_pd) a 0 3 1 8 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_col(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -6211,7 +6965,8 @@ def col(*names: str | Iterable[str]) -> Expr: │ 3 │ │ 8 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_col(df_pa) pyarrow.Table a: int64 ---- @@ -6243,6 +6998,7 @@ def nth(*indices: int | Sequence[int]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -6250,17 +7006,18 @@ def nth(*indices: int | Sequence[int]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_nth(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.nth(0) * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_nth`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_nth(df_pd) a 0 2 1 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_nth(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -6270,7 +7027,8 @@ def nth(*indices: int | Sequence[int]) -> Expr: │ 2 │ │ 4 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_nth(df_pa) pyarrow.Table a: int64 ---- @@ -6296,24 +7054,28 @@ def all_() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.all() * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all(df_pd) a b 0 2 8 1 4 10 2 6 12 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_all(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -6324,7 +7086,8 @@ def all_() -> Expr: │ 4 ┆ 10 │ │ 6 ┆ 12 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_all(df_pa) pyarrow.Table a: int64 b: int64 @@ -6348,22 +7111,25 @@ def len_() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.len()).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_len`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_len(df_pd) len 0 2 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_len(df_pl) shape: (1, 1) ┌─────┐ │ len │ @@ -6372,7 +7138,7 @@ def len_() -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_len(df_pa) pyarrow.Table len: int64 ---- @@ -6403,22 +7169,26 @@ def sum(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.sum("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum(df_pd) a 0 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6427,7 +7197,8 @@ def sum(*columns: str) -> Expr: ╞═════╡ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum(df_pa) pyarrow.Table a: int64 ---- @@ -6454,22 +7225,26 @@ def mean(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 8, 3]}) - >>> df_pd = pd.DataFrame({"a": [1, 8, 3]}) - >>> df_pa = pa.table({"a": [1, 8, 3]}) + >>> + >>> data = {"a": [1, 8, 3]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.mean("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_mean(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6478,7 +7253,8 @@ def mean(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_mean(df_pa) pyarrow.Table a: double ---- @@ -6492,7 +7268,8 @@ def median(*columns: str) -> Expr: Notes: - Syntactic sugar for ``nw.col(columns).median()`` - - Results might slightly differ across backends due to differences in the underlying algorithms used to compute the median. + - Results might slightly differ across backends due to differences in the + underlying algorithms used to compute the median. Arguments: columns: Name(s) of the columns to use in the aggregation function @@ -6506,22 +7283,26 @@ def median(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [4, 5, 2]}) - >>> df_pl = pl.DataFrame({"a": [4, 5, 2]}) - >>> df_pa = pa.table({"a": [4, 5, 2]}) + >>> + >>> data = {"a": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.median("a")).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_median(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_median(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6530,7 +7311,8 @@ def median(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_median(df_pa) pyarrow.Table a: double ---- @@ -6557,22 +7339,26 @@ def min(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.min("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_min(df_pd) b 0 5 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_min(df_pl) shape: (1, 1) ┌─────┐ │ b │ @@ -6581,7 +7367,8 @@ def min(*columns: str) -> Expr: ╞═════╡ │ 5 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_min(df_pa) pyarrow.Table b: int64 ---- @@ -6608,22 +7395,26 @@ def max(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_max(df_pd) a 0 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_max(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -6632,7 +7423,8 @@ def max(*columns: str) -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_max(df_pa) pyarrow.Table a: int64 ---- @@ -6660,6 +7452,7 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [5, 10, None]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -6667,18 +7460,19 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.sum_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_sum_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum_horizontal(df_pd) a 0 6.0 1 12.0 2 3.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -6689,7 +7483,8 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 12 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum_horizontal(df_pa) pyarrow.Table a: int64 ---- @@ -6719,11 +7514,12 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -6733,18 +7529,20 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal min of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.min_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_min_horizontal(pd.DataFrame(data)) a 0 1.0 1 5.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_min_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -6755,7 +7553,8 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 5 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_min_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -6785,11 +7584,12 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. 
Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -6799,18 +7599,20 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal max of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.max_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_max_horizontal(pd.DataFrame(data)) a 0 4.0 1 8.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_max_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -6821,7 +7623,8 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 8 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_max_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -6840,6 +7643,9 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: class When: def __init__(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> None: self._predicates = flatten([predicates]) + if not self._predicates: + msg = "At least one predicate needs to be provided to `narwhals.when`." + raise TypeError(msg) def _extract_predicates(self, plx: Any) -> Any: return [extract_compliant(plx, v) for v in self._predicates] @@ -6875,9 +7681,9 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: If not appended, and none of the conditions are `True`, `None` will be returned. Arguments: - predicates: Condition(s) that must be met in order to apply the subsequent statement. - Accepts one or more boolean expressions, which are implicitly combined with `&`. - String input is parsed as a column name. + predicates: Condition(s) that must be met in order to apply the subsequent + statement. Accepts one or more boolean expressions, which are implicitly + combined with `&`. String input is parsed as a column name. Returns: A "when" object, which `.then` can be called on. @@ -6888,26 +7694,30 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [5, 10, 15]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [5, 10, 15]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_when_then_otherwise(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when") ... 
).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_when_then_otherwise`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_when_then_otherwise(df_pd) a b a_when 0 1 5 5 1 2 10 5 2 3 15 6 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_when_then_otherwise(df_pl) shape: (3, 3) ┌─────┬─────┬────────┐ │ a ┆ b ┆ a_when │ @@ -6918,7 +7728,8 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: │ 2 ┆ 10 ┆ 5 │ │ 3 ┆ 15 ┆ 6 │ └─────┴─────┴────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_when_then_otherwise(df_pa) pyarrow.Table a: int64 b: int64 @@ -6935,7 +7746,8 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise AND horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. @@ -6946,6 +7758,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], @@ -6956,13 +7769,14 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", all=nw.all_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all_horizontal(df_pd) a b all 0 False False False 1 False True False @@ -6971,7 +7785,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: 4 False False 5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_all_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ all │ @@ -6986,7 +7800,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_all_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -7006,12 +7820,13 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: ) -def lit(value: Any, dtype: DType | None = None) -> Expr: +def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: """Return an expression representing a literal value. Arguments: value: The value to use as literal. - dtype: The data type of the literal value. If not provided, the data type will be inferred. + dtype: The data type of the literal value. If not provided, the data type will + be inferred. Returns: A new expression. 
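Since `lit` now annotates `dtype` as `DType | type[DType] | None`, the dtype can be given either as an instance or as the bare class. A minimal doctest-style sketch of the two spellings (assuming narwhals' usual dtype names such as `nw.Int16`; the frame below is illustrative, not part of the patch):

    >>> import pandas as pd
    >>> import narwhals as nw
    >>> df = pd.DataFrame({"a": [1, 2]})
    >>> # dtype passed as a class, exercising the new `type[DType]` branch
    >>> nw.from_native(df).with_columns(nw.lit(3, dtype=nw.Int16)).to_native()
       a  literal
    0  1        3
    1  2        3
    >>> # dtype passed as an instance behaves identically
    >>> nw.from_native(df).with_columns(nw.lit(3, dtype=nw.Int16())).to_native()
       a  literal
    0  1        3
    1  2        3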
@@ -7022,23 +7837,27 @@ def lit(value: Any, dtype: DType | None = None) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_lit(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(nw.lit(3)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_lit`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_lit(df_pd) a literal 0 1 3 1 2 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_lit(df_pl) shape: (2, 2) ┌─────┬─────────┐ │ a ┆ literal │ @@ -7048,7 +7867,8 @@ def lit(value: Any, dtype: DType | None = None) -> Expr: │ 1 ┆ 3 │ │ 2 ┆ 3 │ └─────┴─────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_lit(df_pa) pyarrow.Table a: int64 literal: int64 @@ -7074,7 +7894,8 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise OR horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. @@ -7085,6 +7906,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], @@ -7095,13 +7917,14 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_any_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", any=nw.any_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_any_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_any_horizontal(df_pd) a b any 0 False False False 1 False True True @@ -7110,7 +7933,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: 4 False 5 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_any_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ any │ @@ -7125,7 +7948,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_any_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -7161,6 +7984,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... 
"b": [4, 5, None], @@ -7173,19 +7997,20 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal mean of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.mean_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean_horizontal(df_pd) a 0 2.5 1 6.5 2 3.0 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_mean_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -7197,7 +8022,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 3.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_mean_horizontal(df_pa) pyarrow.Table a: double ---- @@ -7236,11 +8061,12 @@ def concat_str( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 2, 3], ... "b": ["dogs", "cats", None], @@ -7250,7 +8076,7 @@ def concat_str( We define a dataframe-agnostic function that computes the horizontal string concatenation of different columns - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_concat_str(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.concat_str( @@ -7263,15 +8089,16 @@ def concat_str( ... ).alias("full_sentence") ... 
).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_concat_str`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_concat_str(pd.DataFrame(data)) full_sentence 0 2 dogs play 1 4 cats swim 2 None - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_concat_str(pl.DataFrame(data)) shape: (3, 1) ┌───────────────┐ │ full_sentence │ @@ -7283,7 +8110,7 @@ def concat_str( │ null │ └───────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_concat_str(pa.table(data)) pyarrow.Table full_sentence: string ---- diff --git a/narwhals/functions.py b/narwhals/functions.py index 75cd9000e..ed167fb0d 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -1102,6 +1102,7 @@ def _scan_csv_impl( Implementation.MODIN, Implementation.CUDF, Implementation.DASK, + Implementation.DUCKDB, ): native_frame = native_namespace.read_csv(source, **kwargs) elif implementation is Implementation.PYARROW: @@ -1190,6 +1191,7 @@ def _read_parquet_impl( Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF, + Implementation.DUCKDB, ): native_frame = native_namespace.read_parquet(source, **kwargs) elif implementation is Implementation.PYARROW: @@ -1273,6 +1275,7 @@ def _scan_parquet_impl( Implementation.MODIN, Implementation.CUDF, Implementation.DASK, + Implementation.DUCKDB, ): native_frame = native_namespace.read_parquet(source, **kwargs) elif implementation is Implementation.PYARROW: diff --git a/narwhals/series.py b/narwhals/series.py index c3e6f181b..8385b43ad 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -2605,7 +2605,10 @@ def fill_null( ) def is_between( - self, lower_bound: Any | Self, upper_bound: Any | Self, closed: str = "both" + self: Self, + lower_bound: Any | Self, + upper_bound: Any | Self, + closed: Literal["left", "right", "none", "both"] = "both", ) -> Self: """Get a boolean mask of the values that are between the given lower/upper bounds. @@ -4738,6 +4741,101 @@ def __iter__(self: Self) -> Iterator[Any]: def __contains__(self: Self, other: Any) -> bool: return self._compliant_series.__contains__(other) # type: ignore[no-any-return] + def rank( + self: Self, + method: Literal["average", "min", "max", "dense", "ordinal"] = "average", + *, + descending: bool = False, + ) -> Self: + """Assign ranks to data, dealing with ties appropriately. + + Notes: + The resulting dtype may differ between backends. + + Arguments: + method: The method used to assign ranks to tied elements. + The following methods are available (default is 'average'): + + - 'average' : The average of the ranks that would have been assigned to + all the tied values is assigned to each value. + - 'min' : The minimum of the ranks that would have been assigned to all + the tied values is assigned to each value. (This is also referred to + as "competition" ranking.) + - 'max' : The maximum of the ranks that would have been assigned to all + the tied values is assigned to each value. + - 'dense' : Like 'min', but the rank of the next highest element is + assigned the rank immediately after those assigned to the tied + elements. + - 'ordinal' : All values are given a distinct rank, corresponding to the + order that the values occur in the Series. + + descending: Rank in descending order. + + Returns: + A new series with rank data as values. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> + >>> data = [3, 6, 1, 1, 6] + + We define a dataframe-agnostic function that computes the dense rank for + the data: + + >>> def agnostic_dense_rank(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rank(method="dense").to_native() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_dense_rank`: + + >>> agnostic_dense_rank(pd.Series(data)) + 0 2.0 + 1 3.0 + 2 1.0 + 3 1.0 + 4 3.0 + dtype: float64 + + >>> agnostic_dense_rank(pl.Series(data)) # doctest:+NORMALIZE_WHITESPACE + shape: (5,) + Series: '' [u32] + [ + 2 + 3 + 1 + 1 + 3 + ] + + >>> agnostic_dense_rank(pa.chunked_array([data])) # doctest:+ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + 2, + 3, + 1, + 1, + 3 + ] + ] + """ + supported_rank_methods = {"average", "min", "max", "dense", "ordinal"} + if method not in supported_rank_methods: + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. " + f"Found '{method}'" + ) + raise ValueError(msg) + + return self._from_compliant_series( + self._compliant_series.rank(method=method, descending=descending) + ) + @property def str(self: Self) -> SeriesStringNamespace[Self]: return SeriesStringNamespace(self) diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index ba5117425..cb5d2006c 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -2353,24 +2353,28 @@ def all() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.all() * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all(df_pd) a b 0 2 8 1 4 10 2 6 12 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_all(df_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -2381,7 +2385,8 @@ def all() -> Expr: │ 4 ┆ 10 │ │ 6 ┆ 12 │ └─────┴─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_all(df_pa) pyarrow.Table a: int64 b: int64 @@ -2407,23 +2412,27 @@ def col(*names: str | Iterable[str]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [3, 4]}) + >>> + >>> data = {"a": [1, 2], "b": [3, 4]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_col(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.col("a") * nw.col("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_col`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_col(df_pd) a 0 3 1 8 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_col(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -2433,7 +2442,8 @@ def col(*names: str | Iterable[str]) -> Expr: │ 3 │ │ 8 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_col(df_pa) pyarrow.Table a: int64 ---- @@ -2461,6 +2471,7 @@ def nth(*indices: int | Sequence[int]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2], "b": [3, 4]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -2468,17 +2479,18 @@ def nth(*indices: int | Sequence[int]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_nth(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.nth(0) * 2).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_nth`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_nth(df_pd) a 0 2 1 4 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_nth(df_pl) shape: (2, 1) ┌─────┐ │ a │ @@ -2488,7 +2500,8 @@ def nth(*indices: int | Sequence[int]) -> Expr: │ 2 │ │ 4 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_nth(df_pa) pyarrow.Table a: int64 ---- @@ -2509,22 +2522,25 @@ def len() -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_len(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.len()).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_len`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_len(df_pd) len 0 2 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_len(df_pl) shape: (1, 1) ┌─────┐ │ len │ @@ -2533,7 +2549,7 @@ def len() -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_len(df_pa) pyarrow.Table len: int64 ---- @@ -2542,12 +2558,13 @@ def len() -> Expr: return _stableify(nw.len()) -def lit(value: Any, dtype: DType | None = None) -> Expr: +def lit(value: Any, dtype: DType | type[DType] | None = None) -> Expr: """Return an expression representing a literal value. Arguments: value: The value to use as literal. - dtype: The data type of the literal value. If not provided, the data type will be inferred. + dtype: The data type of the literal value. If not provided, the data type will + be inferred. Returns: A new expression. 
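The `DType | type[DType] | None` widening is mirrored here in `narwhals.stable.v1`, so code pinned to the stable API gains the class spelling as well; a brief sketch under the same assumptions as above (the `nw_v1` alias and variable names are illustrative only):

    >>> import narwhals.stable.v1 as nw_v1
    >>> lit_class = nw_v1.lit(3, dtype=nw_v1.Int16)  # dtype class accepted
    >>> lit_instance = nw_v1.lit(3, dtype=nw_v1.Int16())  # equivalent instance form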
@@ -2558,23 +2575,27 @@ def lit(value: Any, dtype: DType | None = None) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_lit(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns(nw.lit(3)).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_lit`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_lit(df_pd) a literal 0 1 3 1 2 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_lit(df_pl) shape: (2, 2) ┌─────┬─────────┐ │ a ┆ literal │ @@ -2584,7 +2605,8 @@ def lit(value: Any, dtype: DType | None = None) -> Expr: │ 1 ┆ 3 │ │ 2 ┆ 3 │ └─────┴─────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_lit(df_pa) pyarrow.Table a: int64 literal: int64 @@ -2613,22 +2635,26 @@ def min(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.min("b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_min(df_pd) b 0 5 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_min(df_pl) shape: (1, 1) ┌─────┐ │ b │ @@ -2637,7 +2663,8 @@ def min(*columns: str) -> Expr: ╞═════╡ │ 5 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_min(df_pa) pyarrow.Table b: int64 ---- @@ -2664,22 +2691,26 @@ def max(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pl = pl.DataFrame({"a": [1, 2], "b": [5, 10]}) - >>> df_pa = pa.table({"a": [1, 2], "b": [5, 10]}) + >>> + >>> data = {"a": [1, 2], "b": [5, 10]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_max(df_pd) a 0 2 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_max(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2688,7 +2719,8 @@ def max(*columns: str) -> Expr: ╞═════╡ │ 2 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_max(df_pa) pyarrow.Table a: int64 ---- @@ -2715,22 +2747,26 @@ def mean(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 8, 3]}) - >>> df_pd = pd.DataFrame({"a": [1, 8, 3]}) - >>> df_pa = pa.table({"a": [1, 8, 3]}) + >>> + >>> data = {"a": [1, 8, 3]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.mean("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_mean(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2739,7 +2775,8 @@ def mean(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_mean(df_pa) pyarrow.Table a: double ---- @@ -2753,7 +2790,8 @@ def median(*columns: str) -> Expr: Notes: - Syntactic sugar for ``nw.col(columns).median()`` - - Results might slightly differ across backends due to differences in the underlying algorithms used to compute the median. + - Results might slightly differ across backends due to differences in the + underlying algorithms used to compute the median. Arguments: columns: Name(s) of the columns to use in the aggregation function @@ -2767,22 +2805,26 @@ def median(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pd = pd.DataFrame({"a": [4, 5, 2]}) - >>> df_pl = pl.DataFrame({"a": [4, 5, 2]}) - >>> df_pa = pa.table({"a": [4, 5, 2]}) + >>> + >>> data = {"a": [4, 5, 2]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) Let's define a dataframe agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_median(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.median("a")).to_native() - We can then pass any supported library such as pandas, Polars, or PyArrow to `func`: + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_median`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_median(df_pd) a 0 4.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_median(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2791,7 +2833,8 @@ def median(*columns: str) -> Expr: ╞═════╡ │ 4.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_median(df_pa) pyarrow.Table a: double ---- @@ -2818,22 +2861,26 @@ def sum(*columns: str) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2]}) - >>> df_pd = pd.DataFrame({"a": [1, 2]}) - >>> df_pa = pa.table({"a": [1, 2]}) + >>> + >>> data = {"a": [1, 2]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.sum("a")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_sum`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum(df_pd) a 0 3 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum(df_pl) shape: (1, 1) ┌─────┐ │ a │ @@ -2842,7 +2889,8 @@ def sum(*columns: str) -> Expr: ╞═════╡ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum(df_pa) pyarrow.Table a: int64 ---- @@ -2870,6 +2918,7 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = {"a": [1, 2, 3], "b": [5, 10, None]} >>> df_pl = pl.DataFrame(data) >>> df_pd = pd.DataFrame(data) @@ -2877,18 +2926,19 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_sum_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.sum_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to `agnostic_sum_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_sum_horizontal(df_pd) a 0 6.0 1 12.0 2 3.0 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_sum_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -2899,7 +2949,8 @@ def sum_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 12 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_sum_horizontal(df_pa) pyarrow.Table a: int64 ---- @@ -2912,7 +2963,8 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise AND horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. 
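One point worth noting before the examples in the next hunk: when nulls are involved, the result depends on the backend's boolean semantics. Polars combines values with Kleene logic, as its doctest output below shows, while the pandas output renders missing results as `<NA>`. A small sketch of the Polars case, reusing the docstring's own data and nothing beyond the public API used throughout:

import polars as pl
import narwhals as nw

data = {
    "a": [False, False, True, True, False, None],
    "b": [False, True, True, None, None, None],
}
df = nw.from_native(pl.DataFrame(data))
# Kleene logic on the null-containing rows:
#   True  & null -> null  (row 3)
#   False & null -> False (row 4)
#   null  & null -> null  (row 5)
print(df.select(all=nw.all_horizontal("a", "b")).to_native())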
@@ -2923,6 +2975,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], ... } >>> df_pl = pl.DataFrame(data) @@ -2933,13 +2986,14 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_all_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", all=nw.all_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_all_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_all_horizontal(df_pd) a b all 0 False False False 1 False True False 2 True True True 3 True <NA> False 4 False <NA> False 5 <NA> <NA> <NA> - >>> my_library_agnostic_function(df_pl) + >>> agnostic_all_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ all │ @@ -2963,7 +3017,7 @@ def all_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_all_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -2980,7 +3034,8 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: r"""Compute the bitwise OR horizontally across columns. Arguments: - exprs: Name(s) of the columns to use in the aggregation function. Accepts expression input. + exprs: Name(s) of the columns to use in the aggregation function. Accepts + expression input. Returns: A new expression. @@ -2991,6 +3046,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [False, False, True, True, False, None], ... "b": [False, True, True, None, None, None], ... } >>> df_pl = pl.DataFrame(data) @@ -3001,13 +3057,14 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_any_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select("a", "b", any=nw.any_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_any_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_any_horizontal(df_pd) a b any 0 False False False 1 False True True 2 True True True 3 True <NA> <NA> 4 False <NA> <NA> 5 <NA> <NA> <NA> - >>> my_library_agnostic_function(df_pl) + >>> agnostic_any_horizontal(df_pl) shape: (6, 3) ┌───────┬───────┬───────┐ │ a ┆ b ┆ any │ @@ -3031,7 +3088,7 @@ def any_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ null ┆ null ┆ null │ └───────┴───────┴───────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_any_horizontal(df_pa) pyarrow.Table a: bool b: bool @@ -3060,6 +3117,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ...
"a": [1, 8, 3], ... "b": [4, 5, None], @@ -3072,19 +3130,20 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal mean of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_mean_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.mean_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_mean_horizontal`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_mean_horizontal(df_pd) a 0 2.5 1 6.5 2 3.0 - >>> my_library_agnostic_function(df_pl) + >>> agnostic_mean_horizontal(df_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -3096,7 +3155,7 @@ def mean_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 3.0 │ └─────┘ - >>> my_library_agnostic_function(df_pa) + >>> agnostic_mean_horizontal(df_pa) pyarrow.Table a: double ---- @@ -3119,11 +3178,12 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -3133,18 +3193,20 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal min of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_min_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select(nw.min_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_min_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_min_horizontal(pd.DataFrame(data)) a 0 1.0 1 5.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_min_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -3155,7 +3217,8 @@ def min_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 5 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_min_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -3178,11 +3241,12 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 8, 3], ... "b": [4, 5, None], @@ -3192,18 +3256,20 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: We define a dataframe-agnostic function that computes the horizontal max of "a" and "b" columns: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_max_horizontal(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... 
return df.select(nw.max_horizontal("a", "b")).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_max_horizontal`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_max_horizontal(pd.DataFrame(data)) a 0 4.0 1 8.0 2 3.0 - >>> my_library_agnostic_function(pl.DataFrame(data)) + + >>> agnostic_max_horizontal(pl.DataFrame(data)) shape: (3, 1) ┌─────┐ │ a │ @@ -3214,7 +3280,8 @@ def max_horizontal(*exprs: IntoExpr | Iterable[IntoExpr]) -> Expr: │ 8 │ │ 3 │ └─────┘ - >>> my_library_agnostic_function(pa.table(data)) + + >>> agnostic_max_horizontal(pa.table(data)) pyarrow.Table a: int64 ---- @@ -3405,11 +3472,12 @@ def concat_str( A new expression. Examples: - >>> import narwhals as nw - >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> >>> data = { ... "a": [1, 2, 3], ... "b": ["dogs", "cats", None], @@ -3419,7 +3487,7 @@ def concat_str( We define a dataframe-agnostic function that computes the horizontal string concatenation of different columns - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_concat_str(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.select( ... nw.concat_str( @@ -3432,15 +3500,16 @@ def concat_str( ... ).alias("full_sentence") ... ).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow + to `agnostic_concat_str`: - >>> my_library_agnostic_function(pd.DataFrame(data)) + >>> agnostic_concat_str(pd.DataFrame(data)) full_sentence 0 2 dogs play 1 4 cats swim 2 None - >>> my_library_agnostic_function(pl.DataFrame(data)) + >>> agnostic_concat_str(pl.DataFrame(data)) shape: (3, 1) ┌───────────────┐ │ full_sentence │ @@ -3452,7 +3521,7 @@ def concat_str( │ null │ └───────────────┘ - >>> my_library_agnostic_function(pa.table(data)) + >>> agnostic_concat_str(pa.table(data)) pyarrow.Table full_sentence: string ---- @@ -3495,9 +3564,9 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: If not appended, and none of the conditions are `True`, `None` will be returned. Arguments: - predicates: Condition(s) that must be met in order to apply the subsequent statement. - Accepts one or more boolean expressions, which are implicitly combined with `&`. - String input is parsed as a column name. + predicates: Condition(s) that must be met in order to apply the subsequent + statement. Accepts one or more boolean expressions, which are implicitly + combined with `&`. String input is parsed as a column name. Returns: A "when" object, which `.then` can be called on. 
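Because multiple predicates are implicitly combined with `&`, the two spellings below should be interchangeable; a brief sketch using only the API from the surrounding examples (the `flag` alias is illustrative):

import pandas as pd
import narwhals as nw

df = nw.from_native(pd.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}))
# Several predicates passed to `nw.when` ...
implicit = df.with_columns(
    nw.when(nw.col("a") > 1, nw.col("b") < 15).then(0).otherwise(1).alias("flag")
)
# ... behave like a single predicate combined with `&` by hand.
explicit = df.with_columns(
    nw.when((nw.col("a") > 1) & (nw.col("b") < 15)).then(0).otherwise(1).alias("flag")
)
assert implicit.to_native().equals(explicit.to_native())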
@@ -3508,26 +3577,30 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT - >>> df_pl = pl.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pd = pd.DataFrame({"a": [1, 2, 3], "b": [5, 10, 15]}) - >>> df_pa = pa.table({"a": [1, 2, 3], "b": [5, 10, 15]}) + >>> + >>> data = {"a": [1, 2, 3], "b": [5, 10, 15]} + >>> df_pl = pl.DataFrame(data) + >>> df_pd = pd.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_when_then_otherwise(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... nw.when(nw.col("a") < 3).then(5).otherwise(6).alias("a_when") ... ).to_native() - We can pass any supported library such as Pandas, Polars, or PyArrow to `func`: + We can pass any supported library such as Pandas, Polars, or PyArrow to + `agnostic_when_then_otherwise`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_when_then_otherwise(df_pd) a b a_when 0 1 5 5 1 2 10 5 2 3 15 6 - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_when_then_otherwise(df_pl) shape: (3, 3) ┌─────┬─────┬────────┐ │ a ┆ b ┆ a_when │ @@ -3538,7 +3611,8 @@ def when(*predicates: IntoExpr | Iterable[IntoExpr]) -> When: │ 2 ┆ 10 ┆ 5 │ │ 3 ┆ 15 ┆ 6 │ └─────┴─────┴────────┘ - >>> my_library_agnostic_function(df_pa) + + >>> agnostic_when_then_otherwise(df_pa) pyarrow.Table a: int64 b: int64 diff --git a/narwhals/translate.py b/narwhals/translate.py index 8542a62f0..9ad868016 100644 --- a/narwhals/translate.py +++ b/narwhals/translate.py @@ -684,7 +684,10 @@ def _from_native_impl( # noqa: PLR0915 msg = "Cannot only use `eager_only` or `eager_or_interchange_only` with dask DataFrame" raise TypeError(msg) return native_object - if get_dask_expr() is None: # pragma: no cover + if ( + parse_version(get_dask().__version__) <= (2024, 12, 1) + and get_dask_expr() is None + ): # pragma: no cover msg = "Please install dask-expr" raise ImportError(msg) return LazyFrame( @@ -698,20 +701,32 @@ def _from_native_impl( # noqa: PLR0915 # DuckDB elif is_duckdb_relation(native_object): - from narwhals._duckdb.dataframe import DuckDBInterchangeFrame + from narwhals._duckdb.dataframe import DuckDBLazyFrame if eager_only or series_only: # pragma: no cover if not pass_through: msg = ( "Cannot only use `series_only=True` or `eager_only=False` " - "with DuckDB Relation" + "with DuckDBPyRelation" ) else: return native_object raise TypeError(msg) - return DataFrame( - DuckDBInterchangeFrame(native_object, version=version), - level="interchange", + import duckdb # ignore-banned-import + + backend_version = parse_version(duckdb.__version__) + if version is Version.V1: + return DataFrame( + DuckDBLazyFrame( + native_object, backend_version=backend_version, version=version + ), + level="interchange", + ) + return LazyFrame( + DuckDBLazyFrame( + native_object, backend_version=backend_version, version=version + ), + level="full", ) # Ibis @@ -726,8 +741,13 @@ def _from_native_impl( # noqa: PLR0915 ) raise TypeError(msg) return native_object + import ibis # ignore-banned-import + + backend_version = parse_version(ibis.__version__) return DataFrame( - IbisInterchangeFrame(native_object, version=version), + IbisInterchangeFrame( + native_object, version=version, backend_version=backend_version + ), level="interchange", ) diff --git a/narwhals/typing.py b/narwhals/typing.py 
index ff29cb57e..859e98dff 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -81,10 +81,22 @@ def __narwhals_namespace__(self) -> CompliantNamespace[CompliantSeriesT_co]: ... def is_null(self) -> Self: ... def alias(self, name: str) -> Self: ... def cast(self, dtype: DType) -> Self: ... + def __and__(self, other: Any) -> Self: ... + def __or__(self, other: Any) -> Self: ... + def __add__(self, other: Any) -> Self: ... + def __sub__(self, other: Any) -> Self: ... + def __mul__(self, other: Any) -> Self: ... + def __floordiv__(self, other: Any) -> Self: ... + def __truediv__(self, other: Any) -> Self: ... + def __mod__(self, other: Any) -> Self: ... + def __pow__(self, other: Any) -> Self: ... class CompliantNamespace(Protocol, Generic[CompliantSeriesT_co]): def col(self, *column_names: str) -> CompliantExpr[CompliantSeriesT_co]: ... + def lit( + self, value: Any, dtype: DType | None + ) -> CompliantExpr[CompliantSeriesT_co]: ... IntoExpr: TypeAlias = Union["Expr", str, "Series[Any]"] diff --git a/narwhals/utils.py b/narwhals/utils.py index 2125d46c4..509a0e36a 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -16,6 +16,8 @@ from narwhals.dependencies import get_cudf from narwhals.dependencies import get_dask_dataframe +from narwhals.dependencies import get_duckdb +from narwhals.dependencies import get_ibis from narwhals.dependencies import get_modin from narwhals.dependencies import get_pandas from narwhals.dependencies import get_polars @@ -73,6 +75,10 @@ class Implementation(Enum): """Polars implementation.""" DASK = auto() """Dask implementation.""" + DUCKDB = auto() + """DuckDB implementation.""" + IBIS = auto() + """Ibis implementation.""" UNKNOWN = auto() """Unknown implementation.""" @@ -97,6 +103,8 @@ def from_native_namespace( get_pyspark_sql(): Implementation.PYSPARK, get_polars(): Implementation.POLARS, get_dask_dataframe(): Implementation.DASK, + get_duckdb(): Implementation.DUCKDB, + get_ibis(): Implementation.IBIS, } return mapping.get(native_namespace, Implementation.UNKNOWN) @@ -147,7 +155,11 @@ def is_pandas_like(self) -> bool: >>> df.implementation.is_pandas_like() True """ - return self in {Implementation.PANDAS, Implementation.MODIN, Implementation.CUDF} + return self in { + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + } def is_polars(self) -> bool: """Return whether implementation is Polars. @@ -245,6 +257,59 @@ def is_dask(self) -> bool: """ return self is Implementation.DASK # pragma: no cover + def is_duckdb(self) -> bool: + """Return whether implementation is DuckDB. + + Returns: + Boolean. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"a": [1, 2, 3]}) + >>> df = nw.from_native(df_native) + >>> df.implementation.is_duckdb() + False + """ + return self is Implementation.DUCKDB # pragma: no cover + + def is_ibis(self) -> bool: + """Return whether implementation is Ibis. + + Returns: + Boolean. 
+ + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> df_native = pl.DataFrame({"a": [1, 2, 3]}) + >>> df = nw.from_native(df_native) + >>> df.implementation.is_ibis() + False + """ + return self is Implementation.IBIS # pragma: no cover + + +MIN_VERSIONS: dict[Implementation, tuple[int, ...]] = { + Implementation.PANDAS: (0, 25, 3), + Implementation.MODIN: (0, 25, 3), + Implementation.CUDF: (24, 10), + Implementation.PYARROW: (11,), + Implementation.PYSPARK: (3, 5), + Implementation.POLARS: (0, 20, 3), + Implementation.DASK: (2024, 8), + Implementation.DUCKDB: (1,), + Implementation.IBIS: (6,), +} + + +def validate_backend_version( + implementation: Implementation, backend_version: tuple[int, ...] +) -> None: + if backend_version < (min_version := MIN_VERSIONS[implementation]): + msg = f"Minimum version of {implementation} supported by Narwhals is {min_version}, found: {backend_version}" + raise ValueError(msg) + def import_dtypes_module(version: Version) -> DTypes: if version is Version.V1: @@ -307,7 +372,7 @@ def _is_iterable(arg: Any | Iterable[Any]) -> bool: return isinstance(arg, Iterable) and not isinstance(arg, (str, bytes, Series)) -def parse_version(version: Sequence[str | int]) -> tuple[int, ...]: +def parse_version(version: str) -> tuple[int, ...]: """Simple version parser; split into a tuple of ints for comparison. Arguments: @@ -317,9 +382,10 @@ def parse_version(version: Sequence[str | int]) -> tuple[int, ...]: Parsed version number. """ # lifted from Polars - if isinstance(version, str): # pragma: no cover - version = version.split(".") - return tuple(int(re.sub(r"\D", "", str(v))) for v in version) + # [marco]: Take care of DuckDB pre-releases which end with e.g. `-dev4108` + # and pandas pre-releases which end with e.g. .dev0+618.gb552dc95c9 + version = re.sub(r"(\D?dev.*$)", "", version) + return tuple(int(re.sub(r"\D", "", str(v))) for v in version.split(".")) def isinstance_or_issubclass(obj: Any, cls: Any) -> bool: diff --git a/pyproject.toml b/pyproject.toml index 45aa54ad8..91770923e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.20.1" +version = "1.21.1" dependencies = [] requires-python = ">=3.8" authors = [ @@ -14,22 +14,28 @@ authors = [ description = "Extremely lightweight compatibility layer between dataframe libraries" readme = "README.md" keywords = ["dataframes", "interoperability", "pandas", "polars", "pyarrow", "dask", "modin", "cudf"] -license = {file = "LICENSE.md"} classifiers = [ + "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Operating System :: OS Independent", ] [project.optional-dependencies] -cudf = ["cudf>=24.10.0"] -modin = ["modin"] +# These should be aligned with MIN_VERSIONS in narwhals/utils.py +# Exception: modin, because `modin.__version__` isn't aligned with +# `modin.pandas.__version__`. The latter is the one that we make +# API decisions based on, so that's the one we track internally. 
+# We have yet to determine the minimum Modin version we support +# https://github.com/narwhals-dev/narwhals/issues/817 pandas = ["pandas>=0.25.3"] -polars = ["polars>=0.20.3"] -ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] +modin = ["modin"] +cudf = ["cudf>=24.10.0"] pyarrow = ["pyarrow>=11.0.0"] -dask = ["dask[dataframe]>=2024.7"] +pyspark = ["pyspark>=3.5.0"] +polars = ["polars>=0.20.3"] +dask = ["dask[dataframe]>=2024.8"] duckdb = ["duckdb>=1.0"] -pyspark = ["pyspark>=3.3.0"] +ibis = ["ibis-framework>=6.0.0", "rich", "packaging", "pyarrow_hotfix"] dev = [ "covdefaults", "pre-commit", @@ -110,6 +116,7 @@ lint.ignore = [ "E501", "FIX", "ISC001", + "PD003", "PD010", "PD901", # This is a auxiliary library so dataframe variables have no concrete business meaning "PLR0911", @@ -150,27 +157,23 @@ docstring-code-format = true testpaths = ["tests"] filterwarnings = [ "error", - 'ignore:distutils Version classes are deprecated:DeprecationWarning', - 'ignore:In the future `np.bool`', - 'ignore:make_block is deprecated and will be removed', - 'ignore:np.find_common_type is deprecated', - 'ignore:is_sparse is deprecated and will be removed', - 'ignore:Passing a BlockManager to DataFrame is deprecated', 'ignore:.*defaulting to pandas implementation', 'ignore:.*implementation has mismatches with pandas', - 'ignore:.*Do not use the `random` module inside strategies', 'ignore:.*You are using pyarrow version', - 'ignore:.*but when imported by', - 'ignore:Distributing .*This may take some time', - 'ignore:.*The default coalesce behavior', - 'ignore:is_datetime64tz_dtype is deprecated', + # This warning was temporarily raised by pandas but then reverted. + 'ignore:.*Passing a BlockManager to DataFrame:DeprecationWarning', + # This warning was temporarily raised by Polars but then reverted. + 'ignore:.*The default coalesce behavior of left join will change:DeprecationWarning', 'ignore: unclosed pl.LazyFrame: return pl.LazyFrame(obj) +def duckdb_lazy_constructor(obj: Any) -> duckdb.DuckDBPyRelation: + import duckdb + + _df = pl.LazyFrame(obj) + return duckdb.table("_df") + + def dask_lazy_p1_constructor(obj: Any) -> IntoFrame: # pragma: no cover import dask.dataframe as dd @@ -125,23 +128,23 @@ def pyarrow_table_constructor(obj: Any) -> IntoDataFrame: return pa.table(obj) # type: ignore[no-any-return] -@pytest.fixture(scope="session") -def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover +def pyspark_lazy_constructor() -> Callable[[Any], IntoFrame]: # pragma: no cover try: from pyspark.sql import SparkSession except ImportError: # pragma: no cover pytest.skip("pyspark is not installed") - return + return None import warnings + from atexit import register - os.environ["PYARROW_IGNORE_TIMEZONE"] = "1" with warnings.catch_warnings(): # The spark session seems to trigger a polars warning. 
# Polars is imported in the tests, but not used in the spark operations warnings.filterwarnings( "ignore", r"Using fork\(\) can cause Polars", category=RuntimeWarning ) + session = ( SparkSession.builder.appName("unit-tests") .master("local[1]") @@ -151,8 +154,26 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover .config("spark.sql.shuffle.partitions", "2") .getOrCreate() ) - yield session - session.stop() + + register(session.stop) + + def _constructor(obj: Any) -> IntoFrame: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r".*is_datetime64tz_dtype is deprecated and will be removed in a future version.*", + module="pyspark", + category=DeprecationWarning, + ) + pd_df = pd.DataFrame(obj).replace({float("nan"): None}).reset_index() + return ( # type: ignore[no-any-return] + session.createDataFrame(pd_df) + .repartition(2) + .orderBy("index") + .drop("index") + ) + + return _constructor EAGER_CONSTRUCTORS: dict[str, Callable[[Any], IntoDataFrame]] = { @@ -168,6 +189,8 @@ def spark_session() -> Generator[SparkSession, None, None]: # pragma: no cover LAZY_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = { "dask": dask_lazy_p2_constructor, "polars[lazy]": polars_lazy_constructor, + "duckdb": duckdb_lazy_constructor, + "pyspark": pyspark_lazy_constructor, # type: ignore[dict-item] } GPU_CONSTRUCTORS: dict[str, Callable[[Any], IntoFrame]] = {"cudf": cudf_constructor} @@ -196,7 +219,13 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: constructors.append(EAGER_CONSTRUCTORS[constructor]) constructors_ids.append(constructor) elif constructor in LAZY_CONSTRUCTORS: - constructors.append(LAZY_CONSTRUCTORS[constructor]) + if constructor == "pyspark": + if sys.version_info < (3, 12): # pragma: no cover + constructors.append(pyspark_lazy_constructor()) + else: # pragma: no cover + continue + else: + constructors.append(LAZY_CONSTRUCTORS[constructor]) constructors_ids.append(constructor) else: # pragma: no cover msg = f"Expected one of {EAGER_CONSTRUCTORS.keys()} or {LAZY_CONSTRUCTORS.keys()}, got {constructor}" @@ -207,4 +236,14 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: "constructor_eager", eager_constructors, ids=eager_constructors_ids ) elif "constructor" in metafunc.fixturenames: + if ( + any( + x in str(metafunc.module) + for x in ("list", "name", "unpivot", "from_dict", "from_numpy", "tail") + ) + and LAZY_CONSTRUCTORS["duckdb"] in constructors + ): + # TODO(unassigned): the list and name namespaces, as well as unpivot, from_dict, from_numpy and tail, still need implementing for duckdb + constructors.remove(LAZY_CONSTRUCTORS["duckdb"]) + constructors_ids.remove("duckdb") metafunc.parametrize("constructor", constructors, ids=constructors_ids) diff --git a/tests/expr_and_series/all_horizontal_test.py b/tests/expr_and_series/all_horizontal_test.py index 706c42baf..826c0fe19 100644 --- a/tests/expr_and_series/all_horizontal_test.py +++ b/tests/expr_and_series/all_horizontal_test.py @@ -57,6 +57,8 @@ def test_allh_nth( ) -> None: if "polars" in str(constructor) and POLARS_VERSION < (1, 0): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [False, False, True], "b": [False, True, True], diff --git a/tests/expr_and_series/any_all_test.py b/tests/expr_and_series/any_all_test.py index c5f22ad9a..7fd81f04d 100644 --- a/tests/expr_and_series/any_all_test.py +++ b/tests/expr_and_series/any_all_test.py @@ -1,12 +1,17 @@ from __future__ import annotations
+import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_any_all(constructor: Constructor) -> None: +def test_any_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native( constructor( { diff --git a/tests/expr_and_series/any_horizontal_test.py b/tests/expr_and_series/any_horizontal_test.py index 4eb082b51..06157f393 100644 --- a/tests/expr_and_series/any_horizontal_test.py +++ b/tests/expr_and_series/any_horizontal_test.py @@ -11,7 +11,11 @@ @pytest.mark.parametrize("expr1", ["a", nw.col("a")]) @pytest.mark.parametrize("expr2", ["b", nw.col("b")]) -def test_anyh(constructor: Constructor, expr1: Any, expr2: Any) -> None: +def test_anyh( + request: pytest.FixtureRequest, constructor: Constructor, expr1: Any, expr2: Any +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [False, False, True], "b": [False, True, True], @@ -23,7 +27,9 @@ def test_anyh(constructor: Constructor, expr1: Any, expr2: Any) -> None: assert_equal_data(result, expected) -def test_anyh_all(constructor: Constructor) -> None: +def test_anyh_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [False, False, True], "b": [False, True, True], diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index eb38c6a14..1baae44e5 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -11,6 +11,7 @@ from hypothesis import given import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager @@ -37,6 +38,8 @@ def test_arithmetic_expr( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor) and attr == "__floordiv__": + request.applymarker(pytest.mark.xfail) if attr == "__mod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): @@ -67,11 +70,12 @@ def test_right_arithmetic_expr( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if "dask" in str(constructor) and DASK_VERSION < (2024, 10): + request.applymarker(pytest.mark.xfail) if attr == "__rmod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): request.applymarker(pytest.mark.xfail) - data = {"a": [1, 2, 3]} df = nw.from_native(constructor(data)) result = df.select(getattr(nw.col("a"), attr)(rhs)) @@ -241,6 +245,10 @@ def test_arithmetic_expr_left_literal( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if ("duckdb" in str(constructor) and attr == "__floordiv__") or ( + "dask" in str(constructor) and DASK_VERSION < (2024, 10) + ): + request.applymarker(pytest.mark.xfail) if attr == "__mod__" and any( x in str(constructor) for x in ["pandas_pyarrow", "modin_pyarrow"] ): diff --git a/tests/expr_and_series/binary_test.py b/tests/expr_and_series/binary_test.py index 3693ccebd..308745cb4 100644 --- a/tests/expr_and_series/binary_test.py +++ b/tests/expr_and_series/binary_test.py @@ -1,11 +1,18 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import 
Constructor from tests.utils import assert_equal_data -def test_expr_binary(constructor: Constructor) -> None: +def test_expr_binary(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("dask" in str(constructor) and DASK_VERSION < (2024, 10)) or "pyspark" in str( + constructor + ): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) result = nw.from_native(df_raw).with_columns( diff --git a/tests/expr_and_series/cast_test.py b/tests/expr_and_series/cast_test.py index e956dd455..ba2b82493 100644 --- a/tests/expr_and_series/cast_test.py +++ b/tests/expr_and_series/cast_test.py @@ -13,6 +13,7 @@ from tests.utils import PANDAS_VERSION from tests.utils import PYARROW_VERSION from tests.utils import Constructor +from tests.utils import ConstructorEager from tests.utils import assert_equal_data from tests.utils import is_windows @@ -59,6 +60,8 @@ def test_cast( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION <= ( 15, ): # pragma: no cover @@ -109,18 +112,18 @@ def test_cast( def test_cast_series( - constructor: Constructor, + constructor_eager: ConstructorEager, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION <= ( + if "pyarrow_table_constructor" in str(constructor_eager) and PYARROW_VERSION <= ( 15, ): # pragma: no cover request.applymarker(pytest.mark.xfail) - if "modin_constructor" in str(constructor): + if "modin_constructor" in str(constructor_eager): # TODO(unassigned): in modin, we end up with `' None: def test_cast_raises_for_unknown_dtype( constructor: Constructor, request: pytest.FixtureRequest ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,): # Unsupported cast from string to dictionary using function cast_dictionary request.applymarker(pytest.mark.xfail) @@ -196,8 +201,10 @@ def test_cast_datetime_tz_aware( ) -> None: if ( "dask" in str(constructor) + or "duckdb" in str(constructor) or "cudf" in str(constructor) # https://github.com/rapidsai/cudf/issues/16973 or ("pyarrow_table" in str(constructor) and is_windows()) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) @@ -222,7 +229,10 @@ def test_cast_datetime_tz_aware( def test_cast_struct(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if any(backend in str(constructor) for backend in ("dask", "modin", "cudf")): + if any( + backend in str(constructor) + for backend in ("dask", "modin", "cudf", "duckdb", "pyspark") + ): request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): diff --git a/tests/expr_and_series/clip_test.py b/tests/expr_and_series/clip_test.py index 838ca6b08..2ae9e043d 100644 --- a/tests/expr_and_series/clip_test.py +++ b/tests/expr_and_series/clip_test.py @@ -28,6 +28,9 @@ def test_clip_expr_expressified( ) -> None: if "modin_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) + if "cudf" in str(constructor): + # https://github.com/rapidsai/cudf/issues/17682 + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3, -4, 5], "lb": [3, 2, 1, 1, 1], "ub": [4, 4, 2, 2, 2]} df = nw.from_native(constructor(data)) 
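For context on what `test_clip_expr_expressified` above exercises: the clip bounds can themselves be expressions, evaluated row by row. A rough sketch of the behaviour under test, assuming `Expr.clip(lower_bound, upper_bound)` accepts expressions, as the test name suggests:

import pandas as pd
import narwhals as nw

data = {"a": [1, 2, 3, -4, 5], "lb": [3, 2, 1, 1, 1], "ub": [4, 4, 2, 2, 2]}
df = nw.from_native(pd.DataFrame(data))
# Each element of "a" is clamped between its own row's "lb" and "ub",
# so the expected result here would be [3, 2, 2, 1, 2].
print(df.select(nw.col("a").clip(nw.col("lb"), nw.col("ub"))).to_native())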
@@ -57,6 +60,9 @@ def test_clip_series_expressified( ) -> None: if "modin_pyarrow" in str(constructor_eager): request.applymarker(pytest.mark.xfail) + if "cudf" in str(constructor_eager): + # https://github.com/rapidsai/cudf/issues/17682 + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3, -4, 5], "lb": [3, 2, 1, 1, 1], "ub": [4, 4, 2, 2, 2]} df = nw.from_native(constructor_eager(data), eager_only=True) diff --git a/tests/expr_and_series/concat_str_test.py b/tests/expr_and_series/concat_str_test.py index 26366d2f2..37d4a581d 100644 --- a/tests/expr_and_series/concat_str_test.py +++ b/tests/expr_and_series/concat_str_test.py @@ -21,8 +21,14 @@ ], ) def test_concat_str( - constructor: Constructor, *, ignore_nulls: bool, expected: list[str] + constructor: Constructor, + *, + ignore_nulls: bool, + expected: list[str], + request: pytest.FixtureRequest, ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = ( df.select( diff --git a/tests/expr_and_series/convert_time_zone_test.py b/tests/expr_and_series/convert_time_zone_test.py index aa4235549..9a18ee07f 100644 --- a/tests/expr_and_series/convert_time_zone_test.py +++ b/tests/expr_and_series/convert_time_zone_test.py @@ -28,6 +28,8 @@ def test_convert_time_zone( or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -84,6 +86,8 @@ def test_convert_time_zone_from_none( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 7): diff --git a/tests/expr_and_series/cum_count_test.py b/tests/expr_and_series/cum_count_test.py index 6ddf6c991..dab77ebbc 100644 --- a/tests/expr_and_series/cum_count_test.py +++ b/tests/expr_and_series/cum_count_test.py @@ -21,6 +21,8 @@ def test_cum_count_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) name = "reverse_cum_count" if reverse else "cum_count" df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/cum_max_test.py b/tests/expr_and_series/cum_max_test.py index 054537d34..3df5a6ad4 100644 --- a/tests/expr_and_series/cum_max_test.py +++ b/tests/expr_and_series/cum_max_test.py @@ -23,6 +23,8 @@ def test_cum_max_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cum_min_test.py b/tests/expr_and_series/cum_min_test.py index bb92f5b9d..a758dc8b4 100644 --- a/tests/expr_and_series/cum_min_test.py +++ b/tests/expr_and_series/cum_min_test.py @@ -23,6 +23,8 @@ def test_cum_min_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if ("pyspark" in 
str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cum_prod_test.py b/tests/expr_and_series/cum_prod_test.py index 1d5816ff2..2d6861b8d 100644 --- a/tests/expr_and_series/cum_prod_test.py +++ b/tests/expr_and_series/cum_prod_test.py @@ -23,6 +23,8 @@ def test_cum_prod_expr( ) -> None: if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if PYARROW_VERSION < (13, 0, 0) and "pyarrow_table" in str(constructor): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/cum_sum_test.py b/tests/expr_and_series/cum_sum_test.py index 8df3396bc..8a419c9a9 100644 --- a/tests/expr_and_series/cum_sum_test.py +++ b/tests/expr_and_series/cum_sum_test.py @@ -18,6 +18,8 @@ def test_cum_sum_expr( request: pytest.FixtureRequest, constructor: Constructor, *, reverse: bool ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "dask" in str(constructor) and reverse: request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/diff_test.py b/tests/expr_and_series/diff_test.py index da433f7ad..f7730a2d4 100644 --- a/tests/expr_and_series/diff_test.py +++ b/tests/expr_and_series/diff_test.py @@ -22,6 +22,8 @@ def test_diff( if "pyarrow_table_constructor" in str(constructor) and PYARROW_VERSION < (13,): # pc.pairwisediff is available since pyarrow 13.0.0 request.applymarker(pytest.mark.xfail) + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.with_columns(c_diff=nw.col("c").diff()).filter(nw.col("i") > 0) expected = { diff --git a/tests/expr_and_series/dt/datetime_attributes_test.py b/tests/expr_and_series/dt/datetime_attributes_test.py index ad5f8dc3f..9f578d3c1 100644 --- a/tests/expr_and_series/dt/datetime_attributes_test.py +++ b/tests/expr_and_series/dt/datetime_attributes_test.py @@ -49,6 +49,10 @@ def test_datetime_attributes( request.applymarker(pytest.mark.xfail) if attribute == "date" and "cudf" in str(constructor): request.applymarker(pytest.mark.xfail) + if "duckdb" in str(constructor) and attribute in ("date", "weekday", "ordinal_day"): + request.applymarker(pytest.mark.xfail) + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(getattr(nw.col("a").dt, attribute)()) @@ -118,6 +122,8 @@ def test_to_date(request: pytest.FixtureRequest, constructor: Constructor) -> No "pandas_nullable_constructor", "cudf", "modin_constructor", + "duckdb", + "pyspark", ) ): request.applymarker(pytest.mark.xfail) diff --git a/tests/expr_and_series/dt/datetime_duration_test.py b/tests/expr_and_series/dt/datetime_duration_test.py index 09f227c79..7ec281daa 100644 --- a/tests/expr_and_series/dt/datetime_duration_test.py +++ b/tests/expr_and_series/dt/datetime_duration_test.py @@ -46,6 +46,8 @@ def test_duration_attributes( ) -> None: if PANDAS_VERSION < (2, 2) and "pandas_pyarrow" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git 
a/tests/expr_and_series/dt/timestamp_test.py b/tests/expr_and_series/dt/timestamp_test.py index e205d8179..b7e20519f 100644 --- a/tests/expr_and_series/dt/timestamp_test.py +++ b/tests/expr_and_series/dt/timestamp_test.py @@ -50,6 +50,8 @@ def test_timestamp_datetimes( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if original_time_unit == "s" and "polars" in str(constructor): request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < ( @@ -90,6 +92,8 @@ def test_timestamp_datetimes_tz_aware( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if ( (any(x in str(constructor) for x in ("pyarrow",)) and is_windows()) or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) @@ -136,6 +140,8 @@ def test_timestamp_dates( time_unit: Literal["ns", "us", "ms"], expected: list[int | None], ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if any( x in str(constructor) for x in ( @@ -161,6 +167,8 @@ def test_timestamp_dates( def test_timestamp_invalid_date( request: pytest.FixtureRequest, constructor: Constructor ) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor): request.applymarker(pytest.mark.xfail) data_str = {"a": ["x", "y", None]} diff --git a/tests/expr_and_series/dt/to_string_test.py b/tests/expr_and_series/dt/to_string_test.py index 629b39806..3cc3f0edd 100644 --- a/tests/expr_and_series/dt/to_string_test.py +++ b/tests/expr_and_series/dt/to_string_test.py @@ -59,7 +59,11 @@ def test_dt_to_string_series(constructor_eager: ConstructorEager, fmt: str) -> N ], ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") -def test_dt_to_string_expr(constructor: Constructor, fmt: str) -> None: +def test_dt_to_string_expr( + constructor: Constructor, fmt: str, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) input_frame = nw.from_native(constructor(data)) expected_col = [datetime.strftime(d, fmt) for d in data["a"]] @@ -132,8 +136,13 @@ def test_dt_to_string_iso_local_datetime_series( ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_dt_to_string_iso_local_datetime_expr( - constructor: Constructor, data: datetime, expected: str + constructor: Constructor, + data: datetime, + expected: str, + request: pytest.FixtureRequest, ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) result = nw.from_native(df).with_columns( @@ -166,8 +175,13 @@ def test_dt_to_string_iso_local_date_series( ) @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_dt_to_string_iso_local_date_expr( - constructor: Constructor, data: datetime, expected: str + constructor: Constructor, + data: datetime, + expected: str, + request: pytest.FixtureRequest, ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = constructor({"a": [data]}) result = nw.from_native(df).with_columns( 
nw.col("a").dt.to_string("%Y-%m-%d").alias("b") diff --git a/tests/expr_and_series/fill_null_test.py b/tests/expr_and_series/fill_null_test.py index 57f767d4d..39b0a3c64 100644 --- a/tests/expr_and_series/fill_null_test.py +++ b/tests/expr_and_series/fill_null_test.py @@ -12,7 +12,9 @@ from tests.utils import assert_equal_data -def test_fill_null(constructor: Constructor) -> None: +def test_fill_null(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [0.0, None, 2, 3, 4], "b": [1.0, None, None, 5, 3], @@ -47,7 +49,11 @@ def test_fill_null_exceptions(constructor: Constructor) -> None: df.with_columns(nw.col("a").fill_null(strategy="invalid")) # type: ignore # noqa: PGH003 -def test_fill_null_strategies_with_limit_as_none(constructor: Constructor) -> None: +def test_fill_null_strategies_with_limit_as_none( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data_limits = { "a": [1, None, None, None, 5, 6, None, None, None, 10], "b": ["a", None, None, None, "b", "c", None, None, None, "d"], @@ -113,7 +119,11 @@ def test_fill_null_strategies_with_limit_as_none(constructor: Constructor) -> No assert_equal_data(result_backward, expected_backward) -def test_fill_null_limits(constructor: Constructor) -> None: +def test_fill_null_limits( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) context: Any = ( pytest.raises(NotImplementedError, match="The limit keyword is not supported") if "cudf" in str(constructor) diff --git a/tests/expr_and_series/is_between_test.py b/tests/expr_and_series/is_between_test.py index 57ad545c0..a24277fa5 100644 --- a/tests/expr_and_series/is_between_test.py +++ b/tests/expr_and_series/is_between_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Literal + import pytest import narwhals.stable.v1 as nw @@ -17,7 +19,11 @@ ("none", [False, True, True, False]), ], ) -def test_is_between(constructor: Constructor, closed: str, expected: list[bool]) -> None: +def test_is_between( + constructor: Constructor, + closed: Literal["left", "right", "none", "both"], + expected: list[bool], +) -> None: data = {"a": [1, 4, 2, 5]} df = nw.from_native(constructor(data)) result = df.select(nw.col("a").is_between(1, 5, closed=closed)) @@ -43,7 +49,9 @@ def test_is_between_expressified(constructor: Constructor) -> None: ], ) def test_is_between_series( - constructor_eager: ConstructorEager, closed: str, expected: list[bool] + constructor_eager: ConstructorEager, + closed: Literal["left", "right", "none", "both"], + expected: list[bool], ) -> None: data = {"a": [1, 4, 2, 5]} df = nw.from_native(constructor_eager(data), eager_only=True) diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py index d4ce3461f..d97d30cbd 100644 --- a/tests/expr_and_series/is_duplicated_test.py +++ b/tests/expr_and_series/is_duplicated_test.py @@ -1,12 +1,18 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_is_duplicated_expr(constructor: Constructor) -> None: +def test_is_duplicated_expr( + constructor: Constructor, 
request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, 2], "b": [1, 2, 3], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) result = df.select(nw.col("a", "b").is_duplicated(), "index").sort("index") @@ -14,7 +20,11 @@ def test_is_duplicated_expr(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_is_duplicated_w_nulls_expr(constructor: Constructor) -> None: +def test_is_duplicated_w_nulls_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 1, None], "b": [1, None, None], "index": [0, 1, 2]} df = nw.from_native(constructor(data)) result = df.select(nw.col("a", "b").is_duplicated(), "index").sort("index") diff --git a/tests/expr_and_series/is_finite_test.py b/tests/expr_and_series/is_finite_test.py index 270ba7d52..4fb0246e9 100644 --- a/tests/expr_and_series/is_finite_test.py +++ b/tests/expr_and_series/is_finite_test.py @@ -11,7 +11,9 @@ @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -def test_is_finite_expr(constructor: Constructor) -> None: +def test_is_finite_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) or "pyarrow_table" in str(constructor): expected = {"a": [False, False, True, None]} elif ( diff --git a/tests/expr_and_series/is_first_distinct_test.py b/tests/expr_and_series/is_first_distinct_test.py index 7084fb3fb..6870c3394 100644 --- a/tests/expr_and_series/is_first_distinct_test.py +++ b/tests/expr_and_series/is_first_distinct_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,11 @@ } -def test_is_first_distinct_expr(constructor: Constructor) -> None: +def test_is_first_distinct_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().is_first_distinct()) expected = { diff --git a/tests/expr_and_series/is_last_distinct_test.py b/tests/expr_and_series/is_last_distinct_test.py index b91c171d3..9362cd02a 100644 --- a/tests/expr_and_series/is_last_distinct_test.py +++ b/tests/expr_and_series/is_last_distinct_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,11 @@ } -def test_is_last_distinct_expr(constructor: Constructor) -> None: +def test_is_last_distinct_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().is_last_distinct()) expected = { diff --git a/tests/expr_and_series/is_nan_test.py b/tests/expr_and_series/is_nan_test.py index 806dc7535..0280d6555 100644 --- a/tests/expr_and_series/is_nan_test.py +++ b/tests/expr_and_series/is_nan_test.py @@ -24,7 +24,9 @@ ] -def test_nan(constructor: Constructor) 
-> None: +def test_nan(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data_na = {"int": [0, 1, None]} df = nw.from_native(constructor(data_na)).with_columns( float=nw.col("int").cast(nw.Float64), float_na=nw.col("int") / nw.col("int") @@ -93,7 +95,9 @@ def test_nan_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_nan_non_float(constructor: Constructor) -> None: +def test_nan_non_float(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) from polars.exceptions import InvalidOperationError as PlInvalidOperationError from pyarrow.lib import ArrowNotImplementedError diff --git a/tests/expr_and_series/is_null_test.py b/tests/expr_and_series/is_null_test.py index 5d5250da9..cf4d2e73b 100644 --- a/tests/expr_and_series/is_null_test.py +++ b/tests/expr_and_series/is_null_test.py @@ -1,12 +1,17 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_null(constructor: Constructor) -> None: +def test_null(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + data_na = {"a": [None, 3, 2], "z": [7.0, None, None]} expected = {"a": [True, False, False], "z": [True, False, False]} df = nw.from_native(constructor(data_na)) diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py index b44878886..92e725623 100644 --- a/tests/expr_and_series/is_unique_test.py +++ b/tests/expr_and_series/is_unique_test.py @@ -1,12 +1,16 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -def test_is_unique_expr(constructor: Constructor) -> None: +def test_is_unique_expr(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [1, 1, 2], "b": [1, 2, 3], @@ -22,7 +26,11 @@ def test_is_unique_expr(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_is_unique_w_nulls_expr(constructor: Constructor) -> None: +def test_is_unique_w_nulls_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [None, 1, 2], "b": [None, 2, None], diff --git a/tests/expr_and_series/len_test.py b/tests/expr_and_series/len_test.py index fffcbd4a3..142fe488b 100644 --- a/tests/expr_and_series/len_test.py +++ b/tests/expr_and_series/len_test.py @@ -34,7 +34,10 @@ def test_len_chaining( assert_equal_data(df, expected) -def test_namespace_len(constructor: Constructor) -> None: +def test_namespace_len(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).select( nw.len(), a=nw.len() ) diff --git a/tests/expr_and_series/list/len_test.py 
b/tests/expr_and_series/list/len_test.py index 7066fc6cf..375cfc7d8 100644 --- a/tests/expr_and_series/list/len_test.py +++ b/tests/expr_and_series/list/len_test.py @@ -17,7 +17,9 @@ def test_len_expr( request: pytest.FixtureRequest, constructor: Constructor, ) -> None: - if any(backend in str(constructor) for backend in ("dask", "modin", "cudf")): + if any( + backend in str(constructor) for backend in ("dask", "modin", "cudf", "pyspark") + ): request.applymarker(pytest.mark.xfail) if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): diff --git a/tests/frame/lit_test.py b/tests/expr_and_series/lit_test.py similarity index 64% rename from tests/frame/lit_test.py rename to tests/expr_and_series/lit_test.py index f51bd5c76..f24e6d4a1 100644 --- a/tests/frame/lit_test.py +++ b/tests/expr_and_series/lit_test.py @@ -1,5 +1,6 @@ from __future__ import annotations +from datetime import date from typing import TYPE_CHECKING from typing import Any @@ -7,6 +8,8 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION +from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import assert_equal_data @@ -19,8 +22,13 @@ [(None, [2, 2, 2]), (nw.String, ["2", "2", "2"]), (nw.Float32, [2.0, 2.0, 2.0])], ) def test_lit( - constructor: Constructor, dtype: DType | None, expected_lit: list[Any] + request: pytest.FixtureRequest, + constructor: Constructor, + dtype: DType | None, + expected_lit: list[Any], ) -> None: + if "pyspark" in str(constructor) and dtype is not None: + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() @@ -84,11 +92,44 @@ def test_lit_operation( expected_result: list[int], request: pytest.FixtureRequest, ) -> None: - if "dask_lazy_p2" in str(constructor) and "lit_with_agg" in col_name: + if "duckdb" in str(constructor) and col_name in ( + "left_scalar_with_agg", + "left_lit_with_agg", + "right_lit", + "right_lit_with_agg", + ): + request.applymarker(pytest.mark.xfail) + if ( + "dask" in str(constructor) + and col_name in ("left_lit", "left_scalar") + and DASK_VERSION < (2024, 10) + ): request.applymarker(pytest.mark.xfail) + if "pyspark" in str(constructor) and col_name in { + "left_lit_with_agg", + "left_scalar_with_agg", + "right_lit_with_agg", + "right_lit", + }: + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 3, 2]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy() result = df.select(expr.alias(col_name)) expected = {col_name: expected_result} assert_equal_data(result, expected) + + +@pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow") +def test_date_lit(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "dask" in str(constructor) or "pyspark" in str(constructor): + # https://github.com/dask/dask/issues/11637 + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor({"a": [1]})) + result = df.with_columns(nw.lit(date(2020, 1, 1), dtype=nw.Date)).collect_schema() + if df.implementation.is_cudf(): + # cudf has no date dtype + assert result == {"a": nw.Int64, "literal": nw.Datetime} + else: + assert result == {"a": nw.Int64, "literal": nw.Date} diff --git a/tests/expr_and_series/max_horizontal_test.py b/tests/expr_and_series/max_horizontal_test.py index c86e11318..9df17fed3 100644 --- a/tests/expr_and_series/max_horizontal_test.py +++ b/tests/expr_and_series/max_horizontal_test.py @@ -14,7 +14,12 @@ 
@pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_maxh(constructor: Constructor, col_expr: Any) -> None: +def test_maxh( + request: pytest.FixtureRequest, constructor: Constructor, col_expr: Any +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(horizontal_max=nw.max_horizontal(col_expr, nw.col("b"), "z")) expected = {"horizontal_max": expected_values} @@ -22,7 +27,10 @@ def test_maxh(constructor: Constructor, col_expr: Any) -> None: @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_maxh_all(constructor: Constructor) -> None: +def test_maxh_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.max_horizontal(nw.all()), c=nw.max_horizontal(nw.all())) expected = {"a": expected_values, "c": expected_values} diff --git a/tests/expr_and_series/mean_horizontal_test.py b/tests/expr_and_series/mean_horizontal_test.py index 485bf1750..5ed472e31 100644 --- a/tests/expr_and_series/mean_horizontal_test.py +++ b/tests/expr_and_series/mean_horizontal_test.py @@ -10,7 +10,11 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) -def test_meanh(constructor: Constructor, col_expr: Any) -> None: +def test_meanh( + constructor: Constructor, col_expr: Any, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, None, None], "b": [4, None, 6, None]} df = nw.from_native(constructor(data)) result = df.select(horizontal_mean=nw.mean_horizontal(col_expr, nw.col("b"))) @@ -18,7 +22,9 @@ def test_meanh(constructor: Constructor, col_expr: Any) -> None: assert_equal_data(result, expected) -def test_meanh_all(constructor: Constructor) -> None: +def test_meanh_all(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [2, 4, 6], "b": [10, 20, 30]} df = nw.from_native(constructor(data)) result = df.select(nw.mean_horizontal(nw.all())) diff --git a/tests/expr_and_series/median_test.py b/tests/expr_and_series/median_test.py index 7c50988dc..9c509a182 100644 --- a/tests/expr_and_series/median_test.py +++ b/tests/expr_and_series/median_test.py @@ -41,16 +41,17 @@ def test_median_series( @pytest.mark.parametrize("expr", [nw.col("s").median(), nw.median("s")]) def test_median_expr_raises_on_str( - constructor: Constructor, - expr: nw.Expr, + constructor: Constructor, expr: nw.Expr, request: pytest.FixtureRequest ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) from polars.exceptions import InvalidOperationError as PlInvalidOperationError df = nw.from_native(constructor(data)) - if "polars_lazy" in str(constructor): + if isinstance(df, nw.LazyFrame): with pytest.raises( - PlInvalidOperationError, - match="`median` operation not supported for dtype `str`", + (InvalidOperationError, PlInvalidOperationError), + match="`median` operation not supported", ): df.select(expr).lazy().collect() else: diff --git a/tests/expr_and_series/min_horizontal_test.py b/tests/expr_and_series/min_horizontal_test.py index 
787e3e2a4..bbb0b9149 100644 --- a/tests/expr_and_series/min_horizontal_test.py +++ b/tests/expr_and_series/min_horizontal_test.py @@ -14,7 +14,12 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_minh(constructor: Constructor, col_expr: Any) -> None: +def test_minh( + request: pytest.FixtureRequest, constructor: Constructor, col_expr: Any +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(horizontal_min=nw.min_horizontal(col_expr, nw.col("b"), "z")) expected = {"horizontal_min": expected_values} @@ -22,7 +27,10 @@ def test_minh(constructor: Constructor, col_expr: Any) -> None: @pytest.mark.filterwarnings(r"ignore:.*All-NaN slice encountered:RuntimeWarning") -def test_minh_all(constructor: Constructor) -> None: +def test_minh_all(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.min_horizontal(nw.all()), c=nw.min_horizontal(nw.all())) expected = {"a": expected_values, "c": expected_values} diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py index 90bffb04b..cfa14e0d7 100644 --- a/tests/expr_and_series/n_unique_test.py +++ b/tests/expr_and_series/n_unique_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,9 @@ } -def test_n_unique(constructor: Constructor) -> None: +def test_n_unique(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().n_unique()) expected = {"a": [3], "b": [4]} diff --git a/tests/expr_and_series/name/keep_test.py b/tests/expr_and_series/name/keep_test.py index 6c89d09fc..e382db733 100644 --- a/tests/expr_and_series/name/keep_test.py +++ b/tests/expr_and_series/name/keep_test.py @@ -12,21 +12,34 @@ data = {"foo": [1, 2, 3], "BAR": [4, 5, 6]} -def test_keep(constructor: Constructor) -> None: +def test_keep(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.keep()) expected = {k: [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_keep_after_alias(constructor: Constructor) -> None: +def test_keep_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.keep()) expected = {"foo": data["foo"]} assert_equal_data(result, expected) -def test_keep_raise_anonymous(constructor: Constructor) -> None: +def test_keep_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/map_test.py b/tests/expr_and_series/name/map_test.py index 5afda2ee8..276138ef9 100644 --- 
a/tests/expr_and_series/name/map_test.py +++ b/tests/expr_and_series/name/map_test.py @@ -16,21 +16,34 @@ def map_func(s: str | None) -> str: return str(s)[::-1].lower() -def test_map(constructor: Constructor) -> None: +def test_map(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.map(function=map_func)) expected = {map_func(k): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_map_after_alias(constructor: Constructor) -> None: +def test_map_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.map(function=map_func)) expected = {map_func("foo"): data["foo"]} assert_equal_data(result, expected) -def test_map_raise_anonymous(constructor: Constructor) -> None: +def test_map_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/prefix_test.py b/tests/expr_and_series/name/prefix_test.py index 6f3fb3c9b..934d1d664 100644 --- a/tests/expr_and_series/name/prefix_test.py +++ b/tests/expr_and_series/name/prefix_test.py @@ -13,21 +13,34 @@ prefix = "with_prefix_" -def test_prefix(constructor: Constructor) -> None: +def test_prefix(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.prefix(prefix)) expected = {prefix + str(k): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_suffix_after_alias(constructor: Constructor) -> None: +def test_suffix_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.prefix(prefix)) expected = {prefix + "foo": data["foo"]} assert_equal_data(result, expected) -def test_prefix_raise_anonymous(constructor: Constructor) -> None: +def test_prefix_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/suffix_test.py b/tests/expr_and_series/name/suffix_test.py index 1c5816154..479546630 100644 --- a/tests/expr_and_series/name/suffix_test.py +++ b/tests/expr_and_series/name/suffix_test.py @@ -13,21 +13,34 @@ suffix = "_with_suffix" -def test_suffix(constructor: Constructor) -> None: +def test_suffix(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.suffix(suffix)) expected = {str(k) + suffix: [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_suffix_after_alias(constructor: Constructor) -> 
None: +def test_suffix_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.suffix(suffix)) expected = {"foo" + suffix: data["foo"]} assert_equal_data(result, expected) -def test_suffix_raise_anonymous(constructor: Constructor) -> None: +def test_suffix_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/to_lowercase_test.py b/tests/expr_and_series/name/to_lowercase_test.py index 882663f60..1b39fc726 100644 --- a/tests/expr_and_series/name/to_lowercase_test.py +++ b/tests/expr_and_series/name/to_lowercase_test.py @@ -12,21 +12,34 @@ data = {"foo": [1, 2, 3], "BAR": [4, 5, 6]} -def test_to_lowercase(constructor: Constructor) -> None: +def test_to_lowercase(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.to_lowercase()) expected = {k.lower(): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_to_lowercase_after_alias(constructor: Constructor) -> None: +def test_to_lowercase_after_alias( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select((nw.col("BAR")).alias("ALIAS_FOR_BAR").name.to_lowercase()) expected = {"bar": data["BAR"]} assert_equal_data(result, expected) -def test_to_lowercase_raise_anonymous(constructor: Constructor) -> None: +def test_to_lowercase_raise_anonymous( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/name/to_uppercase_test.py b/tests/expr_and_series/name/to_uppercase_test.py index 785da4957..e6703212d 100644 --- a/tests/expr_and_series/name/to_uppercase_test.py +++ b/tests/expr_and_series/name/to_uppercase_test.py @@ -12,21 +12,31 @@ data = {"foo": [1, 2, 3], "BAR": [4, 5, 6]} -def test_to_uppercase(constructor: Constructor) -> None: +def test_to_uppercase(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select((nw.col("foo", "BAR") * 2).name.to_uppercase()) expected = {k.upper(): [e * 2 for e in v] for k, v in data.items()} assert_equal_data(result, expected) -def test_to_uppercase_after_alias(constructor: Constructor) -> None: +def test_to_uppercase_after_alias( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select((nw.col("foo")).alias("alias_for_foo").name.to_uppercase()) expected = {"FOO": data["foo"]} assert_equal_data(result, expected) -def test_to_uppercase_raise_anonymous(constructor: Constructor) -> None: +def 
test_to_uppercase_raise_anonymous( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if any(x in str(constructor) for x in ("duckdb", "pyspark")): + request.applymarker(pytest.mark.xfail) df_raw = constructor(data) df = nw.from_native(df_raw) diff --git a/tests/expr_and_series/nth_test.py b/tests/expr_and_series/nth_test.py index 8179fb261..a7dc7f648 100644 --- a/tests/expr_and_series/nth_test.py +++ b/tests/expr_and_series/nth_test.py @@ -25,6 +25,8 @@ def test_nth( expected: dict[str, list[int]], request: pytest.FixtureRequest, ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "polars" in str(constructor) and POLARS_VERSION < (1, 0, 0): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/null_count_test.py b/tests/expr_and_series/null_count_test.py index 0f2250713..3bd15c66c 100644 --- a/tests/expr_and_series/null_count_test.py +++ b/tests/expr_and_series/null_count_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,11 @@ } -def test_null_count_expr(constructor: Constructor) -> None: +def test_null_count_expr( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.all().null_count()) expected = { diff --git a/tests/expr_and_series/operators_test.py b/tests/expr_and_series/operators_test.py index ff01747a6..f36d853d4 100644 --- a/tests/expr_and_series/operators_test.py +++ b/tests/expr_and_series/operators_test.py @@ -3,6 +3,7 @@ import pytest import narwhals.stable.v1 as nw +from tests.utils import DASK_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -20,7 +21,9 @@ ], ) def test_comparand_operators_scalar_expr( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], ) -> None: data = {"a": [0, 1, 2]} df = nw.from_native(constructor(data)) @@ -40,7 +43,9 @@ def test_comparand_operators_scalar_expr( ], ) def test_comparand_operators_expr( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], ) -> None: data = {"a": [0, 1, 1], "b": [0, 0, 2]} df = nw.from_native(constructor(data)) @@ -56,7 +61,9 @@ def test_comparand_operators_expr( ], ) def test_logic_operators_expr( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], ) -> None: data = {"a": [True, True, False, False], "b": [True, False, True, False]} df = nw.from_native(constructor(data)) @@ -75,8 +82,17 @@ def test_logic_operators_expr( ], ) def test_logic_operators_expr_scalar( - constructor: Constructor, operator: str, expected: list[bool] + constructor: Constructor, + operator: str, + expected: list[bool], + request: pytest.FixtureRequest, ) -> None: + if ( + "dask" in str(constructor) + and DASK_VERSION < (2024, 10) + and operator in ("__rand__", "__ror__") + ) or ("pyspark" in str(constructor) and operator in ("__and__", "__or__")): + request.applymarker(pytest.mark.xfail) data = {"a": [True, True, False, False]} df = 
nw.from_native(constructor(data)) diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index a67c7973b..45b64eba0 100644 --- a/tests/expr_and_series/over_test.py +++ b/tests/expr_and_series/over_test.py @@ -24,6 +24,8 @@ def test_over_single(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = { @@ -40,6 +42,8 @@ def test_over_single(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_multiple(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = { @@ -56,6 +60,8 @@ def test_over_multiple(request: pytest.FixtureRequest, constructor: Constructor) def test_over_invalid(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "polars" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) with pytest.raises(ValueError, match="Anonymous expressions"): @@ -67,6 +73,8 @@ def test_over_cumsum(request: pytest.FixtureRequest, constructor: Constructor) - request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -84,6 +92,8 @@ def test_over_cumsum(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cumcount(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -101,10 +111,12 @@ def test_over_cumcount(request: pytest.FixtureRequest, constructor: Constructor) def test_over_cummax(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2", "duckdb")): request.applymarker(pytest.mark.xfail) if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { "a": ["a", "a", "b", "b", "b"], @@ -120,9 +132,10 @@ def test_over_cummax(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cummin(request: pytest.FixtureRequest, constructor: Constructor) -> None: if "pyarrow_table" in str(constructor) or "dask_lazy_p2" in str(constructor): request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in 
str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -138,11 +151,12 @@ def test_over_cummin(request: pytest.FixtureRequest, constructor: Constructor) - def test_over_cumprod(request: pytest.FixtureRequest, constructor: Constructor) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2")): + if any(x in str(constructor) for x in ("pyarrow_table", "dask_lazy_p2", "duckdb")): request.applymarker(pytest.mark.xfail) - if "pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2, 1): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data_cum)) expected = { @@ -170,6 +184,8 @@ def test_over_shift(request: pytest.FixtureRequest, constructor: Constructor) -> constructor ) or "dask_lazy_p2_constructor" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = { diff --git a/tests/expr_and_series/quantile_test.py b/tests/expr_and_series/quantile_test.py index ae707e739..a9207cebd 100644 --- a/tests/expr_and_series/quantile_test.py +++ b/tests/expr_and_series/quantile_test.py @@ -28,7 +28,10 @@ def test_quantile_expr( expected: dict[str, list[float]], request: pytest.FixtureRequest, ) -> None: - if "dask" in str(constructor) and interpolation != "linear": + if ( + any(x in str(constructor) for x in ("dask", "duckdb")) + and interpolation != "linear" + ) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) q = 0.3 diff --git a/tests/expr_and_series/rank_test.py b/tests/expr_and_series/rank_test.py new file mode 100644 index 000000000..99a64371e --- /dev/null +++ b/tests/expr_and_series/rank_test.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +from contextlib import nullcontext as does_not_raise +from typing import Literal + +import pytest + +import narwhals.stable.v1 as nw +from tests.utils import PANDAS_VERSION +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +rank_methods = ["average", "min", "max", "dense", "ordinal"] + +data_int = {"a": [3, 6, 1, 1, None, 6], "b": [1, 1, 2, 1, 2, 2]} +data_float = {"a": [3.1, 6.1, 1.5, 1.5, None, 6.1], "b": [1, 1, 2, 1, 2, 2]} + +expected = { + "average": [3.0, 4.5, 1.5, 1.5, None, 4.5], + "min": [3, 4, 1, 1, None, 4], + "max": [3, 5, 2, 2, None, 5], + "dense": [2, 3, 1, 1, None, 3], + "ordinal": [3, 4, 1, 2, None, 5], +} + +expected_over = { + "average": [2.0, 3.0, 1.0, 1.0, None, 2.0], + "min": [2, 3, 1, 1, None, 2], + "max": [2, 3, 1, 1, None, 2], + "dense": [2, 3, 1, 1, None, 2], + "ordinal": [2, 3, 1, 1, None, 2], +} + + +@pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) +def test_rank_expr( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, list[float]], +) -> None: + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() 
+ ) + + with context: + df = nw.from_native(constructor_eager(data)) + + result = df.select(nw.col("a").rank(method=method)) + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +@pytest.mark.parametrize("data", [data_int, data_float]) +def test_rank_series( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], + data: dict[str, list[float]], +) -> None: + if ( + "pandas_pyarrow" in str(constructor_eager) + and PANDAS_VERSION < (2, 1) + and isinstance(data["a"][0], int) + ): + request.applymarker(pytest.mark.xfail) + + context = ( + pytest.raises( + ValueError, + match=r"`rank` with `method='average' is not supported for pyarrow backend.", + ) + if "pyarrow_table" in str(constructor_eager) and method == "average" + else does_not_raise() + ) + + with context: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result = {"a": df["a"].rank(method=method)} + expected_data = {"a": expected[method]} + assert_equal_data(result, expected_data) + + +@pytest.mark.parametrize("method", rank_methods) +def test_rank_expr_in_over_context( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + method: Literal["average", "min", "max", "dense", "ordinal"], +) -> None: + if any(x in str(constructor_eager) for x in ("pyarrow_table", "dask")): + # Pyarrow raises: + # > pyarrow.lib.ArrowKeyError: No function registered with name: hash_rank + # We can handle that to provide a better error message. + request.applymarker(pytest.mark.xfail) + + if "pandas_pyarrow" in str(constructor_eager) and PANDAS_VERSION < (2, 1): + request.applymarker(pytest.mark.xfail) + + df = nw.from_native(constructor_eager(data_float)) + + result = df.select(nw.col("a").rank(method=method).over("b")) + expected_data = {"a": expected_over[method]} + assert_equal_data(result, expected_data) + + +def test_invalid_method_raise(constructor_eager: ConstructorEager) -> None: + method = "invalid_method_name" + df = nw.from_native(constructor_eager(data_float)) + + msg = ( + "Ranking method must be one of {'average', 'min', 'max', 'dense', 'ordinal'}. 
" + f"Found '{method}'" + ) + + with pytest.raises(ValueError, match=msg): + df.select(nw.col("a").rank(method=method)) # type: ignore[arg-type] + + with pytest.raises(ValueError, match=msg): + df.lazy().collect()["a"].rank(method=method) # type: ignore[arg-type] diff --git a/tests/expr_and_series/reduction_test.py b/tests/expr_and_series/reduction_test.py index 3b579d9f3..49a3fddba 100644 --- a/tests/expr_and_series/reduction_test.py +++ b/tests/expr_and_series/reduction_test.py @@ -28,8 +28,21 @@ ids=range(5), ) def test_scalar_reduction_select( - constructor: Constructor, expr: list[Any], expected: dict[str, list[Any]] + constructor: Constructor, + expr: list[Any], + expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: + if "pyspark" in str(constructor) and request.node.callspec.id in { + "pyspark-2", + "pyspark-3", + "pyspark-4", + }: + request.applymarker(pytest.mark.xfail) + + if "duckdb" in str(constructor) and request.node.callspec.id not in {"duckdb-0"}: + request.applymarker(pytest.mark.xfail) + data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = nw.from_native(constructor(data)) result = df.select(*expr) @@ -54,15 +67,26 @@ def test_scalar_reduction_select( ids=range(5), ) def test_scalar_reduction_with_columns( - constructor: Constructor, expr: list[Any], expected: dict[str, list[Any]] + constructor: Constructor, + expr: list[Any], + expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: + if "duckdb" in str(constructor) or ( + "pyspark" in str(constructor) and request.node.callspec.id != "pyspark-1" + ): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3], "b": [4, 5, 6]} df = nw.from_native(constructor(data)) result = df.with_columns(*expr).select(*expected.keys()) assert_equal_data(result, expected) -def test_empty_scalar_reduction_select(constructor: Constructor) -> None: +def test_empty_scalar_reduction_select( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "str": [*"abcde"], "int": [0, 1, 2, 3, 4], @@ -91,7 +115,11 @@ def test_empty_scalar_reduction_select(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_empty_scalar_reduction_with_columns(constructor: Constructor) -> None: +def test_empty_scalar_reduction_with_columns( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) from itertools import chain data = { diff --git a/tests/expr_and_series/replace_strict_test.py b/tests/expr_and_series/replace_strict_test.py index b1449af24..33c56bae6 100644 --- a/tests/expr_and_series/replace_strict_test.py +++ b/tests/expr_and_series/replace_strict_test.py @@ -23,6 +23,8 @@ def test_replace_strict( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) result = df.select( nw.col("a").replace_strict( @@ -58,6 +60,8 @@ def test_replace_non_full( if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) if isinstance(df, nw.LazyFrame): with pytest.raises((ValueError, PolarsError)): @@ 
-77,6 +81,8 @@ def test_replace_strict_mapping( ) -> None: if "dask" in str(constructor): request.applymarker(pytest.mark.xfail) + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor({"a": [1, 2, 3]})) result = df.select( diff --git a/tests/expr_and_series/replace_time_zone_test.py b/tests/expr_and_series/replace_time_zone_test.py index 94367d1e1..6876c318a 100644 --- a/tests/expr_and_series/replace_time_zone_test.py +++ b/tests/expr_and_series/replace_time_zone_test.py @@ -26,6 +26,8 @@ def test_replace_time_zone( or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) or ("cudf" in str(constructor)) + or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { @@ -52,6 +54,8 @@ def test_replace_time_zone_none( or ("pandas_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("modin_pyarrow" in str(constructor) and PANDAS_VERSION < (2,)) or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (12,)) + or ("duckdb" in str(constructor)) + or ("pyspark" in str(constructor)) ): request.applymarker(pytest.mark.xfail) data = { diff --git a/tests/expr_and_series/rolling_mean_test.py b/tests/expr_and_series/rolling_mean_test.py index 2fb6a47fb..b91ecb27e 100644 --- a/tests/expr_and_series/rolling_mean_test.py +++ b/tests/expr_and_series/rolling_mean_test.py @@ -76,6 +76,7 @@ def test_rolling_mean_series(constructor_eager: ConstructorEager) -> None: @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.slow @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") def test_rolling_mean_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) n_missing = random.randint(0, len(s) - 1) # noqa: S311 diff --git a/tests/expr_and_series/rolling_sum_test.py b/tests/expr_and_series/rolling_sum_test.py index 0e3951958..f63786051 100644 --- a/tests/expr_and_series/rolling_sum_test.py +++ b/tests/expr_and_series/rolling_sum_test.py @@ -194,6 +194,7 @@ def test_rolling_sum_series_invalid_params( ) @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") @pytest.mark.slow def test_rolling_sum_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 4d4c596d9..3e5bb0c96 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -105,6 +105,7 @@ def test_rolling_var_series( ) @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.skipif(POLARS_VERSION < (1,), reason="different null behavior") +@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning") @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") def test_rolling_var_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) diff --git a/tests/expr_and_series/shift_test.py b/tests/expr_and_series/shift_test.py index 379f40986..4f7894939 100644 --- 
a/tests/expr_and_series/shift_test.py +++ b/tests/expr_and_series/shift_test.py @@ -1,6 +1,7 @@ from __future__ import annotations import pyarrow as pa +import pytest import narwhals.stable.v1 as nw from tests.utils import Constructor @@ -15,7 +16,9 @@ } -def test_shift(constructor: Constructor) -> None: +def test_shift(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.with_columns(nw.col("a", "b", "c").shift(2)).filter(nw.col("i") > 1) expected = { diff --git a/tests/expr_and_series/std_test.py b/tests/expr_and_series/std_test.py index b83100801..f2eabf4f2 100644 --- a/tests/expr_and_series/std_test.py +++ b/tests/expr_and_series/std_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +from contextlib import nullcontext as does_not_raise + import pytest import narwhals.stable.v1 as nw @@ -24,10 +26,27 @@ def test_std(constructor: Constructor, input_data: dict[str, list[float | None]] result = df.select( nw.col("a").std(ddof=1).alias("a_ddof_1"), nw.col("a").std(ddof=0).alias("a_ddof_0"), - nw.col("b").std(ddof=2).alias("b_ddof_2"), nw.col("z").std(ddof=0).alias("z_ddof_0"), ) + expected_results = { + "a_ddof_1": [1.0], + "a_ddof_0": [0.816497], + "z_ddof_0": [0.816497], + } assert_equal_data(result, expected_results) + context = ( + pytest.raises(NotImplementedError) + if "duckdb" in str(constructor) + else does_not_raise() + ) + with context: + result = df.select( + nw.col("b").std(ddof=2).alias("b_ddof_2"), + ) + expected_results = { + "b_ddof_2": [1.632993], + } + assert_equal_data(result, expected_results) @pytest.mark.parametrize("input_data", [data, data_with_nulls]) diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 06c6913aa..c1024d53a 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -13,7 +13,7 @@ def test_contains_case_insensitive( constructor: Constructor, request: pytest.FixtureRequest ) -> None: - if "cudf" in str(constructor): + if "cudf" in str(constructor) or "pyspark" in str(constructor): request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) @@ -40,7 +40,12 @@ def test_contains_series_case_insensitive( assert_equal_data(result, expected) -def test_contains_case_sensitive(constructor: Constructor) -> None: +def test_contains_case_sensitive( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.col("pets").str.contains("parrot|Dove").alias("default_match")) expected = { @@ -58,7 +63,12 @@ def test_contains_series_case_sensitive(constructor_eager: ConstructorEager) -> assert_equal_data(result, expected) -def test_contains_literal(constructor: Constructor) -> None: +def test_contains_literal( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select( nw.col("pets").str.contains("Parrot|dove").alias("default_match"), diff --git a/tests/expr_and_series/str/head_test.py b/tests/expr_and_series/str/head_test.py index cf6cbd758..97fbbc6f3 100644 --- a/tests/expr_and_series/str/head_test.py +++ b/tests/expr_and_series/str/head_test.py @@ -1,5 +1,7 @@ from 
__future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -8,7 +10,10 @@ data = {"a": ["foo", "bars"]} -def test_str_head(constructor: Constructor) -> None: +def test_str_head(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.head(3)) expected = { diff --git a/tests/expr_and_series/str/len_chars_test.py b/tests/expr_and_series/str/len_chars_test.py index f9c63e01c..812f193b2 100644 --- a/tests/expr_and_series/str/len_chars_test.py +++ b/tests/expr_and_series/str/len_chars_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -8,7 +10,9 @@ data = {"a": ["foo", "foobar", "Café", "345", "東京"]} -def test_str_len_chars(constructor: Constructor) -> None: +def test_str_len_chars(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.len_chars()) expected = { diff --git a/tests/expr_and_series/str/replace_test.py b/tests/expr_and_series/str/replace_test.py index ffd8fce2e..53904be73 100644 --- a/tests/expr_and_series/str/replace_test.py +++ b/tests/expr_and_series/str/replace_test.py @@ -93,6 +93,7 @@ def test_str_replace_all_series( ) def test_str_replace_expr( constructor: Constructor, + request: pytest.FixtureRequest, data: dict[str, list[str]], pattern: str, value: str, @@ -100,8 +101,9 @@ def test_str_replace_expr( literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result_df = df.select( nw.col("a").str.replace(pattern=pattern, value=value, n=n, literal=literal) ) @@ -114,14 +116,18 @@ def test_str_replace_expr( ) def test_str_replace_all_expr( constructor: Constructor, + request: pytest.FixtureRequest, data: dict[str, list[str]], pattern: str, value: str, literal: bool, # noqa: FBT001 expected: dict[str, list[str]], ) -> None: + if ("pyspark" in str(constructor)) or ( + "duckdb" in str(constructor) and literal is False + ): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select( nw.col("a").str.replace_all(pattern=pattern, value=value, literal=literal) ) diff --git a/tests/expr_and_series/str/slice_test.py b/tests/expr_and_series/str/slice_test.py index 1e7115a8a..6f9b4dc4f 100644 --- a/tests/expr_and_series/str/slice_test.py +++ b/tests/expr_and_series/str/slice_test.py @@ -17,8 +17,15 @@ [(1, 2, {"a": ["da", "df"]}), (-2, None, {"a": ["as", "as"]})], ) def test_str_slice( - constructor: Constructor, offset: int, length: int | None, expected: Any + request: pytest.FixtureRequest, + constructor: Constructor, + offset: int, + length: int | None, + expected: Any, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.slice(offset, length)) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/str/starts_with_ends_with_test.py 
b/tests/expr_and_series/str/starts_with_ends_with_test.py index 0b11a7537..dac70c288 100644 --- a/tests/expr_and_series/str/starts_with_ends_with_test.py +++ b/tests/expr_and_series/str/starts_with_ends_with_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -11,7 +13,10 @@ data = {"a": ["fdas", "edfas"]} -def test_ends_with(constructor: Constructor) -> None: +def test_ends_with(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) result = df.select(nw.col("a").str.ends_with("das")) expected = { @@ -29,7 +34,10 @@ def test_ends_with_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_starts_with(constructor: Constructor) -> None: +def test_starts_with(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)).lazy() result = df.select(nw.col("a").str.starts_with("fda")) expected = { diff --git a/tests/expr_and_series/str/strip_chars_test.py b/tests/expr_and_series/str/strip_chars_test.py index d765e99e3..f369bbbf9 100644 --- a/tests/expr_and_series/str/strip_chars_test.py +++ b/tests/expr_and_series/str/strip_chars_test.py @@ -20,8 +20,13 @@ ], ) def test_str_strip_chars( - constructor: Constructor, characters: str | None, expected: Any + request: pytest.FixtureRequest, + constructor: Constructor, + characters: str | None, + expected: Any, ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.strip_chars(characters)) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/str/tail_test.py b/tests/expr_and_series/str/tail_test.py index e2543de0a..cdb2c024e 100644 --- a/tests/expr_and_series/str/tail_test.py +++ b/tests/expr_and_series/str/tail_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +import pytest + import narwhals.stable.v1 as nw from tests.utils import Constructor from tests.utils import ConstructorEager @@ -8,7 +10,9 @@ data = {"a": ["foo", "bars"]} -def test_str_tail(constructor: Constructor) -> None: +def test_str_tail(request: pytest.FixtureRequest, constructor: Constructor) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) expected = {"a": ["foo", "ars"]} diff --git a/tests/expr_and_series/str/to_datetime_test.py b/tests/expr_and_series/str/to_datetime_test.py index 388ef23db..bfb2a4dfb 100644 --- a/tests/expr_and_series/str/to_datetime_test.py +++ b/tests/expr_and_series/str/to_datetime_test.py @@ -17,7 +17,9 @@ data = {"a": ["2020-01-01T12:34:56"]} -def test_to_datetime(constructor: Constructor) -> None: +def test_to_datetime(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) if "cudf" in str(constructor): expected = "2020-01-01T12:34:56.000000000" else: @@ -78,6 +80,8 @@ def test_to_datetime_infer_fmt( request.applymarker(pytest.mark.xfail) if "cudf" in str(constructor): expected = expected_cudf + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + 
request.applymarker(pytest.mark.xfail) result = ( nw.from_native(constructor(data)) .lazy() @@ -126,7 +130,11 @@ def test_to_datetime_series_infer_fmt( assert str(result) == expected -def test_to_datetime_infer_fmt_from_date(constructor: Constructor) -> None: +def test_to_datetime_infer_fmt_from_date( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"z": ["2020-01-01", "2020-01-02", None]} expected = [datetime(2020, 1, 1), datetime(2020, 1, 2), None] result = ( diff --git a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py index 1d0eb8834..087e26a0e 100644 --- a/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py +++ b/tests/expr_and_series/str/to_uppercase_to_lowercase_test.py @@ -30,8 +30,8 @@ def test_str_to_uppercase( expected: dict[str, list[str]], request: pytest.FixtureRequest, ) -> None: - df = nw.from_native(constructor(data)) - result_frame = df.select(nw.col("a").str.to_uppercase()) + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) if any("ß" in s for value in data.values() for s in value) & ( constructor.__name__ @@ -39,6 +39,7 @@ def test_str_to_uppercase( "pandas_pyarrow_constructor", "pyarrow_table_constructor", "modin_pyarrow_constructor", + "duckdb_lazy_constructor", ) or ("dask" in str(constructor) and PYARROW_VERSION >= (12,)) ): @@ -47,6 +48,9 @@ def test_str_to_uppercase( # smaller cap 'ß' to upper cap 'ẞ' instead of 'SS' request.applymarker(pytest.mark.xfail) + df = nw.from_native(constructor(data)) + result_frame = df.select(nw.col("a").str.to_uppercase()) + assert_equal_data(result_frame, expected) @@ -80,6 +84,7 @@ def test_str_to_uppercase_series( "pandas_nullable_constructor", "polars_eager_constructor", "cudf_constructor", + "duckdb_lazy_constructor", "modin_constructor", ) ): @@ -108,10 +113,13 @@ def test_str_to_uppercase_series( ], ) def test_str_to_lowercase( + request: pytest.FixtureRequest, constructor: Constructor, data: dict[str, list[str]], expected: dict[str, list[str]], ) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) result_frame = df.select(nw.col("a").str.to_lowercase()) assert_equal_data(result_frame, expected) diff --git a/tests/expr_and_series/sum_horizontal_test.py b/tests/expr_and_series/sum_horizontal_test.py index 21bd138c2..decb65c02 100644 --- a/tests/expr_and_series/sum_horizontal_test.py +++ b/tests/expr_and_series/sum_horizontal_test.py @@ -10,7 +10,11 @@ @pytest.mark.parametrize("col_expr", [nw.col("a"), "a"]) -def test_sumh(constructor: Constructor, col_expr: Any) -> None: +def test_sumh( + constructor: Constructor, col_expr: Any, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df = nw.from_native(constructor(data)) result = df.with_columns(horizontal_sum=nw.sum_horizontal(col_expr, nw.col("b"))) @@ -23,7 +27,9 @@ def test_sumh(constructor: Constructor, col_expr: Any) -> None: assert_equal_data(result, expected) -def test_sumh_nullable(constructor: Constructor) -> None: +def test_sumh_nullable(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 8, 3], "b": [4, 5, None]} expected = 
{"hsum": [5, 13, 3]} @@ -32,7 +38,9 @@ def test_sumh_nullable(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_sumh_all(constructor: Constructor) -> None: +def test_sumh_all(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2, 3], "b": [10, 20, 30]} df = nw.from_native(constructor(data)) result = df.select(nw.sum_horizontal(nw.all())) diff --git a/tests/expr_and_series/unary_test.py b/tests/expr_and_series/unary_test.py index f2f9c33ff..82f616a64 100644 --- a/tests/expr_and_series/unary_test.py +++ b/tests/expr_and_series/unary_test.py @@ -10,7 +10,9 @@ from tests.utils import assert_equal_data -def test_unary(constructor: Constructor) -> None: +def test_unary(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = { "a": [1, 3, 2], "b": [4, 4, 6], @@ -77,7 +79,11 @@ def test_unary_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_unary_two_elements(constructor: Constructor) -> None: +def test_unary_two_elements( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if ("pyspark" in str(constructor)) or "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1, 2], "b": [2, 10], "c": [2.0, None]} result = nw.from_native(constructor(data)).select( a_nunique=nw.col("a").n_unique(), @@ -120,7 +126,11 @@ def test_unary_two_elements_series(constructor_eager: ConstructorEager) -> None: assert_equal_data(result, expected) -def test_unary_one_element(constructor: Constructor) -> None: +def test_unary_one_element( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: + if "pyspark" in str(constructor): + request.applymarker(pytest.mark.xfail) data = {"a": [1], "b": [2], "c": [None]} # Dask runs into a divide by zero RuntimeWarning for 1 element skew. 
     context = (
diff --git a/tests/expr_and_series/var_test.py b/tests/expr_and_series/var_test.py
index bab97d383..2053dfe69 100644
--- a/tests/expr_and_series/var_test.py
+++ b/tests/expr_and_series/var_test.py
@@ -1,5 +1,7 @@
 from __future__ import annotations

+from contextlib import nullcontext as does_not_raise
+
 import pytest

 import narwhals.stable.v1 as nw
@@ -24,10 +26,27 @@ def test_var(constructor: Constructor, input_data: dict[str, list[float | None]]
     result = df.select(
         nw.col("a").var(ddof=1).alias("a_ddof_1"),
         nw.col("a").var(ddof=0).alias("a_ddof_0"),
-        nw.col("b").var(ddof=2).alias("b_ddof_2"),
         nw.col("z").var(ddof=0).alias("z_ddof_0"),
     )
+    expected_results = {
+        "a_ddof_1": [1.0],
+        "a_ddof_0": [0.6666666666666666],
+        "z_ddof_0": [0.6666666666666666],
+    }
     assert_equal_data(result, expected_results)
+    context = (
+        pytest.raises(NotImplementedError)
+        if "duckdb" in str(constructor)
+        else does_not_raise()
+    )
+    with context:
+        result = df.select(
+            nw.col("b").var(ddof=2).alias("b_ddof_2"),
+        )
+        expected_results = {
+            "b_ddof_2": [2.666666666666667],
+        }
+        assert_equal_data(result, expected_results)


 @pytest.mark.parametrize("input_data", [data, data_with_nulls])
diff --git a/tests/expr_and_series/when_test.py b/tests/expr_and_series/when_test.py
index 5c60febb4..0faf59172 100644
--- a/tests/expr_and_series/when_test.py
+++ b/tests/expr_and_series/when_test.py
@@ -17,7 +17,9 @@
 }


-def test_when(constructor: Constructor) -> None:
+def test_when(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(nw.when(nw.col("a") == 1).then(value=3).alias("a_when"))
     expected = {
@@ -26,7 +28,9 @@ def test_when(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_when_otherwise(constructor: Constructor) -> None:
+def test_when_otherwise(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(nw.when(nw.col("a") == 1).then(3).otherwise(6).alias("a_when"))
     expected = {
@@ -35,7 +39,11 @@ def test_when_otherwise(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_multiple_conditions(constructor: Constructor) -> None:
+def test_multiple_conditions(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(
         nw.when(nw.col("a") < 3, nw.col("c") < 5.0).then(3).alias("a_when")
@@ -77,7 +85,11 @@ def test_value_series(constructor_eager: ConstructorEager) -> None:
     assert_equal_data(result, expected)


-def test_value_expression(constructor: Constructor) -> None:
+def test_value_expression(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(nw.when(nw.col("a") == 1).then(nw.col("a") + 9).alias("a_when"))
     expected = {
@@ -110,7 +122,11 @@ def test_otherwise_series(constructor_eager: ConstructorEager) -> None:
     assert_equal_data(result, expected)


-def test_otherwise_expression(constructor: Constructor) -> None:
+def test_otherwise_expression(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(
         nw.when(nw.col("a") == 1).then(-1).otherwise(nw.col("a") + 7).alias("a_when")
@@ -121,14 +137,22 @@ def test_otherwise_expression(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_when_then_otherwise_into_expr(constructor: Constructor) -> None:
+def test_when_then_otherwise_into_expr(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(nw.when(nw.col("a") > 1).then("c").otherwise("e"))
     expected = {"c": [7, 5, 6]}
     assert_equal_data(result, expected)


-def test_when_then_otherwise_lit_str(constructor: Constructor) -> None:
+def test_when_then_otherwise_lit_str(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(nw.when(nw.col("a") > 1).then(nw.col("b")).otherwise(nw.lit("z")))
     expected = {"b": ["z", "b", "c"]}
diff --git a/tests/frame/add_test.py b/tests/frame/add_test.py
index 27a332ed0..e04561895 100644
--- a/tests/frame/add_test.py
+++ b/tests/frame/add_test.py
@@ -1,11 +1,15 @@
 from __future__ import annotations

+import pytest
+
 import narwhals.stable.v1 as nw
 from tests.utils import Constructor
 from tests.utils import assert_equal_data


-def test_add(constructor: Constructor) -> None:
+def test_add(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     df = nw.from_native(constructor(data))
     result = df.with_columns(
diff --git a/tests/frame/clone_test.py b/tests/frame/clone_test.py
index 1a02910c8..316638c06 100644
--- a/tests/frame/clone_test.py
+++ b/tests/frame/clone_test.py
@@ -10,6 +10,8 @@
 def test_clone(request: pytest.FixtureRequest, constructor: Constructor) -> None:
     if "dask" in str(constructor):
         request.applymarker(pytest.mark.xfail)
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     if "pyarrow_table" in str(constructor):
         request.applymarker(pytest.mark.xfail)

diff --git a/tests/frame/concat_test.py b/tests/frame/concat_test.py
index 26bbd2e62..6d8fdbda0 100644
--- a/tests/frame/concat_test.py
+++ b/tests/frame/concat_test.py
@@ -7,7 +7,11 @@
 from tests.utils import assert_equal_data


-def test_concat_horizontal(constructor: Constructor) -> None:
+def test_concat_horizontal(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     df_left = nw.from_native(constructor(data)).lazy()

@@ -28,7 +32,12 @@ def test_concat_horizontal(constructor: Constructor) -> None:
         nw.concat([])


-def test_concat_vertical(constructor: Constructor) -> None:
+def test_concat_vertical(
+    request: pytest.FixtureRequest, constructor: Constructor
+) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     df_left = (
         nw.from_native(constructor(data)).lazy().rename({"a": "c", "b": "d"}).drop("z")
@@ -56,7 +65,11 @@ def test_concat_vertical(constructor: Constructor) -> None:
         nw.concat([df_left, df_left.select("d")], how="vertical").collect()


-def test_concat_diagonal(constructor: Constructor) -> None:
+def test_concat_diagonal(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data_1 = {"a": [1, 3], "b": [4, 6]}
     data_2 = {"a": [100, 200], "z": ["x", "y"]}
     expected = {
diff --git a/tests/frame/drop_nulls_test.py b/tests/frame/drop_nulls_test.py
index bb55439eb..368ad6ba0 100644
--- a/tests/frame/drop_nulls_test.py
+++ b/tests/frame/drop_nulls_test.py
@@ -12,7 +12,9 @@
 }


-def test_drop_nulls(constructor: Constructor) -> None:
+def test_drop_nulls(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     result = nw.from_native(constructor(data)).drop_nulls()
     expected = {
         "a": [2.0, 4.0],
@@ -30,7 +32,12 @@ def test_drop_nulls(constructor: Constructor) -> None:
     ],
 )
 def test_drop_nulls_subset(
-    constructor: Constructor, subset: str | list[str], expected: dict[str, float]
+    constructor: Constructor,
+    subset: str | list[str],
+    expected: dict[str, float],
+    request: pytest.FixtureRequest,
 ) -> None:
+    if "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     result = nw.from_native(constructor(data)).drop_nulls(subset=subset)
     assert_equal_data(result, expected)
diff --git a/tests/frame/explode_test.py b/tests/frame/explode_test.py
index 631da0255..f3b096194 100644
--- a/tests/frame/explode_test.py
+++ b/tests/frame/explode_test.py
@@ -40,7 +40,7 @@ def test_explode_single_col(
 ) -> None:
     if any(
         backend in str(constructor)
-        for backend in ("dask", "modin", "cudf", "pyarrow_table")
+        for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb", "pyspark")
     ):
         request.applymarker(pytest.mark.xfail)

@@ -89,7 +89,7 @@ def test_explode_multiple_cols(
 ) -> None:
     if any(
         backend in str(constructor)
-        for backend in ("dask", "modin", "cudf", "pyarrow_table")
+        for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb", "pyspark")
     ):
         request.applymarker(pytest.mark.xfail)

@@ -110,7 +110,7 @@ def test_explode_shape_error(
 ) -> None:
     if any(
         backend in str(constructor)
-        for backend in ("dask", "modin", "cudf", "pyarrow_table")
+        for backend in ("dask", "modin", "cudf", "pyarrow_table", "duckdb", "pyspark")
     ):
         request.applymarker(pytest.mark.xfail)

@@ -133,7 +133,7 @@ def test_explode_shape_error(
 def test_explode_invalid_operation_error(
     request: pytest.FixtureRequest, constructor: Constructor
 ) -> None:
-    if "dask" in str(constructor) or "pyarrow_table" in str(constructor):
+    if any(x in str(constructor) for x in ("pyarrow_table", "dask", "duckdb", "pyspark")):
         request.applymarker(pytest.mark.xfail)

     if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 6):
diff --git a/tests/frame/filter_test.py b/tests/frame/filter_test.py
index b55ab7767..759d175ca 100644
--- a/tests/frame/filter_test.py
+++ b/tests/frame/filter_test.py
@@ -17,7 +17,11 @@ def test_filter(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_filter_with_boolean_list(constructor: Constructor) -> None:
+def test_filter_with_boolean_list(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     df = nw.from_native(constructor(data))
     context = (
diff --git a/tests/frame/gather_every_test.py b/tests/frame/gather_every_test.py
index 671737ad1..c151f4503 100644
--- a/tests/frame/gather_every_test.py
+++ b/tests/frame/gather_every_test.py
@@ -11,7 +11,11 @@

 @pytest.mark.parametrize("n", [1, 2, 3])
 @pytest.mark.parametrize("offset", [1, 2, 3])
-def test_gather_every(constructor: Constructor, n: int, offset: int) -> None:
+def test_gather_every(
+    constructor: Constructor, n: int, offset: int, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.gather_every(n=n, offset=offset)
     expected = {"a": data["a"][offset::n]}
diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py
index faeac5b2f..f15a1b79e 100644
--- a/tests/frame/join_test.py
+++ b/tests/frame/join_test.py
@@ -10,6 +10,7 @@

 import narwhals.stable.v1 as nw
 from narwhals.utils import Implementation
+from tests.utils import DUCKDB_VERSION
 from tests.utils import PANDAS_VERSION
 from tests.utils import Constructor
 from tests.utils import assert_equal_data
@@ -19,8 +20,8 @@ def test_inner_join_two_keys(constructor: Constructor) -> None:
     data = {
         "antananarivo": [1, 3, 2],
         "bob": [4, 4, 6],
-        "zorro": [7.0, 8, 9],
-        "index": [0, 1, 2],
+        "zor ro": [7.0, 8, 9],
+        "idx": [0, 1, 2],
     }
     df = nw.from_native(constructor(data))
     df_right = df
@@ -31,14 +32,14 @@ def test_inner_join_two_keys(constructor: Constructor) -> None:
         how="inner",
     )
     result_on = df.join(df_right, on=["antananarivo", "bob"], how="inner")  # type: ignore[arg-type]
-    result = result.sort("index").drop("index_right")
-    result_on = result_on.sort("index").drop("index_right")
+    result = result.sort("idx").drop("idx_right")
+    result_on = result_on.sort("idx").drop("idx_right")
     expected = {
         "antananarivo": [1, 3, 2],
         "bob": [4, 4, 6],
-        "zorro": [7.0, 8, 9],
-        "index": [0, 1, 2],
-        "zorro_right": [7.0, 8, 9],
+        "zor ro": [7.0, 8, 9],
+        "idx": [0, 1, 2],
+        "zor ro_right": [7.0, 8, 9],
     }
     assert_equal_data(result, expected)
     assert_equal_data(result_on, expected)
@@ -48,8 +49,8 @@ def test_inner_join_single_key(constructor: Constructor) -> None:
     data = {
         "antananarivo": [1, 3, 2],
         "bob": [4, 4, 6],
-        "zorro": [7.0, 8, 9],
-        "index": [0, 1, 2],
+        "zor ro": [7.0, 8, 9],
+        "idx": [0, 1, 2],
     }
     df = nw.from_native(constructor(data))
     df_right = df
@@ -58,23 +59,25 @@ def test_inner_join_single_key(constructor: Constructor) -> None:
         left_on="antananarivo",
         right_on="antananarivo",
         how="inner",
-    ).sort("index")
-    result_on = df.join(df_right, on="antananarivo", how="inner").sort("index")  # type: ignore[arg-type]
-    result = result.drop("index_right")
-    result_on = result_on.drop("index_right")
+    ).sort("idx")
+    result_on = df.join(df_right, on="antananarivo", how="inner").sort("idx")  # type: ignore[arg-type]
+    result = result.drop("idx_right")
+    result_on = result_on.drop("idx_right")
     expected = {
         "antananarivo": [1, 3, 2],
         "bob": [4, 4, 6],
-        "zorro": [7.0, 8, 9],
-        "index": [0, 1, 2],
+        "zor ro": [7.0, 8, 9],
+        "idx": [0, 1, 2],
         "bob_right": [4, 4, 6],
-        "zorro_right": [7.0, 8, 9],
+        "zor ro_right": [7.0, 8, 9],
     }
     assert_equal_data(result, expected)
     assert_equal_data(result_on, expected)


-def test_cross_join(constructor: Constructor) -> None:
+def test_cross_join(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 1, 4):
+        request.applymarker(pytest.mark.xfail)
data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) result = df.join(df, how="cross").sort("antananarivo", "antananarivo_right") # type: ignore[arg-type] @@ -96,7 +99,7 @@ def test_suffix(constructor: Constructor, how: str, suffix: str) -> None: data = { "antananarivo": [1, 3, 2], "bob": [4, 4, 6], - "zorro": [7.0, 8, 9], + "zor ro": [7.0, 8, 9], } df = nw.from_native(constructor(data)) df_right = df @@ -108,11 +111,15 @@ def test_suffix(constructor: Constructor, how: str, suffix: str) -> None: suffix=suffix, ) result_cols = result.collect_schema().names() - assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"] + assert result_cols == ["antananarivo", "bob", "zor ro", f"zor ro{suffix}"] @pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) -def test_cross_join_suffix(constructor: Constructor, suffix: str) -> None: +def test_cross_join_suffix( + constructor: Constructor, suffix: str, request: pytest.FixtureRequest +) -> None: + if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 1, 4): + request.applymarker(pytest.mark.xfail) data = {"antananarivo": [1, 3, 2]} df = nw.from_native(constructor(data)) result = df.join(df, how="cross", suffix=suffix).sort( # type: ignore[arg-type] @@ -144,13 +151,13 @@ def test_cross_join_non_pandas() -> None: ( ["antananarivo", "bob"], (nw.col("bob") < 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, + {"antananarivo": [2], "bob": [6], "zor ro": [9]}, ), - (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zorro": [9]}), + (["bob"], (nw.col("bob") < 5), {"antananarivo": [2], "bob": [6], "zor ro": [9]}), ( ["bob"], (nw.col("bob") > 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7.0, 8.0]}, + {"antananarivo": [1, 3], "bob": [4, 4], "zor ro": [7.0, 8.0]}, ), ], ) @@ -159,8 +166,11 @@ def test_anti_join( join_key: list[str], filter_expr: nw.Expr, expected: dict[str, list[Any]], + request: pytest.FixtureRequest, ) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + if "duckdb" in str(constructor): + request.applymarker(pytest.mark.xfail) + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type] @@ -173,22 +183,22 @@ def test_anti_join( ( "antananarivo", (nw.col("bob") > 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, + {"antananarivo": [2], "bob": [6], "zor ro": [9]}, ), ( ["antananarivo"], (nw.col("bob") > 5), - {"antananarivo": [2], "bob": [6], "zorro": [9]}, + {"antananarivo": [2], "bob": [6], "zor ro": [9]}, ), ( ["bob"], (nw.col("bob") < 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + {"antananarivo": [1, 3], "bob": [4, 4], "zor ro": [7, 8]}, ), ( ["antananarivo", "bob"], (nw.col("bob") < 5), - {"antananarivo": [1, 3], "bob": [4, 4], "zorro": [7, 8]}, + {"antananarivo": [1, 3], "bob": [4, 4], "zor ro": [7, 8]}, ), ], ) @@ -198,7 +208,7 @@ def test_semi_join( filter_expr: nw.Expr, expected: dict[str, list[Any]], ) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) other = df.filter(filter_expr) result = df.join(other, how="semi", left_on=join_key, right_on=join_key).sort( # type: ignore[arg-type] @@ -209,7 +219,7 @@ def test_semi_join( @pytest.mark.parametrize("how", ["right", 
"full"]) def test_join_not_implemented(constructor: Constructor, how: str) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -221,49 +231,47 @@ def test_join_not_implemented(constructor: Constructor, how: str) -> None: df.join(df, left_on="antananarivo", right_on="antananarivo", how=how) # type: ignore[arg-type] -@pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join(constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], "bob": [4.0, 5, 6], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } data_right = { "antananarivo": [1.0, 2, 3], "co": [4.0, 5, 7], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) result = df_left.join(df_right, left_on="bob", right_on="co", how="left") # type: ignore[arg-type] - result = result.sort("index") - result = result.drop("index_right") + result = result.sort("idx") + result = result.drop("idx_right") expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], - "index": [0, 1, 2], + "idx": [0, 1, 2], "antananarivo_right": [1, 2, None], } result_on_list = df_left.join( df_right, # type: ignore[arg-type] - on=["antananarivo", "index"], + on=["antananarivo", "idx"], how="left", ) - result_on_list = result_on_list.sort("index") + result_on_list = result_on_list.sort("idx") expected_on_list = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], - "index": [0, 1, 2], + "idx": [0, 1, 2], "co": [4, 5, 7], } assert_equal_data(result, expected) assert_equal_data(result_on_list, expected_on_list) -@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_multiple_column(constructor: Constructor) -> None: - data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} - data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "index": [0, 1, 2]} + data_left = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} + data_right = {"antananarivo": [1, 2, 3], "c": [4, 5, 6], "idx": [0, 1, 2]} df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) result = df_left.join( @@ -272,35 +280,34 @@ def test_left_join_multiple_column(constructor: Constructor) -> None: right_on=["antananarivo", "c"], how="left", ) - result = result.sort("index") - result = result.drop("index_right") - expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "index": [0, 1, 2]} + result = result.sort("idx") + result = result.drop("idx_right") + expected = {"antananarivo": [1, 2, 3], "bob": [4, 5, 6], "idx": [0, 1, 2]} assert_equal_data(result, expected) -@pytest.mark.filterwarnings("ignore: the default coalesce behavior") def test_left_join_overlapping_column(constructor: Constructor) -> None: data_left = { "antananarivo": [1.0, 2, 3], "bob": [4.0, 5, 6], "d": [1.0, 4, 2], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } data_right = { "antananarivo": [1.0, 2, 3], "c": [4.0, 5, 6], "d": [1.0, 4, 2], - "index": [0.0, 1.0, 2.0], + "idx": [0.0, 1.0, 2.0], } df_left = nw.from_native(constructor(data_left)) df_right = nw.from_native(constructor(data_right)) - result = df_left.join(df_right, left_on="bob", right_on="c", how="left").sort("index") # type: ignore[arg-type] - result = result.drop("index_right") + result = df_left.join(df_right, left_on="bob", right_on="c", 
how="left").sort("idx") # type: ignore[arg-type] + result = result.drop("idx_right") expected: dict[str, list[Any]] = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], "d": [1, 4, 2], - "index": [0, 1, 2], + "idx": [0, 1, 2], "antananarivo_right": [1, 2, 3], "d_right": [1, 4, 2], } @@ -311,13 +318,13 @@ def test_left_join_overlapping_column(constructor: Constructor) -> None: right_on="d", how="left", ) - result = result.sort("index") - result = result.drop("index_right") + result = result.sort("idx") + result = result.drop("idx_right") expected = { "antananarivo": [1, 2, 3], "bob": [4, 5, 6], "d": [1, 4, 2], - "index": [0, 1, 2], + "idx": [0, 1, 2], "antananarivo_right": [1.0, 3.0, None], "c": [4.0, 6.0, None], } @@ -326,7 +333,7 @@ def test_left_join_overlapping_column(constructor: Constructor) -> None: @pytest.mark.parametrize("how", ["inner", "left", "semi", "anti"]) def test_join_keys_exceptions(constructor: Constructor, how: str) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -355,7 +362,7 @@ def test_joinasof_numeric( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table" in str(constructor) or "cudf" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) @@ -414,7 +421,7 @@ def test_joinasof_time( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table" in str(constructor) or "cudf" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ("pandas_pyarrow" in str(constructor)): request.applymarker(pytest.mark.xfail) @@ -495,7 +502,7 @@ def test_joinasof_by( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - if "pyarrow_table" in str(constructor) or "cudf" in str(constructor): + if any(x in str(constructor) for x in ("pyarrow_table", "cudf", "duckdb", "pyspark")): request.applymarker(pytest.mark.xfail) if PANDAS_VERSION < (2, 1) and ( ("pandas_pyarrow" in str(constructor)) or ("pandas_nullable" in str(constructor)) @@ -531,7 +538,7 @@ def test_joinasof_by( def test_joinasof_not_implemented( constructor: Constructor, strategy: Literal["backward", "forward"] ) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -547,7 +554,7 @@ def test_joinasof_not_implemented( def test_joinasof_keys_exceptions(constructor: Constructor) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( @@ -588,7 +595,7 @@ def test_joinasof_keys_exceptions(constructor: Constructor) -> None: def test_joinasof_by_exceptions(constructor: Constructor) -> None: - data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zorro": [7.0, 8, 9]} + data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8, 9]} df = nw.from_native(constructor(data)) with pytest.raises( ValueError, diff --git 
index 2cb3df91d..946e58203 100644
--- a/tests/frame/select_test.py
+++ b/tests/frame/select_test.py
@@ -27,7 +27,9 @@ def test_select(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_empty_select(constructor: Constructor) -> None:
+def test_empty_select(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     result = nw.from_native(constructor({"a": [1, 2, 3]})).lazy().select()
     assert result.collect().shape == (0, 0)

@@ -75,7 +77,11 @@ def test_comparison_with_list_error_message() -> None:
         nw.from_native(pd.Series([[1, 2, 3]]), series_only=True) == [1, 2, 3]  # noqa: B015


-def test_missing_columns(constructor: Constructor) -> None:
+def test_missing_columns(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     df = nw.from_native(constructor(data))
     selected_columns = ["a", "e", "f"]
@@ -118,7 +124,9 @@ def test_missing_columns(constructor: Constructor) -> None:
 def test_left_to_right_broadcasting(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
-    if "dask" in str(constructor) and DASK_VERSION < (2024, 9):
+    if "dask" in str(constructor) and DASK_VERSION < (2024, 10):
+        request.applymarker(pytest.mark.xfail)
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor({"a": [1, 1, 2], "b": [4, 5, 6]}))
     result = df.select(nw.col("a") + nw.col("b").sum())
diff --git a/tests/frame/sort_test.py b/tests/frame/sort_test.py
index 5147c6f56..1ce3414c8 100644
--- a/tests/frame/sort_test.py
+++ b/tests/frame/sort_test.py
@@ -8,18 +8,18 @@


 def test_sort(constructor: Constructor) -> None:
-    data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
+    data = {"an tan": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     df = nw.from_native(constructor(data))
-    result = df.sort("a", "b")
+    result = df.sort("an tan", "b")
     expected = {
-        "a": [1, 2, 3],
+        "an tan": [1, 2, 3],
         "b": [4, 6, 4],
         "z": [7.0, 9.0, 8.0],
     }
     assert_equal_data(result, expected)
-    result = df.sort("a", "b", descending=[True, False])
+    result = df.sort("an tan", "b", descending=[True, False])
     expected = {
-        "a": [3, 2, 1],
+        "an tan": [3, 2, 1],
         "b": [4, 6, 4],
         "z": [8.0, 9.0, 7.0],
     }
@@ -29,14 +29,14 @@ def test_sort(constructor: Constructor) -> None:
 @pytest.mark.parametrize(
     ("nulls_last", "expected"),
     [
-        (True, {"a": [0, 2, 0, -1], "b": [3, 2, 1, None]}),
-        (False, {"a": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}),
+        (True, {"antan desc": [0, 2, 0, -1], "b": [3, 2, 1, None]}),
+        (False, {"antan desc": [-1, 0, 2, 0], "b": [None, 3, 2, 1]}),
     ],
 )
 def test_sort_nulls(
     constructor: Constructor, *, nulls_last: bool, expected: dict[str, float]
 ) -> None:
-    data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]}
+    data = {"antan desc": [0, 0, 2, -1], "b": [1, 3, 2, None]}
     df = nw.from_native(constructor(data))
     result = df.sort("b", descending=True, nulls_last=nulls_last)
     assert_equal_data(result, expected)
diff --git a/tests/frame/tail_test.py b/tests/frame/tail_test.py
index a4d265797..75f46a4a1 100644
--- a/tests/frame/tail_test.py
+++ b/tests/frame/tail_test.py
@@ -9,7 +9,10 @@
 from tests.utils import assert_equal_data


-def test_tail(constructor: Constructor) -> None:
+def test_tail(request: pytest.FixtureRequest, constructor: Constructor) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
     expected = {"a": [3, 2], "b": [4, 6], "z": [8.0, 9]}

diff --git a/tests/frame/to_arrow_test.py b/tests/frame/to_arrow_test.py
index 3e8c704ea..70913ed15 100644
--- a/tests/frame/to_arrow_test.py
+++ b/tests/frame/to_arrow_test.py
@@ -12,6 +12,7 @@
 from tests.utils import ConstructorEager


+@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning")
 def test_to_arrow(
     request: pytest.FixtureRequest,
     constructor_eager: ConstructorEager,
diff --git a/tests/frame/unique_test.py b/tests/frame/unique_test.py
index 96d5a8c2d..a193ab98b 100644
--- a/tests/frame/unique_test.py
+++ b/tests/frame/unique_test.py
@@ -5,7 +5,10 @@

 import pytest

-import narwhals.stable.v1 as nw
+# We use nw instead of nw.stable.v1 to ensure that DuckDBPyRelation
+# becomes LazyFrame instead of DataFrame
+import narwhals as nw
+from narwhals.exceptions import ColumnNotFoundError
 from tests.utils import Constructor
 from tests.utils import assert_equal_data

@@ -31,8 +34,18 @@ def test_unique(
 ) -> None:
     df_raw = constructor(data)
     df = nw.from_native(df_raw)
-    if isinstance(df, nw.LazyFrame) and keep in {"first", "last"}:
+    if isinstance(df, nw.LazyFrame) and keep in {
+        "first",
+        "last",
+    }:
         context: Any = pytest.raises(ValueError, match="row order")
+    elif (
+        keep == "none" and df.implementation is nw.Implementation.PYSPARK
+    ):  # pragma: no cover
+        context = pytest.raises(
+            ValueError,
+            match="`LazyFrame.unique` with PySpark backend only supports `keep='any'`.",
+        )
     elif keep == "foo":
         context = pytest.raises(ValueError, match=": foo")
     else:
@@ -43,6 +56,13 @@ def test_unique(
     assert_equal_data(result, expected)


+def test_unique_invalid_subset(constructor: Constructor) -> None:
+    df_raw = constructor(data)
+    df = nw.from_native(df_raw)
+    with pytest.raises(ColumnNotFoundError):
+        df.lazy().unique(["fdssfad"]).collect()
+
+
 @pytest.mark.filterwarnings("ignore:.*backwards-compatibility:UserWarning")
 def test_unique_none(constructor: Constructor) -> None:
     df_raw = constructor(data)
diff --git a/tests/frame/unpivot_test.py b/tests/frame/unpivot_test.py
index ad7eefe5b..72aa81f2d 100644
--- a/tests/frame/unpivot_test.py
+++ b/tests/frame/unpivot_test.py
@@ -37,10 +37,14 @@
     [("b", expected_b_only), (["b", "c"], expected_b_c), (None, expected_b_c)],
 )
 def test_unpivot_on(
+    request: pytest.FixtureRequest,
     constructor: Constructor,
     on: str | list[str] | None,
     expected: dict[str, list[float]],
 ) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     df = nw.from_native(constructor(data))
     result = df.unpivot(on=on, index=["a"]).sort("variable", "a")
     assert_equal_data(result, expected)
@@ -55,10 +59,14 @@ def test_unpivot_on(
     ],
 )
 def test_unpivot_var_value_names(
+    request: pytest.FixtureRequest,
     constructor: Constructor,
     variable_name: str | None,
     value_name: str | None,
 ) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     df = nw.from_native(constructor(data))
     result = df.unpivot(
         on=["b", "c"], index=["a"], variable_name=variable_name, value_name=value_name
@@ -67,7 +75,12 @@ def test_unpivot_var_value_names(
     assert result.collect_schema().names()[-2:] == [variable_name, value_name]


-def test_unpivot_default_var_value_names(constructor: Constructor) -> None:
+def test_unpivot_default_var_value_names(
+    request: pytest.FixtureRequest, constructor: Constructor
+) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     df = nw.from_native(constructor(data))

     result = df.unpivot(on=["b", "c"], index=["a"])
@@ -89,10 +102,13 @@ def test_unpivot_mixed_types(
     data: dict[str, Any],
     expected_dtypes: list[DType],
 ) -> None:
-    if "cudf" in str(constructor) or (
-        "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0)
+    if (
+        "cudf" in str(constructor)
+        or "pyspark" in str(constructor)
+        or ("pyarrow_table" in str(constructor) and PYARROW_VERSION < (14, 0, 0))
     ):
         request.applymarker(pytest.mark.xfail)
+
     df = nw.from_native(constructor(data))
     result = df.unpivot(on=["a", "b"], index="idx")
diff --git a/tests/frame/with_columns_test.py b/tests/frame/with_columns_test.py
index c05a41646..6fa3ab825 100644
--- a/tests/frame/with_columns_test.py
+++ b/tests/frame/with_columns_test.py
@@ -52,6 +52,8 @@ def test_with_columns_dtypes_single_row(
 ) -> None:
     if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (15,):
         request.applymarker(pytest.mark.xfail)
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": ["foo"]}
     df = nw.from_native(constructor(data)).with_columns(nw.col("a").cast(nw.Categorical))
     result = df.with_columns(nw.col("a"))
diff --git a/tests/frame/with_row_index_test.py b/tests/frame/with_row_index_test.py
index e19d3c994..96f2b1547 100644
--- a/tests/frame/with_row_index_test.py
+++ b/tests/frame/with_row_index_test.py
@@ -1,5 +1,7 @@
 from __future__ import annotations

+import pytest
+
 import narwhals.stable.v1 as nw
 from tests.utils import Constructor
 from tests.utils import assert_equal_data
@@ -10,7 +12,9 @@
 }


-def test_with_row_index(constructor: Constructor) -> None:
+def test_with_row_index(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     result = nw.from_native(constructor(data)).with_row_index()
     expected = {"index": [0, 1], "a": ["foo", "bars"], "ab": ["foo", "bars"]}
     assert_equal_data(result, expected)
diff --git a/tests/frame/write_parquet_test.py b/tests/frame/write_parquet_test.py
index e4b826cfb..670e8c7c9 100644
--- a/tests/frame/write_parquet_test.py
+++ b/tests/frame/write_parquet_test.py
@@ -14,6 +14,7 @@

 @pytest.mark.skipif(PANDAS_VERSION < (2, 0, 0), reason="too old for pyarrow")
+@pytest.mark.filterwarnings("ignore:.*is_sparse is deprecated:DeprecationWarning")
 def test_write_parquet(
     constructor_eager: ConstructorEager,
     tmpdir: pytest.TempdirFactory,
diff --git a/tests/from_dict_test.py b/tests/from_dict_test.py
index 86fe07eda..0630cac43 100644
--- a/tests/from_dict_test.py
+++ b/tests/from_dict_test.py
@@ -12,7 +12,7 @@


 def test_from_dict(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]}))
     native_namespace = nw.get_native_namespace(df)
@@ -25,7 +25,7 @@ def test_from_dict(constructor: Constructor, request: pytest.FixtureRequest) ->
 def test_from_dict_schema(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()}
     df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]}))
@@ -62,7 +62,7 @@ def test_from_dict_one_native_one_narwhals(


 def test_from_dict_v1(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = nw_v1.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]}))
     native_namespace = nw_v1.get_native_namespace(df)
diff --git a/tests/from_numpy_test.py b/tests/from_numpy_test.py
index b736d5cbd..7a40136e7 100644
--- a/tests/from_numpy_test.py
+++ b/tests/from_numpy_test.py
@@ -19,7 +19,7 @@


 def test_from_numpy(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     native_namespace = nw.get_native_namespace(df)
@@ -31,7 +31,7 @@ def test_from_numpy(constructor: Constructor, request: pytest.FixtureRequest) ->
 def test_from_numpy_schema_dict(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     schema = {
         "c": nw_v1.Int16(),
@@ -52,7 +52,7 @@ def test_from_numpy_schema_dict(
 def test_from_numpy_schema_list(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     schema = ["c", "d", "e", "f"]
     df = nw_v1.from_native(constructor(data))
@@ -68,7 +68,7 @@ def test_from_numpy_schema_list(
 def test_from_numpy_schema_notvalid(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     native_namespace = nw_v1.get_native_namespace(df)
@@ -79,7 +79,7 @@ def test_from_numpy_schema_notvalid(


 def test_from_numpy_v1(constructor: Constructor, request: pytest.FixtureRequest) -> None:
-    if "dask" in str(constructor):
+    if "dask" in str(constructor) or "pyspark" in str(constructor):
         request.applymarker(pytest.mark.xfail)
     df = nw_v1.from_native(constructor(data))
     native_namespace = nw_v1.get_native_namespace(df)
diff --git a/tests/group_by_test.py b/tests/group_by_test.py
index 22c3b6f19..64b3844d0 100644
--- a/tests/group_by_test.py
+++ b/tests/group_by_test.py
@@ -115,6 +115,8 @@ def test_group_by_depth_1_agg(
     expected: dict[str, list[int | float]],
     request: pytest.FixtureRequest,
 ) -> None:
+    if "pyspark" in str(constructor) and attr == "n_unique":
+        request.applymarker(pytest.mark.xfail)
     if "pandas_pyarrow" in str(constructor) and attr == "var" and PANDAS_VERSION < (2, 1):
         # Known issue with variance calculation in pandas 2.0.x with pyarrow backend in groupby operations"
         request.applymarker(pytest.mark.xfail)
@@ -134,10 +136,10 @@
     ],
 )
 def test_group_by_depth_1_std_var(
-    constructor: Constructor,
-    attr: str,
-    ddof: int,
+    constructor: Constructor, attr: str, ddof: int, request: pytest.FixtureRequest
 ) -> None:
+    if "duckdb" in str(constructor) and ddof == 2:
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [1, 1, 1, 2, 2, 2], "b": [4, 5, 6, 0, 5, 5]}
     _pow = 0.5 if attr == "std" else 1
     expected = {
@@ -164,7 +166,11 @@ def test_group_by_median(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_group_by_n_unique_w_missing(constructor: Constructor) -> None:
+def test_group_by_n_unique_w_missing(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"a": [1, 1, 2], "b": [4, None, 5], "c": [None, None, 7], "d": [1, 1, 3]}
     result = (
         nw.from_native(constructor(data))
@@ -269,6 +275,10 @@ def test_key_with_nulls(
     if "modin" in str(constructor):
         # TODO(unassigned): Modin flaky here?
         request.applymarker(pytest.mark.skip)
+
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     context = (
         pytest.raises(NotImplementedError, match="null values")
         if ("pandas_constructor" in str(constructor) and PANDAS_VERSION < (1, 1, 0))
@@ -288,8 +298,10 @@ def test_key_with_nulls(


 def test_key_with_nulls_ignored(
-    constructor: Constructor,
+    constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     data = {"b": [4, 5, None], "a": [1, 2, 3]}
     result = (
         nw.from_native(constructor(data))
@@ -330,7 +342,9 @@ def test_key_with_nulls_iter(
     assert len(result) == 4


-def test_no_agg(constructor: Constructor) -> None:
+def test_no_agg(request: pytest.FixtureRequest, constructor: Constructor) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     result = nw.from_native(constructor(data)).group_by(["a", "b"]).agg().sort("a", "b")

     expected = {"a": [1, 3], "b": [4, 6]}
@@ -341,6 +355,8 @@ def test_group_by_categorical(
     constructor: Constructor,
     request: pytest.FixtureRequest,
 ) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (
         15,
         0,
@@ -366,6 +382,8 @@ def test_group_by_categorical(
 def test_group_by_shift_raises(
     constructor: Constructor, request: pytest.FixtureRequest
 ) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     if "polars" in str(constructor):
         # Polars supports all kinds of crazy group-by aggregations, so
         # we don't check that it errors here.
@@ -406,6 +424,8 @@ def test_all_kind_of_aggs(
         # and modin lol https://github.com/modin-project/modin/issues/7414
         # and cudf https://github.com/rapidsai/cudf/issues/17649
         request.applymarker(pytest.mark.xfail)
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     if "pandas" in str(constructor) and PANDAS_VERSION < (1, 4):
         # Bug in old pandas, can't do DataFrameGroupBy[['b', 'b']]
         request.applymarker(pytest.mark.xfail)
diff --git a/tests/hypothesis/join_test.py b/tests/hypothesis/join_test.py
index 7f1cd8103..da4a61679 100644
--- a/tests/hypothesis/join_test.py
+++ b/tests/hypothesis/join_test.py
@@ -134,7 +134,6 @@ def test_cross_join(  # pragma: no cover
     ),
 )
 @pytest.mark.slow
-@pytest.mark.filterwarnings("ignore:the default coalesce behavior")
 def test_left_join(  # pragma: no cover
     a_left_data: list[int],
     b_left_data: list[int],
diff --git a/tests/read_scan_test.py b/tests/read_scan_test.py
index dbb2cf624..55869b46b 100644
--- a/tests/read_scan_test.py
+++ b/tests/read_scan_test.py
@@ -52,8 +52,11 @@ def test_read_csv_kwargs(tmpdir: pytest.TempdirFactory) -> None:

 def test_scan_csv(
     tmpdir: pytest.TempdirFactory,
+    request: pytest.FixtureRequest,
     constructor: Constructor,
 ) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df_pl = pl.DataFrame(data)
     filepath = str(tmpdir / "file.csv")  # type: ignore[operator]
     df_pl.write_csv(filepath)
@@ -66,8 +69,11 @@ def test_scan_csv(

 def test_scan_csv_v1(
     tmpdir: pytest.TempdirFactory,
+    request: pytest.FixtureRequest,
     constructor: Constructor,
 ) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df_pl = pl.DataFrame(data)
     filepath = str(tmpdir / "file.csv")  # type: ignore[operator]
     df_pl.write_csv(filepath)
@@ -128,8 +134,11 @@ def test_read_parquet_kwargs(tmpdir: pytest.TempdirFactory) -> None:
 @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
 def test_scan_parquet(
     tmpdir: pytest.TempdirFactory,
+    request: pytest.FixtureRequest,
     constructor: Constructor,
 ) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df_pl = pl.DataFrame(data)
     filepath = str(tmpdir / "file.parquet")  # type: ignore[operator]
     df_pl.write_parquet(filepath)
@@ -143,8 +152,11 @@ def test_scan_parquet(
 @pytest.mark.skipif(PANDAS_VERSION < (1, 5), reason="too old for pyarrow")
 def test_scan_parquet_v1(
     tmpdir: pytest.TempdirFactory,
+    request: pytest.FixtureRequest,
     constructor: Constructor,
 ) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df_pl = pl.DataFrame(data)
     filepath = str(tmpdir / "file.parquet")  # type: ignore[operator]
     df_pl.write_parquet(filepath)
diff --git a/tests/selectors_test.py b/tests/selectors_test.py
index 8a2194caf..80aa64803 100644
--- a/tests/selectors_test.py
+++ b/tests/selectors_test.py
@@ -23,28 +23,36 @@
 }


-def test_selectors(constructor: Constructor) -> None:
+def test_selectors(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(by_dtype([nw.Int64, nw.Float64]) + 1)
     expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]}
     assert_equal_data(result, expected)


-def test_numeric(constructor: Constructor) -> None:
+def test_numeric(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(numeric() + 1)
     expected = {"a": [2, 2, 3], "c": [5.1, 6.0, 7.0]}
     assert_equal_data(result, expected)


-def test_boolean(constructor: Constructor) -> None:
+def test_boolean(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(boolean())
     expected = {"d": [True, False, True]}
     assert_equal_data(result, expected)


-def test_string(constructor: Constructor) -> None:
+def test_string(constructor: Constructor, request: pytest.FixtureRequest) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(string())
     expected = {"b": ["a", "b", "c"]}
@@ -59,6 +67,8 @@ def test_categorical(
         15,
     ):  # pragma: no cover
         request.applymarker(pytest.mark.xfail)
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)

     expected = {"b": ["a", "b", "c"]}
     df = nw.from_native(constructor(data)).with_columns(nw.col("b").cast(nw.Categorical))
@@ -81,19 +91,28 @@ def test_categorical(
     ],
 )
 def test_set_ops(
-    constructor: Constructor, selector: nw.selectors.Selector, expected: list[str]
+    constructor: Constructor,
+    selector: nw.selectors.Selector,
+    expected: list[str],
+    request: pytest.FixtureRequest,
 ) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(constructor(data))
     result = df.select(selector).collect_schema().names()
     assert sorted(result) == expected


 @pytest.mark.parametrize("invalid_constructor", [pd.DataFrame, pa.table])
-def test_set_ops_invalid(invalid_constructor: Constructor) -> None:
+def test_set_ops_invalid(
+    invalid_constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if "duckdb" in str(invalid_constructor):
+        request.applymarker(pytest.mark.xfail)
     df = nw.from_native(invalid_constructor(data))
-    with pytest.raises(NotImplementedError):
+    with pytest.raises((NotImplementedError, ValueError)):
         df.select(1 - numeric())
-    with pytest.raises(NotImplementedError):
+    with pytest.raises((NotImplementedError, ValueError)):
         df.select(1 | numeric())
-    with pytest.raises(NotImplementedError):
+    with pytest.raises((NotImplementedError, ValueError)):
         df.select(1 & numeric())
diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py
index 10587a084..b4051e503 100644
--- a/tests/series_only/cast_test.py
+++ b/tests/series_only/cast_test.py
@@ -98,18 +98,6 @@ def test_cast_date_datetime_pandas() -> None:
     assert df.schema == {"a": nw.Date}


-@pytest.mark.skipif(
-    PANDAS_VERSION < (2, 0, 0),
-    reason="pyarrow dtype not available",
-)
-def test_cast_date_datetime_invalid() -> None:
-    # pandas: pyarrow datetime to date
-    dfpd = pd.DataFrame({"a": [datetime(2020, 1, 1), datetime(2020, 1, 2)]})
-    df = nw.from_native(dfpd)
-    with pytest.raises(NotImplementedError, match="pyarrow"):
-        df.select(nw.col("a").cast(nw.Date))
-
-
 @pytest.mark.filterwarnings("ignore: casting period")
 def test_unknown_to_int() -> None:
     df = pd.DataFrame({"a": pd.period_range("2000", periods=3, freq="min")})
diff --git a/tests/spark_like_test.py b/tests/spark_like_test.py
index 7ea7addac..3997f409a 100644
--- a/tests/spark_like_test.py
+++ b/tests/spark_like_test.py
@@ -324,25 +324,6 @@ def test_sumh_all(pyspark_constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-# copied from tests/expr_and_series/any_all_test.py
-def test_any_all(pyspark_constructor: Constructor) -> None:
-    df = nw.from_native(
-        pyspark_constructor(
-            {
-                "a": [True, False, True],
-                "b": [True, True, True],
-                "c": [False, False, False],
-            }
-        )
-    )
-    result = df.select(nw.col("a", "b", "c").all())
-    expected = {"a": [False], "b": [True], "c": [False]}
-    assert_equal_data(result, expected)
-    result = df.select(nw.all().any())
-    expected = {"a": [True], "b": [True], "c": [False]}
-    assert_equal_data(result, expected)
-
-
 # copied from tests/expr_and_series/count_test.py
 def test_count(pyspark_constructor: Constructor) -> None:
     data = {"a": [1, 2, 3], "b": [4, None, 6], "z": [7.0, None, None]}
diff --git a/tests/stable_api_test.py b/tests/stable_api_test.py
index fd08f575c..862c5966f 100644
--- a/tests/stable_api_test.py
+++ b/tests/stable_api_test.py
@@ -13,7 +13,11 @@
 from tests.utils import assert_equal_data


-def test_renamed_taxicab_norm(constructor: Constructor) -> None:
+def test_renamed_taxicab_norm(
+    constructor: Constructor, request: pytest.FixtureRequest
+) -> None:
+    if ("pyspark" in str(constructor)) or "duckdb" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
     # Suppose we need to rename `_l1_norm` to `_taxicab_norm`.
     # We need `narwhals.stable.v1` to stay stable. So, we
     # make the change in `narwhals`, and then add the new method
@@ -42,10 +46,15 @@ def test_renamed_taxicab_norm(constructor: Constructor) -> None:
     assert_equal_data(result, expected)


-def test_renamed_taxicab_norm_dataframe(constructor: Constructor) -> None:
+def test_renamed_taxicab_norm_dataframe(
+    request: pytest.FixtureRequest, constructor: Constructor
+) -> None:
     # Suppose we have `DataFrame._l1_norm` in `stable.v1`, but remove it
     # in the main namespace. Here, we check that it's still usable from
     # the stable api.
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     def func(df_any: Any) -> Any:
         df = nw_v1.from_native(df_any)
         df = df._l1_norm()
@@ -56,10 +65,16 @@ def func(df_any: Any) -> Any:
     assert_equal_data(result, expected)


-def test_renamed_taxicab_norm_dataframe_narwhalify(constructor: Constructor) -> None:
+def test_renamed_taxicab_norm_dataframe_narwhalify(
+    request: pytest.FixtureRequest, constructor: Constructor
+) -> None:
     # Suppose we have `DataFrame._l1_norm` in `stable.v1`, but remove it
     # in the main namespace. Here, we check that it's still usable from
     # the stable api when using `narwhalify`.
+
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     @nw_v1.narwhalify
     def func(df: Any) -> Any:
         return df._l1_norm()
@@ -132,7 +147,10 @@ def test_series_docstrings() -> None:
         ), item


-def test_dtypes(constructor: Constructor) -> None:
+def test_dtypes(request: pytest.FixtureRequest, constructor: Constructor) -> None:
+    if "pyspark" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
     df = nw_v1.from_native(
         constructor({"a": [1], "b": [datetime(2020, 1, 1)], "c": [timedelta(1)]})
     )
diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py
index fd2a7d24c..cb6d48548 100644
--- a/tests/tpch_q1_test.py
+++ b/tests/tpch_q1_test.py
@@ -10,6 +10,7 @@
 import pytest

 import narwhals.stable.v1 as nw
+from tests.utils import DASK_VERSION
 from tests.utils import PANDAS_VERSION
 from tests.utils import assert_equal_data

@@ -20,6 +21,8 @@
 )
 @pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning")
 def test_q1(library: str, request: pytest.FixtureRequest) -> None:
+    if library == "dask" and DASK_VERSION < (2024, 10):
+        request.applymarker(pytest.mark.xfail)
     if library == "pandas" and PANDAS_VERSION < (1, 5):
         request.applymarker(pytest.mark.xfail)
     elif library == "pandas":
diff --git a/tests/utils.py b/tests/utils.py
index 34f1bfa1e..2d41d6782 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -11,6 +11,7 @@

 import pandas as pd

+from narwhals.translate import from_native
 from narwhals.typing import IntoDataFrame
 from narwhals.typing import IntoFrame
 from narwhals.utils import Implementation
@@ -32,6 +33,7 @@ def get_module_version_as_tuple(module_name: str) -> tuple[int, ...]:
 IBIS_VERSION: tuple[int, ...] = get_module_version_as_tuple("ibis")
 NUMPY_VERSION: tuple[int, ...] = get_module_version_as_tuple("numpy")
 PANDAS_VERSION: tuple[int, ...] = get_module_version_as_tuple("pandas")
+DUCKDB_VERSION: tuple[int, ...] = get_module_version_as_tuple("duckdb")
 POLARS_VERSION: tuple[int, ...] = get_module_version_as_tuple("polars")
 DASK_VERSION: tuple[int, ...] = get_module_version_as_tuple("dask")
 PYARROW_VERSION: tuple[int, ...] = get_module_version_as_tuple("pyarrow")
@@ -72,7 +74,12 @@ def assert_equal_data(result: Any, expected: dict[str, Any]) -> None:
         hasattr(result, "_compliant_frame")
         and result.implementation is Implementation.PYSPARK
     )
-
+    is_duckdb = (
+        hasattr(result, "_compliant_frame")
+        and result._compliant_frame._implementation is Implementation.DUCKDB
+    )
+    if is_duckdb:
+        result = from_native(result.to_native().arrow())
     if hasattr(result, "collect"):
         if result.implementation is Implementation.POLARS and os.environ.get(
             "NARWHALS_POLARS_GPU", False
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 26bd2ecf9..e999696d3 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -13,6 +13,7 @@
 from pandas.testing import assert_series_equal

 import narwhals.stable.v1 as nw
+from narwhals.utils import parse_version
 from tests.utils import PANDAS_VERSION
 from tests.utils import get_module_version_as_tuple

@@ -271,3 +272,15 @@ def test_generate_temporary_column_name_raise() -> None:
         match="Internal Error: Narwhals was not able to generate a column name with ",
     ):
         nw.generate_temporary_column_name(n_bytes=1, columns=columns)
+
+
+@pytest.mark.parametrize(
+    ("version", "expected"),
+    [
+        ("2020.1.2", (2020, 1, 2)),
+        ("2020.1.2-dev123", (2020, 1, 2)),
+        ("3.0.0.dev0+618.gb552dc95c9", (3, 0, 0)),
+    ],
+)
+def test_parse_version(version: str, expected: tuple[int, ...]) -> None:
+    assert parse_version(version) == expected
diff --git a/tpch/execute.py b/tpch/execute.py
index fb5982c10..5209ad48e 100644
--- a/tpch/execute.py
+++ b/tpch/execute.py
@@ -5,6 +5,7 @@
 from pathlib import Path

 import dask.dataframe as dd
+import duckdb
 import pandas as pd
 import polars as pl
 import pyarrow as pa
@@ -13,6 +14,7 @@

 pd.options.mode.copy_on_write = True
 pd.options.future.infer_string = True
+pl.Config.set_fmt_float("full")

 DATA_DIR = Path("data")
 LINEITEM_PATH = DATA_DIR / "lineitem.parquet"
@@ -28,14 +30,18 @@
     "pandas[pyarrow]": (pd, {"engine": "pyarrow", "dtype_backend": "pyarrow"}),
     "polars[lazy]": (pl, {}),
     "pyarrow": (pa, {}),
+    "duckdb": (duckdb, {}),
     "dask": (dd, {"engine": "pyarrow", "dtype_backend": "pyarrow"}),
 }

 BACKEND_COLLECT_FUNC_MAP = {
     "polars[lazy]": lambda x: x.collect(),
+    "duckdb": lambda x: x.pl(),
     "dask": lambda x: x.compute(),
 }

+DUCKDB_SKIPS = ["q14", "q15"]
+
 QUERY_DATA_PATH_MAP = {
     "q1": (LINEITEM_PATH,),
     "q2": (REGION_PATH, NATION_PATH, SUPPLIER_PATH, PART_PATH, PARTSUPP_PATH),
@@ -89,10 +95,14 @@ def execute_query(query_id: str) -> None:
     data_paths = QUERY_DATA_PATH_MAP[query_id]

     for backend, (native_namespace, kwargs) in BACKEND_NAMESPACE_KWARGS_MAP.items():
+        if backend == "duckdb" and query_id in DUCKDB_SKIPS:
+            print(f"\nSkipping {query_id} for DuckDB")  # noqa: T201
+            continue
+
         print(f"\nRunning {query_id} with {backend=}")  # noqa: T201
         result = query_module.query(
             *(
-                nw.scan_parquet(path, native_namespace=native_namespace, **kwargs)
+                nw.scan_parquet(str(path), native_namespace=native_namespace, **kwargs)
                 for path in data_paths
             )
         )
diff --git a/utils/generate_backend_completeness.py b/utils/generate_backend_completeness.py
index 2ab8d2187..397c8e4d6 100644
--- a/utils/generate_backend_completeness.py
+++ b/utils/generate_backend_completeness.py
@@ -31,9 +31,10 @@ class Backend(NamedTuple):
 MODULES = ["dataframe", "series", "expr"]

 BACKENDS = [
-    Backend(name="pandas-like", module="_pandas_like", type_=BackendType.EAGER),
     Backend(name="arrow", module="_arrow", type_=BackendType.EAGER),
     Backend(name="dask", module="_dask", type_=BackendType.LAZY),
+    Backend(name="duckdb", module="_duckdb", type_=BackendType.LAZY),
module="_duckdb", type_=BackendType.LAZY), + Backend(name="pandas-like", module="_pandas_like", type_=BackendType.EAGER), Backend(name="spark-like", module="_spark_like", type_=BackendType.LAZY), ] @@ -55,6 +56,7 @@ def parse_module(module_name: str, backend: str, nw_class_name: str) -> list[str inspect.isclass(c) and c.__name__.endswith(nw_class_name) and not c.__name__.startswith("Compliant") # Exclude protocols + and not c.__name__.startswith("DuckDBInterchange") ), ) diff --git a/utils/import_check.py b/utils/import_check.py index eee35dfc4..bac54aff7 100644 --- a/utils/import_check.py +++ b/utils/import_check.py @@ -23,6 +23,7 @@ "_arrow": {"pyarrow", "pyarrow.compute", "pyarrow.parquet"}, "_dask": {"dask.dataframe", "pandas", "dask_expr"}, "_polars": {"polars"}, + "_duckdb": {"duckdb"}, } @@ -63,6 +64,7 @@ def visit_ImportFrom(self, node: ast.ImportFrom) -> None: # noqa: N802 if ( node.module in BANNED_IMPORTS and "# ignore-banned-import" not in self.lines[node.lineno - 1] + and node.module not in self.allowed_imports ): print( # noqa: T201 f"{self.file_name}:{node.lineno}:{node.col_offset}: found {node.module} import"