
Commit

Merge branch 'main' into feat/expr-rank
FBruzzesi authored Jan 7, 2025
2 parents b68f575 + 74dd9db commit 585d0d6
Showing 150 changed files with 5,750 additions and 2,621 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/extremes.yml
@@ -61,7 +61,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-pretty-old-versions
- run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.3.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
+ run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
@@ -75,7 +75,7 @@ jobs:
echo "$DEPS" | grep 'polars==0.20.3'
echo "$DEPS" | grep 'numpy==1.17.5'
echo "$DEPS" | grep 'pyarrow==11.0.0'
echo "$DEPS" | grep 'pyspark==3.3.0'
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.5.0'
echo "$DEPS" | grep 'scikit-learn==1.1.0'
- name: Run pytest
@@ -84,7 +84,7 @@
not_so_old_versions:
strategy:
matrix:
python-version: ["3.9"]
python-version: ["3.10"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -99,7 +99,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-not-so-old-versions
- run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system
+ run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
@@ -110,11 +110,11 @@
echo "$DEPS" | grep 'pandas==2.0.3'
echo "$DEPS" | grep 'polars==0.20.8'
echo "$DEPS" | grep 'numpy==1.24.4'
echo "$DEPS" | grep 'pyarrow==14.0.0'
echo "$DEPS" | grep 'pyspark==3.4.0'
echo "$DEPS" | grep 'pyarrow==15.0.0'
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.8.0'
echo "$DEPS" | grep 'scikit-learn==1.3.0'
echo "$DEPS" | grep 'dask==2024.7'
echo "$DEPS" | grep 'dask==2024.10'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask

4 changes: 2 additions & 2 deletions .github/workflows/pytest.yml
@@ -59,7 +59,7 @@ jobs:
pytest-full-coverage:
strategy:
matrix:
python-version: ["3.9", "3.11", "3.13"]
python-version: ["3.11", "3.13"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -78,7 +78,7 @@ jobs:
- name: install pyspark
run: uv pip install -e ".[pyspark]" --system
# PySpark is not yet available on Python3.12+
- if: matrix.python-version == '3.9' || matrix.python-version == '3.11'
+ if: matrix.python-version != '3.13'
- name: install ibis
run: uv pip install -e ".[ibis]" --system
# Ibis puts upper bounds on dependencies, and requires Python3.10+,
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@ ci:
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
- rev: 'v0.8.1'
+ rev: 'v0.8.6'
hooks:
# Run the formatter.
- id: ruff-format
@@ -14,7 +14,7 @@ repos:
alias: check-docstrings
entry: python utils/check_docstrings.py
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: 'v1.13.0'
+ rev: 'v1.14.1'
hooks:
- id: mypy
additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2']
3 changes: 1 addition & 2 deletions README.md
@@ -14,8 +14,7 @@
Extremely lightweight and extensible compatibility layer between dataframe libraries!

- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- - **Lazy-only support**: Dask
- - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol
+ - **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.

Seamlessly support all, without depending on any!

1 change: 1 addition & 0 deletions docs/api-reference/expr.md
@@ -32,6 +32,7 @@
- is_first_distinct
- is_in
- is_last_distinct
+ - is_nan
- is_null
- is_unique
- len
1 change: 1 addition & 0 deletions docs/api-reference/expr_dt.md
@@ -23,6 +23,7 @@
- total_nanoseconds
- total_seconds
- to_string
+ - weekday
- year
show_source: false
show_bases: false
1 change: 1 addition & 0 deletions docs/api-reference/series.md
@@ -38,6 +38,7 @@
- is_first_distinct
- is_in
- is_last_distinct
+ - is_nan
- is_null
- is_sorted
- is_unique
1 change: 1 addition & 0 deletions docs/api-reference/series_dt.md
@@ -23,6 +23,7 @@
- total_nanoseconds
- total_seconds
- to_string
+ - weekday
- year
show_source: false
show_bases: false
11 changes: 8 additions & 3 deletions docs/backcompat.md
@@ -96,12 +96,13 @@ Anything currently in `narwhals.stable.v1` will not be changed or removed in fut

Here are exceptions to our backwards compatibility policy:

- - unambiguous bugs. If a function contains what is unambiguously a bug, then we'll fix it, without
+ - Unambiguous bugs. If a function contains what is unambiguously a bug, then we'll fix it, without
considering that to be a breaking change.
- - radical changes in backends. Suppose that Polars was to remove
+ - Radical changes in backends. Suppose that Polars were to remove
expressions, or pandas were to remove support for categorical data. At that point, we might
need to rethink Narwhals. However, we expect such radical changes to be exceedingly unlikely.
- - we may consider making some type hints more precise.
+ - We may consider making some type hints more precise.
+ - Anything labelled "unstable".

In general, decisions are driven by use-cases, and we conduct a search of public GitHub repositories
before making any change.
@@ -110,6 +111,10 @@ before making any change.

### After `stable.v1`


+ - Since Narwhals 1.21, passing a `DuckDBPyRelation` to `from_native` returns a `LazyFrame`. In
+   `narwhals.stable.v1`, it returns a `DataFrame` with `level='interchange'`.

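A minimal sketch of the difference described in the added note above (assuming `duckdb` is installed; the one-row relation is just an illustrative stand-in):

```python
import duckdb

import narwhals as nw
import narwhals.stable.v1 as nw_v1

rel = duckdb.sql("SELECT 1 AS a")  # a DuckDBPyRelation

print(type(nw.from_native(rel)))  # a narwhals LazyFrame (since 1.21)
print(type(nw_v1.from_native(rel)))  # a stable.v1 DataFrame with level='interchange'
```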
- Since Narwhals 1.15, `Series` is generic in the native Series, meaning that you can
write:
```python
18 changes: 13 additions & 5 deletions docs/basics/dataframe_conversion.md
@@ -14,6 +14,7 @@ To illustrate, we create dataframes in various formats:
```python exec="1" source="above" session="conversion"
import narwhals as nw
from narwhals.typing import IntoDataFrame
+ from typing import Any

import duckdb
import polars as pl
@@ -45,11 +46,17 @@

### Via PyCapsule Interface

- Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals.
+ Similarly, if your library uses Polars internally, you can use Narwhals to convert to Polars any user-supplied dataframe
+ which implements `__arrow_c_stream__`:

```python exec="1" source="above" session="conversion" result="python"
- def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-     return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native()
+ def df_to_polars(df_native: Any) -> pl.DataFrame:
+     if hasattr(df_native, "__arrow_c_stream__"):
+         return nw.from_arrow(df_native, native_namespace=pl).to_native()
+     msg = (
+         f"Expected object which implements '__arrow_c_stream__' got: {type(df_native)}"
+     )
+     raise TypeError(msg)


print(df_to_polars(df_duckdb)) # You can only execute this line of code once.
@@ -66,8 +73,9 @@ If you need to ingest the same dataframe multiple times, then you may want to go
This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving:

```python exec="1" source="above" session="conversion" result="python"
- def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-     return pl.DataFrame(nw.from_native(df).to_arrow())
+ def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame:
+     df = nw.from_native(df_native).lazy().collect()
+     return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow())


df_duckdb = duckdb.sql("SELECT * FROM df_polars")
7 changes: 3 additions & 4 deletions docs/extending.md
@@ -15,17 +15,16 @@ Currently, Narwhals has **full API** support for the following libraries:
It also has **lazy-only** support for [Dask](https://github.com/dask/dask), and **interchange** support
for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis).

+ We are working towards full "lazy-only" support for DuckDB, Ibis, and PySpark.

### Levels of support

Narwhals comes with three levels of support:

- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- - **Lazy-only support**: Dask
+ - **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.
- **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol

- The lazy-only layer is a major item on our 2025 roadmap, and hope to be able to bring libraries currently in
- the "interchange" level into that one.

Libraries for which we have full support can benefit from the whole
[Narwhals API](./api-reference/index.md).

2 changes: 1 addition & 1 deletion docs/installation.md
@@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute:
```python
>>> import narwhals
>>> narwhals.__version__
- '1.20.1'
+ '1.21.0'
```

If you see the version number, then the installation was successful!
45 changes: 45 additions & 0 deletions docs/pandas_like_concepts/null_handling.md
@@ -43,3 +43,48 @@ def check_null_behavior(df: IntoFrameT) -> IntoFrameT:
df = pa.table(data)
print(check_null_behavior(df))
```

+ Conversely, `is_nan` is consistent across backends: Narwhals defers to the native `is_nan`
+ implementations in Polars and PyArrow, while ensuring that pandas identifies only floating-point NaN values, not those which encode missing values.
+
+ ```python exec="1" source="above" session="null_handling"
+ import narwhals as nw
+ from narwhals.typing import IntoFrameT
+
+ data = {"a": [0.0, None, 2.0]}
+
+
+ def check_nan_behavior(df: IntoFrameT) -> IntoFrameT:
+     return (
+         nw.from_native(df)
+         .with_columns(
+             a_div_a=(nw.col("a") / nw.col("a")),
+             a_div_a_is_nan=(nw.col("a") / nw.col("a")).is_nan(),
+         )
+         .to_native()
+     )
+ ```
+
+ === "pandas"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import pandas as pd
+
+     df = pd.DataFrame(data).astype({"a": "Float64"})
+     print(check_nan_behavior(df))
+     ```
+
+ === "Polars (eager)"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import polars as pl
+
+     df = pl.DataFrame(data)
+     print(check_nan_behavior(df))
+     ```
+
+ === "PyArrow"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import pyarrow as pa
+
+     df = pa.table(data)
+     print(check_nan_behavior(df))
+     ```
2 changes: 1 addition & 1 deletion narwhals/__init__.py
@@ -79,7 +79,7 @@
from narwhals.utils import maybe_reset_index
from narwhals.utils import maybe_set_index

__version__ = "1.20.1"
__version__ = "1.21.0"

__all__ = [
"Array",
11 changes: 10 additions & 1 deletion narwhals/_arrow/dataframe.py
@@ -16,12 +16,14 @@
from narwhals._arrow.utils import validate_dataframe_comparand
from narwhals._expression_parsing import evaluate_into_exprs
from narwhals.dependencies import is_numpy_array
+ from narwhals.exceptions import ColumnNotFoundError
from narwhals.utils import Implementation
from narwhals.utils import flatten
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import scale_bytes
+ from narwhals.utils import validate_backend_version

if TYPE_CHECKING:
from types import ModuleType
@@ -56,6 +58,7 @@ def __init__(
self._implementation = Implementation.PYARROW
self._backend_version = backend_version
self._version = version
+ validate_backend_version(self._implementation, self._backend_version)

def __narwhals_namespace__(self: Self) -> ArrowNamespace:
from narwhals._arrow.namespace import ArrowNamespace
@@ -485,9 +488,12 @@ def with_row_index(self: Self, name: str) -> Self:
import pyarrow as pa

df = self._native_frame
+ cols = self.columns

row_indices = pa.array(range(df.num_rows))
- return self._from_native_frame(df.append_column(name, row_indices))
+ return self._from_native_frame(
+     df.append_column(name, row_indices).select([name, *cols])
+ )

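A quick sketch of the user-facing effect of the `with_row_index` change above (hypothetical two-column table): the row-index column is now placed first instead of being appended last.

```python
import pyarrow as pa

import narwhals as nw

tbl = pa.table({"a": [1, 2], "b": [3, 4]})
df = nw.from_native(tbl, eager_only=True)

# The index column now leads the schema rather than trailing it.
print(df.with_row_index("index").columns)  # ['index', 'a', 'b']
```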
def filter(self: Self, *predicates: IntoArrowExpr, **constraints: Any) -> Self:
if (
@@ -664,6 +670,9 @@ def unique(
import pyarrow.compute as pc

df = self._native_frame
+ if subset is not None and any(x not in self.columns for x in subset):
+     msg = f"Column(s) {subset} not found in {self.columns}"
+     raise ColumnNotFoundError(msg)
subset = subset or self.columns

if keep in {"any", "first", "last"}:
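And a sketch of the new `unique` validation above (hypothetical one-column table): requesting a `subset` column that doesn't exist now raises `ColumnNotFoundError` up front, with the message constructed in the diff.

```python
import pyarrow as pa

import narwhals as nw
from narwhals.exceptions import ColumnNotFoundError

df = nw.from_native(pa.table({"a": [1, 1, 2]}), eager_only=True)

try:
    df.unique(subset=["b"])  # "b" is not a column of df
except ColumnNotFoundError as exc:
    print(exc)  # Column(s) ['b'] not found in ['a']
```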