
Commit

Merge branch 'main' into feat/expr-rank
FBruzzesi authored Jan 7, 2025
2 parents b68f575 + 74dd9db commit 585d0d6
Showing 150 changed files with 5,750 additions and 2,621 deletions.
14 changes: 7 additions & 7 deletions .github/workflows/extremes.yml
@@ -61,7 +61,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-pretty-old-versions
- run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.3.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
+ run: uv pip install pipdeptree tox virtualenv setuptools pandas==1.1.5 polars==0.20.3 numpy==1.17.5 pyarrow==11.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.5.0 scikit-learn==1.1.0 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
@@ -75,7 +75,7 @@ jobs:
echo "$DEPS" | grep 'polars==0.20.3'
echo "$DEPS" | grep 'numpy==1.17.5'
echo "$DEPS" | grep 'pyarrow==11.0.0'
echo "$DEPS" | grep 'pyspark==3.3.0'
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.5.0'
echo "$DEPS" | grep 'scikit-learn==1.1.0'
- name: Run pytest
@@ -84,7 +84,7 @@
not_so_old_versions:
strategy:
matrix:
python-version: ["3.9"]
python-version: ["3.10"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -99,7 +99,7 @@ jobs:
cache-suffix: ${{ matrix.python-version }}
cache-dependency-glob: "pyproject.toml"
- name: install-not-so-old-versions
- run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 "pyarrow-stubs<17" pyspark==3.4.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system
+ run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==15.0.0 "pyarrow-stubs<17" pyspark==3.5.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.10 tzdata --system
- name: install-reqs
run: uv pip install -e ".[dev]" --system
- name: show-deps
@@ -110,11 +110,11 @@
echo "$DEPS" | grep 'pandas==2.0.3'
echo "$DEPS" | grep 'polars==0.20.8'
echo "$DEPS" | grep 'numpy==1.24.4'
echo "$DEPS" | grep 'pyarrow==14.0.0'
echo "$DEPS" | grep 'pyspark==3.4.0'
echo "$DEPS" | grep 'pyarrow==15.0.0'
echo "$DEPS" | grep 'pyspark==3.5.0'
echo "$DEPS" | grep 'scipy==1.8.0'
echo "$DEPS" | grep 'scikit-learn==1.3.0'
echo "$DEPS" | grep 'dask==2024.7'
echo "$DEPS" | grep 'dask==2024.10'
- name: Run pytest
run: pytest tests --cov=narwhals --cov=tests --cov-fail-under=50 --runslow --constructors=pandas,pyarrow,polars[eager],polars[lazy],dask

4 changes: 2 additions & 2 deletions .github/workflows/pytest.yml
@@ -59,7 +59,7 @@ jobs:
pytest-full-coverage:
strategy:
matrix:
python-version: ["3.9", "3.11", "3.13"]
python-version: ["3.11", "3.13"]
os: [ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
@@ -78,7 +78,7 @@ jobs:
- name: install pyspark
run: uv pip install -e ".[pyspark]" --system
# PySpark is not yet available on Python3.12+
- if: matrix.python-version == '3.9' || matrix.python-version == '3.11'
+ if: matrix.python-version != '3.13'
- name: install ibis
run: uv pip install -e ".[ibis]" --system
# Ibis puts upper bounds on dependencies, and requires Python3.10+,
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -3,7 +3,7 @@ ci:
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
- rev: 'v0.8.1'
+ rev: 'v0.8.6'
hooks:
# Run the formatter.
- id: ruff-format
@@ -14,7 +14,7 @@ repos:
alias: check-docstrings
entry: python utils/check_docstrings.py
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: 'v1.13.0'
+ rev: 'v1.14.1'
hooks:
- id: mypy
additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2']
3 changes: 1 addition & 2 deletions README.md
@@ -14,8 +14,7 @@
Extremely lightweight and extensible compatibility layer between dataframe libraries!

- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- - **Lazy-only support**: Dask
- - **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol
+ - **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.

Seamlessly support all, without depending on any!

1 change: 1 addition & 0 deletions docs/api-reference/expr.md
@@ -32,6 +32,7 @@
- is_first_distinct
- is_in
- is_last_distinct
+ - is_nan
- is_null
- is_unique
- len
1 change: 1 addition & 0 deletions docs/api-reference/expr_dt.md
@@ -23,6 +23,7 @@
- total_nanoseconds
- total_seconds
- to_string
+ - weekday
- year
show_source: false
show_bases: false
1 change: 1 addition & 0 deletions docs/api-reference/series.md
@@ -38,6 +38,7 @@
- is_first_distinct
- is_in
- is_last_distinct
+ - is_nan
- is_null
- is_sorted
- is_unique
1 change: 1 addition & 0 deletions docs/api-reference/series_dt.md
@@ -23,6 +23,7 @@
- total_nanoseconds
- total_seconds
- to_string
+ - weekday
- year
show_source: false
show_bases: false
11 changes: 8 additions & 3 deletions docs/backcompat.md
@@ -96,12 +96,13 @@ Anything currently in `narwhals.stable.v1` will not be changed or removed in fut

Here are exceptions to our backwards compatibility policy:

- - unambiguous bugs. If a function contains what is unambiguously a bug, then we'll fix it, without
+ - Unambiguous bugs. If a function contains what is unambiguously a bug, then we'll fix it, without
considering that to be a breaking change.
- - radical changes in backends. Suppose that Polars was to remove
+ - Radical changes in backends. Suppose that Polars were to remove
expressions, or pandas were to remove support for categorical data. At that point, we might
need to rethink Narwhals. However, we expect such radical changes to be exceedingly unlikely.
- - we may consider making some type hints more precise.
+ - We may consider making some type hints more precise.
+ - Anything labelled "unstable".

In general, decisions are driven by use-cases, and we conduct a search of public GitHub repositories
before making any change.
@@ -110,6 +111,10 @@ before making any change.

### After `stable.v1`


+ - Since Narwhals 1.21, passing a `DuckDBPyRelation` to `from_native` returns a `LazyFrame`. In
+   `narwhals.stable.v1`, it returns a `DataFrame` with `level='interchange'`.

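A minimal sketch of the difference described in the added note above (assuming `duckdb` is installed; the one-row relation is just an illustrative stand-in):

```python
import duckdb

import narwhals as nw
import narwhals.stable.v1 as nw_v1

rel = duckdb.sql("SELECT 1 AS a")  # a DuckDBPyRelation

print(type(nw.from_native(rel)))  # a narwhals LazyFrame (since 1.21)
print(type(nw_v1.from_native(rel)))  # a stable.v1 DataFrame with level='interchange'
```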
- Since Narwhals 1.15, `Series` is generic in the native Series, meaning that you can
write:
```python
18 changes: 13 additions & 5 deletions docs/basics/dataframe_conversion.md
@@ -14,6 +14,7 @@ To illustrate, we create dataframes in various formats:
```python exec="1" source="above" session="conversion"
import narwhals as nw
from narwhals.typing import IntoDataFrame
+ from typing import Any

import duckdb
import polars as pl
@@ -45,11 +46,17 @@

### Via PyCapsule Interface

- Similarly, if your library uses Polars internally, you can convert any user-supplied dataframe to Polars format using Narwhals.
+ Similarly, if your library uses Polars internally, you can use Narwhals to convert to Polars any user-supplied dataframe
+ which implements `__arrow_c_stream__`:

```python exec="1" source="above" session="conversion" result="python"
- def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-     return nw.from_arrow(nw.from_native(df), native_namespace=pl).to_native()
+ def df_to_polars(df_native: Any) -> pl.DataFrame:
+     if hasattr(df_native, "__arrow_c_stream__"):
+         return nw.from_arrow(df_native, native_namespace=pl).to_native()
+     msg = (
+         f"Expected object which implements '__arrow_c_stream__' got: {type(df_native)}"
+     )
+     raise TypeError(msg)


print(df_to_polars(df_duckdb)) # You can only execute this line of code once.
@@ -66,8 +73,9 @@ If you need to ingest the same dataframe multiple times, then you may want to go
This may be less efficient than the PyCapsule approach above (and always requires PyArrow!), but is more forgiving:

```python exec="1" source="above" session="conversion" result="python"
- def df_to_polars(df: IntoDataFrame) -> pl.DataFrame:
-     return pl.DataFrame(nw.from_native(df).to_arrow())
+ def df_to_polars(df_native: IntoDataFrame) -> pl.DataFrame:
+     df = nw.from_native(df_native).lazy().collect()
+     return pl.DataFrame(nw.from_native(df, eager_only=True).to_arrow())


df_duckdb = duckdb.sql("SELECT * FROM df_polars")
7 changes: 3 additions & 4 deletions docs/extending.md
@@ -15,17 +15,16 @@ Currently, Narwhals has **full API** support for the following libraries:
It also has **lazy-only** support for [Dask](https://github.com/dask/dask), and **interchange** support
for [DuckDB](https://github.com/duckdb/duckdb) and [Ibis](https://github.com/ibis-project/ibis).

+ We are working towards full "lazy-only" support for DuckDB, Ibis, and PySpark.

### Levels of support

Narwhals comes with three levels of support:

- **Full API support**: cuDF, Modin, pandas, Polars, PyArrow
- - **Lazy-only support**: Dask
+ - **Lazy-only support**: Dask. Work in progress: DuckDB, Ibis, PySpark.
- **Interchange-level support**: DuckDB, Ibis, Vaex, anything which implements the DataFrame Interchange Protocol

- The lazy-only layer is a major item on our 2025 roadmap, and hope to be able to bring libraries currently in
- the "interchange" level into that one.

Libraries for which we have full support can benefit from the whole
[Narwhals API](./api-reference/index.md).

2 changes: 1 addition & 1 deletion docs/installation.md
@@ -30,7 +30,7 @@ To verify the installation, start the Python REPL and execute:
```python
>>> import narwhals
>>> narwhals.__version__
- '1.20.1'
+ '1.21.0'
```

If you see the version number, then the installation was successful!
45 changes: 45 additions & 0 deletions docs/pandas_like_concepts/null_handling.md
@@ -43,3 +43,48 @@ def check_null_behavior(df: IntoFrameT) -> IntoFrameT:
df = pa.table(data)
print(check_null_behavior(df))
```

+ Conversely, `is_nan` is consistent across backends: Narwhals defers to the native `is_nan`
+ implementations in Polars and PyArrow, while ensuring that pandas identifies only floating-point NaN values, not those which encode missing values.
+
+ ```python exec="1" source="above" session="null_handling"
+ import narwhals as nw
+ from narwhals.typing import IntoFrameT
+
+ data = {"a": [0.0, None, 2.0]}
+
+
+ def check_nan_behavior(df: IntoFrameT) -> IntoFrameT:
+     return (
+         nw.from_native(df)
+         .with_columns(
+             a_div_a=(nw.col("a") / nw.col("a")),
+             a_div_a_is_nan=(nw.col("a") / nw.col("a")).is_nan(),
+         )
+         .to_native()
+     )
+ ```
+
+ === "pandas"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import pandas as pd
+
+     df = pd.DataFrame(data).astype({"a": "Float64"})
+     print(check_nan_behavior(df))
+     ```
+
+ === "Polars (eager)"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import polars as pl
+
+     df = pl.DataFrame(data)
+     print(check_nan_behavior(df))
+     ```
+
+ === "PyArrow"
+     ```python exec="true" source="material-block" result="python" session="null_handling"
+     import pyarrow as pa
+
+     df = pa.table(data)
+     print(check_nan_behavior(df))
+     ```
2 changes: 1 addition & 1 deletion narwhals/__init__.py
@@ -79,7 +79,7 @@
from narwhals.utils import maybe_reset_index
from narwhals.utils import maybe_set_index

__version__ = "1.20.1"
__version__ = "1.21.0"

__all__ = [
"Array",
11 changes: 10 additions & 1 deletion narwhals/_arrow/dataframe.py
@@ -16,12 +16,14 @@
from narwhals._arrow.utils import validate_dataframe_comparand
from narwhals._expression_parsing import evaluate_into_exprs
from narwhals.dependencies import is_numpy_array
+ from narwhals.exceptions import ColumnNotFoundError
from narwhals.utils import Implementation
from narwhals.utils import flatten
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
from narwhals.utils import parse_columns_to_drop
from narwhals.utils import scale_bytes
+ from narwhals.utils import validate_backend_version

if TYPE_CHECKING:
from types import ModuleType
@@ -56,6 +58,7 @@ def __init__(
self._implementation = Implementation.PYARROW
self._backend_version = backend_version
self._version = version
+ validate_backend_version(self._implementation, self._backend_version)

def __narwhals_namespace__(self: Self) -> ArrowNamespace:
from narwhals._arrow.namespace import ArrowNamespace
@@ -485,9 +488,12 @@ def with_row_index(self: Self, name: str) -> Self:
import pyarrow as pa

df = self._native_frame
+ cols = self.columns

row_indices = pa.array(range(df.num_rows))
- return self._from_native_frame(df.append_column(name, row_indices))
+ return self._from_native_frame(
+     df.append_column(name, row_indices).select([name, *cols])
+ )

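A quick sketch of the user-facing effect of the `with_row_index` change above (hypothetical two-column table): the row-index column is now placed first instead of being appended last.

```python
import pyarrow as pa

import narwhals as nw

tbl = pa.table({"a": [1, 2], "b": [3, 4]})
df = nw.from_native(tbl, eager_only=True)

# The index column now leads the schema rather than trailing it.
print(df.with_row_index("index").columns)  # ['index', 'a', 'b']
```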
def filter(self: Self, *predicates: IntoArrowExpr, **constraints: Any) -> Self:
if (
@@ -664,6 +670,9 @@ def unique(
import pyarrow.compute as pc

df = self._native_frame
+ if subset is not None and any(x not in self.columns for x in subset):
+     msg = f"Column(s) {subset} not found in {self.columns}"
+     raise ColumnNotFoundError(msg)
subset = subset or self.columns

if keep in {"any", "first", "last"}:
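And a sketch of the new `unique` validation above (hypothetical one-column table): requesting a `subset` column that doesn't exist now raises `ColumnNotFoundError` up front, with the message constructed in the diff.

```python
import pyarrow as pa

import narwhals as nw
from narwhals.exceptions import ColumnNotFoundError

df = nw.from_native(pa.table({"a": [1, 1, 2]}), eager_only=True)

try:
    df.unique(subset=["b"])  # "b" is not a column of df
except ColumnNotFoundError as exc:
    print(exc)  # Column(s) ['b'] not found in ['a']
```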