# Patch summary (free text before the first "diff --git" header):
#   * PandasDataFrame / PandasSeries no longer call reset_index() on construction;
#     the reset_index helper is deleted from narwhals/pandas_like/utils.py.
#   * validate_column_comparand / validate_dataframe_comparand take the left-hand
#     index as a new first argument and raise ValueError when a multi-element
#     PandasSeries carries a different index (no automatic alignment).
#   * Tests read the parquet fixture with pandas directly; tpch/q1.py drops a
#     stray breakpoint().
# NOTE(review): line breaks and leading indentation were reconstructed from a
# whitespace-collapsed copy; blank context lines were placed so each hunk matches
# its "@@" line counts. Verify against the original commit before applying.
diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py
index 2406cbecf..87ec52ef5 100644
--- a/narwhals/dataframe.py
+++ b/narwhals/dataframe.py
@@ -20,7 +20,6 @@
     from narwhals.group_by import GroupBy
     from narwhals.series import Series
     from narwhals.typing import IntoExpr
-    from narwhals.typing import T
 
 
 class BaseFrame:
@@ -208,7 +207,7 @@ def to_dict(self, *, as_series: bool = True) -> dict[str, Any]:
 class LazyFrame(BaseFrame):
     def __init__(
         self,
-        df: T,
+        df: Any,
         *,
         implementation: str | None = None,
     ) -> None:
diff --git a/narwhals/pandas_like/dataframe.py b/narwhals/pandas_like/dataframe.py
index 41babe4af..bbfdfc74b 100644
--- a/narwhals/pandas_like/dataframe.py
+++ b/narwhals/pandas_like/dataframe.py
@@ -8,7 +8,6 @@
 
 from narwhals.pandas_like.utils import evaluate_into_exprs
 from narwhals.pandas_like.utils import horizontal_concat
-from narwhals.pandas_like.utils import reset_index
 from narwhals.pandas_like.utils import translate_dtype
 from narwhals.pandas_like.utils import validate_dataframe_comparand
 from narwhals.utils import flatten_str
@@ -33,17 +32,18 @@ def __init__(
         implementation: str,
     ) -> None:
         self._validate_columns(dataframe.columns)
-        self._dataframe = reset_index(dataframe)
+        self._dataframe = dataframe
         self._implementation = implementation
 
     def _validate_columns(self, columns: Sequence[str]) -> None:
-        counter = collections.Counter(columns)
-        for col, count in counter.items():
-            if count > 1:
-                msg = f"Expected unique column names, got {col!r} {count} time(s)"
-                raise ValueError(
-                    msg,
-                )
+        if len(columns) != len(set(columns)):
+            counter = collections.Counter(columns)
+            for col, count in counter.items():
+                if count > 1:
+                    msg = f"Expected unique column names, got {col!r} {count} time(s)"
+                    raise ValueError(
+                        msg,
+                    )
 
     def _validate_booleanness(self) -> None:
         if not (
@@ -102,7 +102,7 @@ def filter(
         expr = plx.all_horizontal(*predicates)
         # Safety: all_horizontal's expression only returns a single column.
         mask = expr._call(self)[0]
-        _mask = validate_dataframe_comparand(mask)
+        _mask = validate_dataframe_comparand(self._dataframe.index, mask)
         return self._from_dataframe(self._dataframe.loc[_mask])
 
     def with_columns(
@@ -112,7 +112,10 @@ def with_columns(
     ) -> Self:
         new_series = evaluate_into_exprs(self, *exprs, **named_exprs)
         df = self._dataframe.assign(
-            **{series.name: validate_dataframe_comparand(series) for series in new_series}
+            **{
+                series.name: validate_dataframe_comparand(self._dataframe.index, series)
+                for series in new_series
+            }
         )
         return self._from_dataframe(df)
 
@@ -137,9 +140,7 @@ def sort(
             ascending: bool | list[bool] = not descending
         else:
             ascending = [not d for d in descending]
-        return self._from_dataframe(
-            df.sort_values(flat_keys, ascending=ascending),
-        )
+        return self._from_dataframe(df.sort_values(flat_keys, ascending=ascending))
 
     # --- convert ---
     def collect(self) -> PandasDataFrame:
diff --git a/narwhals/pandas_like/series.py b/narwhals/pandas_like/series.py
index c0626ab69..59a1e0bcc 100644
--- a/narwhals/pandas_like/series.py
+++ b/narwhals/pandas_like/series.py
@@ -7,7 +7,6 @@
 from pandas.api.types import is_extension_array_dtype
 
 from narwhals.pandas_like.utils import item
-from narwhals.pandas_like.utils import reset_index
 from narwhals.pandas_like.utils import reverse_translate_dtype
 from narwhals.pandas_like.utils import translate_dtype
 from narwhals.pandas_like.utils import validate_column_comparand
@@ -32,7 +31,7 @@ def __init__(
         """
 
         self._name = str(series.name) if series.name is not None else ""
-        self._series = reset_index(series)
+        self._series = series
         self._implementation = implementation
 
     def _from_series(self, series: Any) -> Self:
@@ -70,7 +69,9 @@ def cast(
 
     def filter(self, mask: Self) -> Self:
         ser = self._series
-        return self._from_series(ser.loc[validate_column_comparand(mask)])
+        return self._from_series(
+            ser.loc[validate_column_comparand(self._series.index, mask)]
+        )
 
     def item(self) -> Any:
         return item(self._series)
@@ -93,122 +94,122 @@ def is_in(self, other: Any) -> PandasSeries:
 
     def __eq__(self, other: object) -> PandasSeries:  # type: ignore[override]
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__eq__(other)).rename(ser.name, copy=False))
 
     def __ne__(self, other: object) -> PandasSeries:  # type: ignore[override]
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__ne__(other)).rename(ser.name, copy=False))
 
     def __ge__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__ge__(other)).rename(ser.name, copy=False))
 
     def __gt__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__gt__(other)).rename(ser.name, copy=False))
 
     def __le__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__le__(other)).rename(ser.name, copy=False))
 
     def __lt__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__lt__(other)).rename(ser.name, copy=False))
 
     def __and__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__and__(other)).rename(ser.name, copy=False))
 
     def __rand__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rand__(other)).rename(ser.name, copy=False))
 
     def __or__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__or__(other)).rename(ser.name, copy=False))
 
     def __ror__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__ror__(other)).rename(ser.name, copy=False))
 
     def __add__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__add__(other)).rename(ser.name, copy=False))
 
     def __radd__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__radd__(other)).rename(ser.name, copy=False))
 
     def __sub__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__sub__(other)).rename(ser.name, copy=False))
 
     def __rsub__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rsub__(other)).rename(ser.name, copy=False))
 
     def __mul__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__mul__(other)).rename(ser.name, copy=False))
 
     def __rmul__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rmul__(other)).rename(ser.name, copy=False))
 
     def __truediv__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__truediv__(other)).rename(ser.name, copy=False))
 
     def __rtruediv__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rtruediv__(other)).rename(ser.name, copy=False))
 
     def __floordiv__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__floordiv__(other)).rename(ser.name, copy=False))
 
     def __rfloordiv__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rfloordiv__(other)).rename(ser.name, copy=False))
 
     def __pow__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__pow__(other)).rename(ser.name, copy=False))
 
     def __rpow__(self, other: Any) -> PandasSeries:  # pragma: no cover
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rpow__(other)).rename(ser.name, copy=False))
 
     def __mod__(self, other: Any) -> PandasSeries:
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__mod__(other)).rename(ser.name, copy=False))
 
     def __rmod__(self, other: Any) -> PandasSeries:  # pragma: no cover
         ser = self._series
-        other = validate_column_comparand(other)
+        other = validate_column_comparand(self._series.index, other)
         return self._from_series((ser.__rmod__(other)).rename(ser.name, copy=False))
 
     # Unary
@@ -285,8 +286,8 @@ def n_unique(self) -> int:
         return ser.nunique()  # type: ignore[no-any-return]
 
     def zip_with(self, mask: PandasSeries, other: PandasSeries) -> PandasSeries:
-        mask = validate_column_comparand(mask)
-        other = validate_column_comparand(other)
+        mask = validate_column_comparand(self._series.index, mask)
+        other = validate_column_comparand(self._series.index, other)
         ser = self._series
         return self._from_series(ser.where(mask, other))
 
diff --git a/narwhals/pandas_like/utils.py b/narwhals/pandas_like/utils.py
index 7f0351003..3cdf673d1 100644
--- a/narwhals/pandas_like/utils.py
+++ b/narwhals/pandas_like/utils.py
@@ -23,7 +23,7 @@
     from narwhals.pandas_like.typing import IntoPandasExpr
 
 
-def validate_column_comparand(other: Any) -> Any:
+def validate_column_comparand(index: Any, other: Any) -> Any:
     """Validate RHS of binary operation.
 
     If the comparison isn't supported, return `NotImplemented` so that the
@@ -47,11 +47,17 @@ def validate_column_comparand(other: Any) -> Any:
         if other.len() == 1:
             # broadcast
             return other.item()
+        if other._series.index is not index and not (other._series.index == index).all():
+            msg = (
+                "Narwhals does not support automated index alignment. "
+                "Please reset the index of the Series or DataFrame."
+            )
+            raise ValueError(msg)
         return other._series
     return other
 
 
-def validate_dataframe_comparand(other: Any) -> Any:
+def validate_dataframe_comparand(index: Any, other: Any) -> Any:
     """Validate RHS of binary operation.
 
     If the comparison isn't supported, return `NotImplemented` so that the
@@ -60,19 +66,25 @@ def validate_dataframe_comparand(other: Any) -> Any:
     from narwhals.pandas_like.dataframe import PandasDataFrame
     from narwhals.pandas_like.series import PandasSeries
 
-    if isinstance(other, list) and len(other) > 1:
-        # e.g. `plx.all() + plx.all()`
-        msg = "Multi-output expressions are not supported in this context"
-        raise ValueError(msg)
-    if isinstance(other, list):
-        other = other[0]
     if isinstance(other, PandasDataFrame):
         return NotImplemented
     if isinstance(other, PandasSeries):
         if other.len() == 1:
             # broadcast
             return item(other._series)
+        if other._series.index is not index and not (other._series.index == index).all():
+            msg = (
+                "Narwhals does not support automated index alignment. "
+                "Please reset the index of the Series or DataFrame."
+            )
+            raise ValueError(msg)
         return other._series
+    if isinstance(other, list) and len(other) > 1:
+        # e.g. `plx.all() + plx.all()`
+        msg = "Multi-output expressions are not supported in this context"
+        raise ValueError(msg)
+    if isinstance(other, list):
+        other = other[0]
     return other
 
 
@@ -368,17 +380,3 @@ def reverse_translate_dtype(dtype: DType | type[DType]) -> Any:
         return "bool"
     msg = f"Unknown dtype: {dtype}"
     raise TypeError(msg)
-
-
-def reset_index(obj: Any) -> Any:
-    index = obj.index
-    if (
-        hasattr(index, "start")
-        and hasattr(index, "stop")
-        and hasattr(index, "step")
-        and index.start == 0
-        and index.stop == len(obj)
-        and index.step == 1
-    ):
-        return obj
-    return obj.reset_index(drop=True)
diff --git a/tests/test_common.py b/tests/test_common.py
index c1034b738..78c160d6a 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -233,6 +233,7 @@ def test_accepted_dataframes() -> None:
 
 
 @pytest.mark.parametrize("df_raw", [df_polars, df_pandas, df_mpd])
+@pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning")
 def test_convert_pandas(df_raw: Any) -> None:
     result = nw.DataFrame(df_raw).to_pandas()
     expected = pd.DataFrame({"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]})
diff --git a/tests/tpch_q1_test.py b/tests/tpch_q1_test.py
index e07dbea2a..a5d421c67 100644
--- a/tests/tpch_q1_test.py
+++ b/tests/tpch_q1_test.py
@@ -2,10 +2,10 @@
 
 import os
 from datetime import datetime
-from typing import Any
 from unittest import mock
 
-import polars
+import pandas as pd
+import polars as pl
 import pytest
 
 import narwhals as nw
@@ -13,13 +13,16 @@
 
 
 @pytest.mark.parametrize(
-    "df_raw",
-    [
-        (polars.read_parquet("tests/data/lineitem.parquet").to_pandas()),
-        polars.scan_parquet("tests/data/lineitem.parquet"),
-    ],
+    "library",
+    ["pandas", "polars"],
 )
-def test_q1(df_raw: Any) -> None:
+@pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning")
+def test_q1(library: str) -> None:
+    if library == "pandas":
+        df_raw = pd.read_parquet("tests/data/lineitem.parquet")
+        df_raw["l_shipdate"] = pd.to_datetime(df_raw["l_shipdate"])
+    elif library == "polars":
+        df_raw = pl.scan_parquet("tests/data/lineitem.parquet")
     var_1 = datetime(1998, 9, 2)
     df = nw.LazyFrame(df_raw)
     query_result = (
@@ -73,14 +76,11 @@ def test_q1(df_raw: Any) -> None:
     compare_dicts(result, expected)
 
 
-@pytest.mark.parametrize(
-    "df_raw",
-    [
-        (polars.read_parquet("tests/data/lineitem.parquet").to_pandas()),
-    ],
-)
 @mock.patch.dict(os.environ, {"NARWHALS_FORCE_GENERIC": "1"})
-def test_q1_w_pandas_agg_generic_path(df_raw: Any) -> None:
+@pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning")
+def test_q1_w_pandas_agg_generic_path() -> None:
+    df_raw = pd.read_parquet("tests/data/lineitem.parquet")
+    df_raw["l_shipdate"] = pd.to_datetime(df_raw["l_shipdate"])
     var_1 = datetime(1998, 9, 2)
     df = nw.LazyFrame(df_raw)
     query_result = (
diff --git a/tpch/q1.py b/tpch/q1.py
index 965069548..e634cec47 100644
--- a/tpch/q1.py
+++ b/tpch/q1.py
@@ -39,9 +39,7 @@ def q1(df_raw: Any) -> Any:
     return nw.to_native(result.collect())
 
 
-df = pd.read_parquet("../tpch-data/s1/lineitem.parquet", dtype_backend="pyarrow")
-breakpoint()
-# df["l_shipdate"] = pd.to_datetime(df["l_shipdate"])
-print(q1(df))
-df = polars.scan_parquet("../tpch-data/s1/lineitem.parquet")
+df = pd.read_parquet(
+    "../tpch-data/s1/lineitem.parquet", dtype_backend="pyarrow", engine="pyarrow"
+)
 print(q1(df))