Skip to content

Commit

Permalink
this it?
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Mar 14, 2024
1 parent 2bf5c2b commit f5ec176
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 102 deletions.
21 changes: 10 additions & 11 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,24 @@


def func(df_raw: Any) -> Any:
df = nw.NarwhalsFrame(df_raw)

print(df)
df = nw.DataFrame(df_raw)
res = df.with_columns(
d=nw.col("a") + 1,
e=nw.col("a") + nw.col("b"),
)

res = res.group_by("a").agg(nw.col("b").sum())
print(res)

res = res.group_by(["a"]).agg(
nw.col("b").sum(),
d=nw.col("c").sum(),
# e=nw.len(),
)
return nw.to_native(res)


import pandas as pd

# df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
# print(func(df))
df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
df = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
print(func(df))
df = pl.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
print(func(df))
df = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
df = pl.LazyFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
print(func(df).collect())
4 changes: 2 additions & 2 deletions narwhals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from narwhals.containers import is_pandas
from narwhals.containers import is_polars
from narwhals.containers import is_series
from narwhals.dataframe import NarwhalsFrame
from narwhals.dataframe import DataFrame
from narwhals.expression import col
from narwhals.expression import len
from narwhals.translate import get_namespace
Expand All @@ -27,5 +27,5 @@
"to_native",
"col",
"len",
"NarwhalsFrame",
"DataFrame",
]
108 changes: 43 additions & 65 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import annotations

from narwhals.pandas_like.utils import evaluate_into_exprs
from narwhals.pandas_like.dataframe import PandasDataFrame
from narwhals.polars import PolarsDataFrame
from narwhals.translate import get_pandas
from narwhals.translate import get_polars


def extract_native(obj: Any, implementation) -> Any:
from narwhals.expression import NarwhalsExpr
from narwhals.series import Series

# if isinstance(obj, NarwhalsExpr):
# return obj._call(pl.col)
Expand All @@ -17,14 +19,16 @@ def extract_native(obj: Any, implementation) -> Any:
return obj._call(pl.col)
# if isinstance(obj, DType):
# return obj._dtype
if isinstance(obj, NarwhalsFrame):
if isinstance(obj, DataFrame):
return obj._dataframe
if isinstance(obj, Series):
return obj._series
# if isinstance(obj, PolarsSeries):
# return obj._series
return obj


class NarwhalsFrame:
class DataFrame:
def __init__(
self, df, *, is_eager=False, is_lazy=False, implementation: str | None = None
):
Expand All @@ -35,24 +39,14 @@ def __init__(
self._implementation = implementation
return
if (pl := get_polars()) is not None:
if isinstance(df, pl.DataFrame):
if is_lazy:
raise ValueError(
"can't instantiate with `is_lazy` if you pass a polars DataFrame"
)
self._dataframe = df
self._implementation = "polars"
return
elif isinstance(df, pl.LazyFrame):
if is_eager:
raise ValueError(
"can't instantiate with `is_eager` if you pass a polars LazyFrame"
)
self._dataframe = df
if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
self._dataframe = PolarsDataFrame(df, is_eager=is_eager, is_lazy=is_lazy)
self._implementation = "polars"
return
if (pd := get_pandas()) is not None and isinstance(df, pd.DataFrame):
self._dataframe = df
self._dataframe = PandasDataFrame(
df, is_eager=is_eager, is_lazy=is_lazy, implementation="pandas"
)
self._implementation = "pandas"
return
raise TypeError(
Expand All @@ -68,73 +62,57 @@ def _from_dataframe(self, df: Any) -> Self:
implementation=self._implementation,
)

def _extract_native(self, obj):
return extract_native(obj, implementation=self._implementation)

def with_columns(
self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
) -> Self:
if self._implementation == "polars":
return self._from_dataframe(
self._dataframe.with_columns(
*[self._extract_native(v) for v in exprs],
**{
key: self._extract_native(value)
for key, value in named_exprs.items()
},
)
)
elif self._implementation == "pandas":
new_series = evaluate_into_exprs(self, *exprs, **named_exprs)
df = self._dataframe.assign(
**{series.name: series._series for series in new_series}
)
return self._from_dataframe(df)
return self._from_dataframe(
self._dataframe.with_columns(*exprs, **named_exprs),
)

def filter(self, *predicates: IntoExpr | Iterable[IntoExpr]) -> Self:
return self._from_dataframe(
self._dataframe.filter(*[self._extract_native(v) for v in predicates])
self._dataframe.filter(*predicates),
)

def group_by(self, *keys: str | Iterable[str]) -> GroupBy:
from narwhals.group_by import NarwhalsGroupBy

return NarwhalsGroupBy(
self,
*keys,
is_eager=self._is_eager,
is_lazy=self._is_lazy,
)
return NarwhalsGroupBy(self, *keys)

def sort(
self,
by: str | Iterable[str],
*more_by: str,
descending: bool | Sequence[bool] = False,
) -> Self:
if self._implementation == "polars":
return self._from_dataframe(
self._dataframe.sort(by, *more_by, descending=descending)
)
return self._from_dataframe(
self._dataframe.sort(by, *more_by, descending=descending)
)

def collect(self) -> Self:
if not self._is_lazy:
raise RuntimeError(
"DataFrame.collect can only be called if frame was instantiated with `is_lazy=True`"
)
if self._implementation == "polars":
import polars as pl

assert isinstance(self._dataframe, pl.LazyFrame)
return self.__class__(self._dataframe.collect(), is_eager=True, is_lazy=False)
return self.__class__(
self._dataframe.collect(),
is_eager=True,
is_lazy=False,
implementation=self._implementation,
)

def to_dict(self, *, as_series: bool = True) -> dict[str, Any]:
if not self._is_eager:
raise RuntimeError(
"DataFrame.to_dict can only be called if frame was instantiated with `is_eager=True`"
)
if self._implementation == "polars":
import polars as pl
return self._dataframe.to_dict(as_series=as_series)

assert isinstance(self._dataframe, pl.DataFrame)
return self._dataframe.to_dict(as_series=as_series)
def join(
    self,
    other: Self,
    *,
    how: Literal["inner"] = "inner",
    left_on: str | list[str],
    right_on: str | list[str],
) -> Self:
    """Join this frame with ``other`` and wrap the result as a new frame.

    The call is delegated to the ``join`` method of the wrapped
    ``self._dataframe`` object; ``other`` is unwrapped to its own
    ``_dataframe`` before being handed over.

    Args:
        other: Frame to join with. Only its ``_dataframe`` attribute is read.
        how: Join strategy, forwarded unchanged. The annotation declares
            ``"inner"`` as the only accepted value.
        left_on: Forwarded unchanged as the ``left_on`` keyword of the
            wrapped frame's ``join``.
        right_on: Forwarded unchanged as the ``right_on`` keyword of the
            wrapped frame's ``join``.

    Returns:
        The value produced by ``self._from_dataframe`` for the joined frame.
    """
    # The wrapped object only understands its own frame type, so pass the
    # right-hand side's inner ``_dataframe`` rather than the wrapper itself.
    joined = self._dataframe.join(
        other._dataframe,
        how=how,
        left_on=left_on,
        right_on=right_on,
    )
    return self._from_dataframe(joined)
32 changes: 15 additions & 17 deletions narwhals/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

from typing import Any

from narwhals.translate import get_polars


def extract_native(expr, other: Any) -> Any:
if isinstance(other, NarwhalsExpr):
Expand Down Expand Up @@ -97,46 +95,46 @@ def sum(self) -> Expr:
return self.__class__(lambda expr: self._call(expr).sum())

def min(self) -> Expr:
return self.__class__(self._expr.min())
return self.__class__(lambda expr: self._call(expr).min())

def max(self) -> Expr:
return self.__class__(self._expr.max())
return self.__class__(lambda expr: self._call(expr).max())

def n_unique(self) -> Expr:
return self.__class__(self._expr.n_unique())
return self.__class__(lambda expr: self._call(expr).n_unique())

def unique(self) -> Expr:
return self.__class__(self._expr.unique())
return self.__class__(lambda expr: self._call(expr).unique())

# --- transform ---
def is_between(
self, lower_bound: Any, upper_bound: Any, closed: str = "both"
) -> Expr:
return self.__class__(self._expr.is_between(lower_bound, upper_bound, closed)) # type: ignore[arg-type]
return self.__class__(
lambda expr: self._call(expr).is_between(lower_bound, upper_bound, closed)
) # type: ignore[arg-type]

def is_in(self, other: Any) -> Expr:
return self.__class__(self._expr.is_in(other))
return self.__class__(lambda expr: self._call(expr).is_in(other))

def is_null(self) -> Expr:
return self.__class__(self._expr.is_null())
return self.__class__(lambda expr: self._call(expr).is_null())

# --- partial reduction ---
def drop_nulls(self) -> Expr:
return self.__class__(self._expr.drop_nulls())
return self.__class__(lambda expr: self._call(expr).drop_nulls())

def sample(self, n: int, fraction: float, *, with_replacement: bool) -> Expr:
return self.__class__(
self._expr.sample(n, fraction=fraction, with_replacement=with_replacement)
lambda expr: self._call(expr).sample(
n, fraction=fraction, with_replacement=with_replacement
)
)


def col(col_name: str):
return NarwhalsExpr(lambda expr: expr(col_name))
return NarwhalsExpr(lambda plx: plx.col(col_name))


def len():
def func(expr):
if (pl := get_polars()) is not None and issubclass(expr, pl.col):
return pl.len()

return NarwhalsExpr(func)
return NarwhalsExpr(lambda plx: plx.len())
2 changes: 1 addition & 1 deletion narwhals/pandas_like/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def parse_into_expr(implementation: str, into_expr: IntoExpr) -> Expr:
plx = Namespace(implementation=implementation)

if isinstance(into_expr, NarwhalsExpr):
return into_expr._call(plx.col)
return into_expr._call(plx)
if isinstance(into_expr, str):
return plx.col(into_expr)
if isinstance(into_expr, Expr):
Expand Down
2 changes: 1 addition & 1 deletion narwhals/polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def extract_native(obj: Any) -> Any:
from narwhals.expression import NarwhalsExpr

if isinstance(obj, NarwhalsExpr):
return obj._call(pl.col)
return obj._call(pl)
if isinstance(obj, Expr):
return obj._expr
if isinstance(obj, DType):
Expand Down
6 changes: 3 additions & 3 deletions narwhals/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,14 @@ def get_namespace(obj: Any) -> Namespace:


def to_native(obj: Any) -> Any:
from narwhals.dataframe import NarwhalsFrame
from narwhals.dataframe import DataFrame
from narwhals.pandas_like.dataframe import PandasDataFrame
from narwhals.pandas_like.series import PandasSeries
from narwhals.polars import PolarsDataFrame
from narwhals.polars import PolarsSeries

if isinstance(obj, NarwhalsFrame):
return obj._dataframe
if isinstance(obj, DataFrame):
return obj._dataframe._dataframe
if isinstance(obj, PandasDataFrame):
return obj._dataframe
if isinstance(obj, PandasSeries):
Expand Down
4 changes: 2 additions & 2 deletions tests/tpch_q1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
@pytest.mark.parametrize(
"df_raw",
[
# (polars.read_parquet("tests/data/lineitem.parquet").to_pandas()),
(polars.read_parquet("tests/data/lineitem.parquet").to_pandas()),
polars.scan_parquet("tests/data/lineitem.parquet"),
],
)
def test_q1(df_raw: Any) -> None:
var_1 = datetime(1998, 9, 2)
df = nw.NarwhalsFrame(df_raw, is_lazy=True)
df = nw.DataFrame(df_raw, is_lazy=True)
query_result = (
df.filter(nw.col("l_shipdate") <= var_1)
.group_by(["l_returnflag", "l_linestatus"])
Expand Down

0 comments on commit f5ec176

Please sign in to comment.