From f3bc82a31da17715472d3cca4e815c71017cc430 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 3 Sep 2024 15:17:04 +0100 Subject: [PATCH 1/5] fix: compatiblity with Python3.8 for Ibis (#906) --- .github/workflows/pytest.yml | 2 +- narwhals/dependencies.py | 32 ++++++------ tests/frame/arrow_c_stream_test.py | 9 ++++ tests/frame/interchange_schema_test.py | 65 +++++++++++++++++------- tests/series_only/arrow_c_stream_test.py | 9 ++++ 5 files changed, 81 insertions(+), 36 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index ee5ba37ec..265442e9f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -25,7 +25,7 @@ jobs: if: runner.os == 'Windows' run: powershell -c "irm https://astral.sh/uv/install.ps1 | iex" - name: install-reqs - run: uv pip install --upgrade tox virtualenv setuptools -r requirements-dev.txt --system + run: uv pip install --upgrade tox virtualenv setuptools -r requirements-dev.txt ibis-framework[duckdb] --system - name: show-deps run: uv pip freeze - name: Run pytest diff --git a/narwhals/dependencies.py b/narwhals/dependencies.py index 24b435858..2cd9f0983 100644 --- a/narwhals/dependencies.py +++ b/narwhals/dependencies.py @@ -83,79 +83,79 @@ def get_ibis() -> Any: def is_pandas_dataframe(df: Any) -> TypeGuard[pd.DataFrame]: """Check whether `df` is a pandas DataFrame without importing pandas.""" - return bool((pd := get_pandas()) is not None and isinstance(df, pd.DataFrame)) + return (pd := get_pandas()) is not None and isinstance(df, pd.DataFrame) def is_pandas_series(ser: Any) -> TypeGuard[pd.Series[Any]]: """Check whether `ser` is a pandas Series without importing pandas.""" - return bool((pd := get_pandas()) is not None and isinstance(ser, pd.Series)) + return (pd := get_pandas()) is not None and isinstance(ser, pd.Series) def is_modin_dataframe(df: Any) -> TypeGuard[mpd.DataFrame]: """Check whether `df` is a modin DataFrame without importing modin.""" - return bool((pd := get_modin()) is not None and isinstance(df, pd.DataFrame)) + return (pd := get_modin()) is not None and isinstance(df, pd.DataFrame) def is_modin_series(ser: Any) -> TypeGuard[mpd.Series]: """Check whether `ser` is a modin Series without importing modin.""" - return bool((pd := get_modin()) is not None and isinstance(ser, pd.Series)) + return (pd := get_modin()) is not None and isinstance(ser, pd.Series) def is_cudf_dataframe(df: Any) -> TypeGuard[cudf.DataFrame]: """Check whether `df` is a cudf DataFrame without importing cudf.""" - return bool((pd := get_cudf()) is not None and isinstance(df, pd.DataFrame)) + return (pd := get_cudf()) is not None and isinstance(df, pd.DataFrame) def is_cudf_series(ser: Any) -> TypeGuard[pd.Series[Any]]: """Check whether `ser` is a cudf Series without importing cudf.""" - return bool((pd := get_cudf()) is not None and isinstance(ser, pd.Series)) + return (pd := get_cudf()) is not None and isinstance(ser, pd.Series) def is_dask_dataframe(df: Any) -> TypeGuard[dd.DataFrame]: """Check whether `df` is a Dask DataFrame without importing Dask.""" - return bool((dd := get_dask_dataframe()) is not None and isinstance(df, dd.DataFrame)) + return (dd := get_dask_dataframe()) is not None and isinstance(df, dd.DataFrame) def is_duckdb_relation(df: Any) -> TypeGuard[duckdb.DuckDBPyRelation]: """Check whether `df` is a DuckDB Relation without importing DuckDB.""" - return bool( - (duckdb := get_duckdb()) is not None and isinstance(df, duckdb.DuckDBPyRelation) + return (duckdb := get_duckdb()) is not None and isinstance( + df, duckdb.DuckDBPyRelation ) def is_ibis_table(df: Any) -> TypeGuard[ibis.Table]: """Check whether `df` is a Ibis Table without importing Ibis.""" - return bool((ibis := get_ibis()) is not None and isinstance(df, ibis.Table)) + return (ibis := get_ibis()) is not None and isinstance(df, ibis.expr.types.Table) def is_polars_dataframe(df: Any) -> TypeGuard[pl.DataFrame]: """Check whether `df` is a Polars DataFrame without importing Polars.""" - return bool((pl := get_polars()) is not None and isinstance(df, pl.DataFrame)) + return (pl := get_polars()) is not None and isinstance(df, pl.DataFrame) def is_polars_lazyframe(df: Any) -> TypeGuard[pl.LazyFrame]: """Check whether `df` is a Polars LazyFrame without importing Polars.""" - return bool((pl := get_polars()) is not None and isinstance(df, pl.LazyFrame)) + return (pl := get_polars()) is not None and isinstance(df, pl.LazyFrame) def is_polars_series(ser: Any) -> TypeGuard[pl.Series]: """Check whether `ser` is a Polars Series without importing Polars.""" - return bool((pl := get_polars()) is not None and isinstance(ser, pl.Series)) + return (pl := get_polars()) is not None and isinstance(ser, pl.Series) def is_pyarrow_chunked_array(ser: Any) -> TypeGuard[pa.ChunkedArray]: """Check whether `ser` is a PyArrow ChunkedArray without importing PyArrow.""" - return bool((pa := get_pyarrow()) is not None and isinstance(ser, pa.ChunkedArray)) + return (pa := get_pyarrow()) is not None and isinstance(ser, pa.ChunkedArray) def is_pyarrow_table(df: Any) -> TypeGuard[pa.Table]: """Check whether `df` is a PyArrow Table without importing PyArrow.""" - return bool((pa := get_pyarrow()) is not None and isinstance(df, pa.Table)) + return (pa := get_pyarrow()) is not None and isinstance(df, pa.Table) def is_numpy_array(arr: Any) -> TypeGuard[np.ndarray]: """Check whether `arr` is a NumPy Array without importing NumPy.""" - return bool((np := get_numpy()) is not None and isinstance(arr, np.ndarray)) + return (np := get_numpy()) is not None and isinstance(arr, np.ndarray) def is_pandas_like_dataframe(df: Any) -> bool: diff --git a/tests/frame/arrow_c_stream_test.py b/tests/frame/arrow_c_stream_test.py index 7a3403f69..cb856adf9 100644 --- a/tests/frame/arrow_c_stream_test.py +++ b/tests/frame/arrow_c_stream_test.py @@ -10,6 +10,9 @@ @pytest.mark.skipif( parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" ) +@pytest.mark.skipif( + parse_version(pa.__version__) < (16, 0, 0), reason="too old for pycapsule in PyArrow" +) def test_arrow_c_stream_test() -> None: df = nw.from_native(pl.Series([1, 2, 3]).to_frame("a"), eager_only=True) result = pa.table(df) @@ -20,6 +23,9 @@ def test_arrow_c_stream_test() -> None: @pytest.mark.skipif( parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" ) +@pytest.mark.skipif( + parse_version(pa.__version__) < (16, 0, 0), reason="too old for pycapsule in PyArrow" +) def test_arrow_c_stream_test_invalid(monkeypatch: pytest.MonkeyPatch) -> None: # "poison" the dunder method to make sure it actually got called above monkeypatch.setattr( @@ -33,6 +39,9 @@ def test_arrow_c_stream_test_invalid(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.mark.skipif( parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" ) +@pytest.mark.skipif( + parse_version(pa.__version__) < (16, 0, 0), reason="too old for pycapsule in PyArrow" +) def test_arrow_c_stream_test_fallback(monkeypatch: pytest.MonkeyPatch) -> None: # Check that fallback to PyArrow works monkeypatch.delattr("polars.DataFrame.__arrow_c_stream__") diff --git a/tests/frame/interchange_schema_test.py b/tests/frame/interchange_schema_test.py index a73ca6259..afec06831 100644 --- a/tests/frame/interchange_schema_test.py +++ b/tests/frame/interchange_schema_test.py @@ -8,6 +8,7 @@ import pytest import narwhals.stable.v1 as nw +from narwhals.utils import parse_version def test_interchange_schema() -> None: @@ -67,7 +68,10 @@ def test_interchange_schema() -> None: assert df["a"].dtype == nw.Int64 -def test_interchange_schema_ibis() -> None: # pragma: no cover +@pytest.mark.filterwarnings("ignore:.*locale specific date formats") +def test_interchange_schema_ibis( + tmpdir: pytest.TempdirFactory, +) -> None: # pragma: no cover ibis = pytest.importorskip("ibis") df_pl = pl.DataFrame( { @@ -105,26 +109,49 @@ def test_interchange_schema_ibis() -> None: # pragma: no cover "o": pl.Boolean, }, ) - tbl = ibis.memtable(df_pl) + filepath = str(tmpdir / "file.parquet") # type: ignore[operator] + df_pl.write_parquet(filepath) + tbl = ibis.read_parquet(filepath) df = nw.from_native(tbl, eager_or_interchange_only=True) result = df.schema - expected = { - "a": nw.Int64, - "b": nw.Int32, - "c": nw.Int16, - "d": nw.Int8, - "e": nw.UInt64, - "f": nw.UInt32, - "g": nw.UInt16, - "h": nw.UInt8, - "i": nw.Float64, - "j": nw.Float32, - "k": nw.String, - "l": nw.String, - "m": nw.Date, - "n": nw.Datetime, - "o": nw.Boolean, - } + if parse_version(ibis.__version__) > (6, 0, 0): + expected = { + "a": nw.Int64, + "b": nw.Int32, + "c": nw.Int16, + "d": nw.Int8, + "e": nw.UInt64, + "f": nw.UInt32, + "g": nw.UInt16, + "h": nw.UInt8, + "i": nw.Float64, + "j": nw.Float32, + "k": nw.String, + "l": nw.String, + "m": nw.Date, + "n": nw.Datetime, + "o": nw.Boolean, + } + else: + # Old versions of Ibis would read the file in + # with different data types + expected = { + "a": nw.Int64, + "b": nw.Int32, + "c": nw.Int16, + "d": nw.Int32, + "e": nw.Int32, + "f": nw.Int32, + "g": nw.Int32, + "h": nw.Int32, + "i": nw.Float64, + "j": nw.Float64, + "k": nw.String, + "l": nw.String, + "m": nw.Date, + "n": nw.Datetime, + "o": nw.Boolean, + } assert result == expected assert df["a"].dtype == nw.Int64 diff --git a/tests/series_only/arrow_c_stream_test.py b/tests/series_only/arrow_c_stream_test.py index 9964d7408..9d2ebc8d0 100644 --- a/tests/series_only/arrow_c_stream_test.py +++ b/tests/series_only/arrow_c_stream_test.py @@ -10,6 +10,9 @@ @pytest.mark.skipif( parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" ) +@pytest.mark.skipif( + parse_version(pa.__version__) < (16, 0, 0), reason="too old for pycapsule in PyArrow" +) def test_arrow_c_stream_test() -> None: s = nw.from_native(pl.Series([1, 2, 3]), series_only=True) result = pa.chunked_array(s) @@ -20,6 +23,9 @@ def test_arrow_c_stream_test() -> None: @pytest.mark.skipif( parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" ) +@pytest.mark.skipif( + parse_version(pa.__version__) < (16, 0, 0), reason="too old for pycapsule in PyArrow" +) def test_arrow_c_stream_test_invalid(monkeypatch: pytest.MonkeyPatch) -> None: # "poison" the dunder method to make sure it actually got called above monkeypatch.setattr("narwhals.series.Series.__arrow_c_stream__", lambda *_: 1 / 0) @@ -31,6 +37,9 @@ def test_arrow_c_stream_test_invalid(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.mark.skipif( parse_version(pl.__version__) < (1, 3), reason="too old for pycapsule in Polars" ) +@pytest.mark.skipif( + parse_version(pa.__version__) < (16, 0, 0), reason="too old for pycapsule in PyArrow" +) def test_arrow_c_stream_test_fallback(monkeypatch: pytest.MonkeyPatch) -> None: # Check that fallback to PyArrow works monkeypatch.delattr("polars.Series.__arrow_c_stream__") From 100735229e90aaed83f2e470b243573a36e02a2b Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Tue, 3 Sep 2024 15:22:38 +0100 Subject: [PATCH 2/5] release: Bump version to 1.6.2 (#907) --- docs/installation.md | 2 +- narwhals/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 38da4db9f..796cd8708 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -11,6 +11,6 @@ Then, if you start the Python REPL and see the following: ```python >>> import narwhals >>> narwhals.__version__ -'1.6.1' +'1.6.2' ``` then installation worked correctly! diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 0cd5cb8a0..56c638e84 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -53,7 +53,7 @@ from narwhals.utils import maybe_get_index from narwhals.utils import maybe_set_index -__version__ = "1.6.1" +__version__ = "1.6.2" __all__ = [ "dependencies", diff --git a/pyproject.toml b/pyproject.toml index 262c7cdc5..a279280bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "narwhals" -version = "1.6.1" +version = "1.6.2" authors = [ { name="Marco Gorelli", email="33491632+MarcoGorelli@users.noreply.github.com" }, ] From 9eb5f46aa58672763d3309537416eef1bdfb2273 Mon Sep 17 00:00:00 2001 From: Isaias Gutierrez-Cruz <64386035+IsaiasGutierrezCruz@users.noreply.github.com> Date: Wed, 4 Sep 2024 09:47:58 -0600 Subject: [PATCH 3/5] ci: add tests for the queries of TPC-H (#899) --- .github/workflows/check_tpch_queries.yml | 30 ++++++++++++++++++++++++ pyproject.toml | 1 + requirements-dev.txt | 1 + tpch/tests/__init__.py | 0 tpch/tests/test_queries.py | 29 +++++++++++++++++++++++ 5 files changed, 61 insertions(+) create mode 100644 .github/workflows/check_tpch_queries.yml create mode 100644 tpch/tests/__init__.py create mode 100644 tpch/tests/test_queries.py diff --git a/.github/workflows/check_tpch_queries.yml b/.github/workflows/check_tpch_queries.yml new file mode 100644 index 000000000..397163091 --- /dev/null +++ b/.github/workflows/check_tpch_queries.yml @@ -0,0 +1,30 @@ +name: Tests for TPCH Queries + +on: + pull_request: + types: [labeled] + +jobs: + validate-queries: + if: ${{ github.event.label.name == 'full-test' }} + strategy: + matrix: + python-version: ["3.12"] + os: [ubuntu-latest] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + - name: install-reqs + run: uv pip install --upgrade -r requirements-dev.txt --system + - name: local-install + run: uv pip install -e . --system + - name: generate-data + run: cd tpch && python generate_data.py + - name: tpch-tests + run: python -m unittest discover -s 'tpch/tests' \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a279280bf..b3a2a0c28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,7 @@ lint.ignore = [ [tool.ruff.lint.per-file-ignores] "tests/*" = ["S101"] +"tpch/tests/*" = ["S101"] "utils/*" = ["S311", "PTH123"] "tpch/execute/*" = ["T201"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 213fcdcb8..23ff1757e 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,4 @@ +tqdm covdefaults duckdb pandas diff --git a/tpch/tests/__init__.py b/tpch/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tpch/tests/test_queries.py b/tpch/tests/test_queries.py new file mode 100644 index 000000000..4b7cdd866 --- /dev/null +++ b/tpch/tests/test_queries.py @@ -0,0 +1,29 @@ +import os +import subprocess +import sys +import unittest +from pathlib import Path + + +class TestQueries(unittest.TestCase): + def test_execute_scripts(self) -> None: + root = Path(__file__).resolve().parent.parent + # directory containing all the queries + execute_dir = root / "execute" + + env = os.environ.copy() + env["PYTHONPATH"] = str(root) + + for script_path in execute_dir.glob("q[1-9]*.py"): + result = subprocess.run( # noqa: S603 + [sys.executable, str(script_path)], + capture_output=True, + text=True, + env=env, + cwd=root, + check=False, + shell=False, + ) + assert ( + result.returncode == 0 + ), f"Script {script_path} failed with error: {result.stderr}" From 69da133bbf86906b7787c95db403fcd4a626e78a Mon Sep 17 00:00:00 2001 From: Zhengbo Wang Date: Wed, 4 Sep 2024 23:50:39 +0800 Subject: [PATCH 4/5] feat: Add more queries of tpch (#898) --- tpch/execute/__init__.py | 30 +++++++++++++++++++++++ tpch/execute/q1.py | 23 ++---------------- tpch/execute/q10.py | 25 ++++---------------- tpch/execute/q11.py | 23 ++++-------------- tpch/execute/q15.py | 21 +++++++++++++++++ tpch/execute/q17.py | 21 +++++++++++++++++ tpch/execute/q18.py | 22 +++++++++++++++++ tpch/execute/q19.py | 17 ++++++++++++++ tpch/execute/q2.py | 32 +++++-------------------- tpch/execute/q20.py | 20 ++++++++++++++++ tpch/execute/q21.py | 19 +++++++++++++++ tpch/execute/q3.py | 25 ++++---------------- tpch/execute/q4.py | 21 +++-------------- tpch/execute/q5.py | 29 ++++++----------------- tpch/execute/q6.py | 15 ++---------- tpch/execute/q7.py | 27 +++++++++++++++++++++ tpch/execute/q9.py | 35 +++++++++++++++++++++++++++ tpch/queries/q15.py | 33 ++++++++++++++++++++++++++ tpch/queries/q17.py | 23 ++++++++++++++++++ tpch/queries/q18.py | 31 ++++++++++++++++++++++++ tpch/queries/q19.py | 39 ++++++++++++++++++++++++++++++ tpch/queries/q20.py | 43 +++++++++++++++++++++++++++++++++ tpch/queries/q21.py | 43 +++++++++++++++++++++++++++++++++ tpch/queries/q6.py | 4 ---- tpch/queries/q7.py | 51 ++++++++++++++++++++++++++++++++++++++++ tpch/queries/q9.py | 36 ++++++++++++++++++++++++++++ 26 files changed, 544 insertions(+), 164 deletions(-) create mode 100644 tpch/execute/q15.py create mode 100644 tpch/execute/q17.py create mode 100644 tpch/execute/q18.py create mode 100644 tpch/execute/q19.py create mode 100644 tpch/execute/q20.py create mode 100644 tpch/execute/q21.py create mode 100644 tpch/execute/q7.py create mode 100644 tpch/execute/q9.py create mode 100644 tpch/queries/q15.py create mode 100644 tpch/queries/q17.py create mode 100644 tpch/queries/q18.py create mode 100644 tpch/queries/q19.py create mode 100644 tpch/queries/q20.py create mode 100644 tpch/queries/q21.py create mode 100644 tpch/queries/q7.py create mode 100644 tpch/queries/q9.py diff --git a/tpch/execute/__init__.py b/tpch/execute/__init__.py index e69de29bb..e0c448649 100644 --- a/tpch/execute/__init__.py +++ b/tpch/execute/__init__.py @@ -0,0 +1,30 @@ +from pathlib import Path + +import dask.dataframe as dd +import pandas as pd +import polars as pl +import pyarrow.parquet as pq + +pd.options.mode.copy_on_write = True +pd.options.future.infer_string = True + +lineitem = Path("data") / "lineitem.parquet" +region = Path("data") / "region.parquet" +nation = Path("data") / "nation.parquet" +supplier = Path("data") / "supplier.parquet" +part = Path("data") / "part.parquet" +partsupp = Path("data") / "partsupp.parquet" +orders = Path("data") / "orders.parquet" +customer = Path("data") / "customer.parquet" +line_item = Path("data") / "lineitem.parquet" + +IO_FUNCS = { + "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), + "pandas[pyarrow]": lambda x: pd.read_parquet( + x, engine="pyarrow", dtype_backend="pyarrow" + ), + "polars[eager]": lambda x: pl.read_parquet(x), + "polars[lazy]": lambda x: pl.scan_parquet(x), + "pyarrow": lambda x: pq.read_table(x), + "dask": lambda x: dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow"), +} diff --git a/tpch/execute/q1.py b/tpch/execute/q1.py index dd839b292..9889c3af0 100644 --- a/tpch/execute/q1.py +++ b/tpch/execute/q1.py @@ -1,26 +1,7 @@ -from pathlib import Path - -import dask.dataframe as dd -import pandas as pd -import polars as pl -import pyarrow.parquet as pq from queries import q1 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -lineitem = Path("data") / "lineitem.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), - "pyarrow": lambda x: pq.read_table(x), - "dask": lambda x: dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow"), -} +from . import IO_FUNCS +from . import lineitem print(q1.query(IO_FUNCS["pandas[pyarrow]"](lineitem))) print(q1.query(IO_FUNCS["polars[lazy]"](lineitem)).collect()) diff --git a/tpch/execute/q10.py b/tpch/execute/q10.py index 19e2e7ce0..9876f2aa9 100644 --- a/tpch/execute/q10.py +++ b/tpch/execute/q10.py @@ -1,25 +1,10 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q10 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -customer = Path("data") / "customer.parquet" -nation = Path("data") / "nation.parquet" -lineitem = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import nation +from . import orders tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q11.py b/tpch/execute/q11.py index 55161ae6b..82b1936aa 100644 --- a/tpch/execute/q11.py +++ b/tpch/execute/q11.py @@ -1,24 +1,9 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q11 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -nation = Path("data") / "nation.parquet" -partsupp = Path("data") / "partsupp.parquet" -supplier = Path("data") / "supplier.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import nation +from . import partsupp +from . import supplier tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q15.py b/tpch/execute/q15.py new file mode 100644 index 000000000..8fdaf2ab1 --- /dev/null +++ b/tpch/execute/q15.py @@ -0,0 +1,21 @@ +from queries import q15 + +from . import IO_FUNCS +from . import lineitem +from . import supplier + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q15.query(fn(lineitem), fn(supplier)).collect()) diff --git a/tpch/execute/q17.py b/tpch/execute/q17.py new file mode 100644 index 000000000..5f2228012 --- /dev/null +++ b/tpch/execute/q17.py @@ -0,0 +1,21 @@ +from queries import q17 + +from . import IO_FUNCS +from . import lineitem +from . import part + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q17.query(fn(lineitem), fn(part)).collect()) diff --git a/tpch/execute/q18.py b/tpch/execute/q18.py new file mode 100644 index 000000000..5a59f0e5e --- /dev/null +++ b/tpch/execute/q18.py @@ -0,0 +1,22 @@ +from queries import q18 + +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import orders + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print(q18.query(fn(customer), fn(lineitem), fn(orders)).collect()) diff --git a/tpch/execute/q19.py b/tpch/execute/q19.py new file mode 100644 index 000000000..87467064c --- /dev/null +++ b/tpch/execute/q19.py @@ -0,0 +1,17 @@ +from queries import q19 + +from . import IO_FUNCS +from . import lineitem +from . import part + +fn = IO_FUNCS["pandas"] +print(q19.query(fn(lineitem), fn(part))) + +fn = IO_FUNCS["pandas[pyarrow]"] +print(q19.query(fn(lineitem), fn(part))) + +fn = IO_FUNCS["polars[eager]"] +print(q19.query(fn(lineitem), fn(part))) + +fn = IO_FUNCS["polars[lazy]"] +print(q19.query(fn(lineitem), fn(part)).collect()) diff --git a/tpch/execute/q2.py b/tpch/execute/q2.py index 22a7f4317..cd82a9047 100644 --- a/tpch/execute/q2.py +++ b/tpch/execute/q2.py @@ -1,31 +1,11 @@ -from pathlib import Path - -import dask.dataframe as dd -import pandas as pd -import polars as pl -import pyarrow.parquet as pq from queries import q2 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -region = Path("data") / "region.parquet" -nation = Path("data") / "nation.parquet" -supplier = Path("data") / "supplier.parquet" -part = Path("data") / "part.parquet" -partsupp = Path("data") / "partsupp.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), - "pyarrow": lambda x: pq.read_table(x), - "dask": lambda x: dd.read_parquet(x, engine="pyarrow", dtype_backend="pyarrow"), -} - +from . import IO_FUNCS +from . import nation +from . import part +from . import partsupp +from . import region +from . import supplier tool = "pandas[pyarrow]" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q20.py b/tpch/execute/q20.py new file mode 100644 index 000000000..68d18a6b5 --- /dev/null +++ b/tpch/execute/q20.py @@ -0,0 +1,20 @@ +from queries import q20 + +from . import IO_FUNCS +from . import lineitem +from . import nation +from . import part +from . import partsupp +from . import supplier + +fn = IO_FUNCS["pandas"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) + +fn = IO_FUNCS["pandas[pyarrow]"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) + +fn = IO_FUNCS["polars[eager]"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))) + +fn = IO_FUNCS["polars[lazy]"] +print(q20.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()) diff --git a/tpch/execute/q21.py b/tpch/execute/q21.py new file mode 100644 index 000000000..693953870 --- /dev/null +++ b/tpch/execute/q21.py @@ -0,0 +1,19 @@ +from queries import q21 + +from . import IO_FUNCS +from . import lineitem +from . import nation +from . import orders +from . import supplier + +fn = IO_FUNCS["pandas"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) + +fn = IO_FUNCS["pandas[pyarrow]"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) + +fn = IO_FUNCS["polars[eager]"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier))) + +fn = IO_FUNCS["polars[lazy]"] +print(q21.query(fn(lineitem), fn(nation), fn(orders), fn(supplier)).collect()) diff --git a/tpch/execute/q3.py b/tpch/execute/q3.py index 30194b5da..8602bb3d0 100644 --- a/tpch/execute/q3.py +++ b/tpch/execute/q3.py @@ -1,26 +1,9 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q3 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - - -customer = Path("data") / "customer.parquet" -lineitem = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} - +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import orders tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q4.py b/tpch/execute/q4.py index 672a43e17..3e67a9c87 100644 --- a/tpch/execute/q4.py +++ b/tpch/execute/q4.py @@ -1,23 +1,8 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q4 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -line_item = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import line_item +from . import orders tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q5.py b/tpch/execute/q5.py index b77f740d8..317b15fc7 100644 --- a/tpch/execute/q5.py +++ b/tpch/execute/q5.py @@ -1,27 +1,12 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q5 -pd.options.mode.copy_on_write = True -pd.options.future.infer_string = True - -region = Path("data") / "region.parquet" -nation = Path("data") / "nation.parquet" -customer = Path("data") / "customer.parquet" -line_item = Path("data") / "lineitem.parquet" -orders = Path("data") / "orders.parquet" -supplier = Path("data") / "supplier.parquet" - -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import customer +from . import line_item +from . import nation +from . import orders +from . import region +from . import supplier tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q6.py b/tpch/execute/q6.py index 85b3d9968..adca0e26d 100644 --- a/tpch/execute/q6.py +++ b/tpch/execute/q6.py @@ -1,18 +1,7 @@ -from pathlib import Path - -import pandas as pd -import polars as pl from queries import q6 -lineitem = Path("data") / "lineitem.parquet" -IO_FUNCS = { - "pandas": lambda x: pd.read_parquet(x, engine="pyarrow"), - "pandas[pyarrow]": lambda x: pd.read_parquet( - x, engine="pyarrow", dtype_backend="pyarrow" - ), - "polars[eager]": lambda x: pl.read_parquet(x), - "polars[lazy]": lambda x: pl.scan_parquet(x), -} +from . import IO_FUNCS +from . import lineitem tool = "pandas" fn = IO_FUNCS[tool] diff --git a/tpch/execute/q7.py b/tpch/execute/q7.py new file mode 100644 index 000000000..43e110a72 --- /dev/null +++ b/tpch/execute/q7.py @@ -0,0 +1,27 @@ +from queries import q7 + +from . import IO_FUNCS +from . import customer +from . import lineitem +from . import nation +from . import orders +from . import supplier + +tool = "pandas" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) + + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print(q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print( + q7.query(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect() +) diff --git a/tpch/execute/q9.py b/tpch/execute/q9.py new file mode 100644 index 000000000..9ccbe35b7 --- /dev/null +++ b/tpch/execute/q9.py @@ -0,0 +1,35 @@ +from queries import q9 + +from . import IO_FUNCS +from . import lineitem +from . import nation +from . import orders +from . import part +from . import partsupp +from . import supplier + +tool = "pandas" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) + +tool = "pandas[pyarrow]" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) + +tool = "polars[eager]" +fn = IO_FUNCS[tool] +print( + q9.query(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)) +) + +tool = "polars[lazy]" +fn = IO_FUNCS[tool] +print( + q9.query( + fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier) + ).collect() +) diff --git a/tpch/queries/q15.py b/tpch/queries/q15.py new file mode 100644 index 000000000..1ebae57d6 --- /dev/null +++ b/tpch/queries/q15.py @@ -0,0 +1,33 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + lineitem_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + var1 = datetime(1996, 1, 1) + var2 = datetime(1996, 4, 1) + + revenue = ( + lineitem_ds.filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias( + "total_revenue" + ) + ) + .group_by("l_suppkey") + .agg(nw.sum("total_revenue")) + .select(nw.col("l_suppkey").alias("supplier_no"), nw.col("total_revenue")) + ) + + return ( + supplier_ds.join(revenue, left_on="s_suppkey", right_on="supplier_no") + .filter(nw.col("total_revenue") == nw.col("total_revenue").max()) + .with_columns(nw.col("total_revenue").round(2)) + .select("s_suppkey", "s_name", "s_address", "s_phone", "total_revenue") + .sort("s_suppkey") + ) diff --git a/tpch/queries/q17.py b/tpch/queries/q17.py new file mode 100644 index 000000000..5d35929d1 --- /dev/null +++ b/tpch/queries/q17.py @@ -0,0 +1,23 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(lineitem_ds: FrameT, part_ds: FrameT) -> FrameT: + var1 = "Brand#23" + var2 = "MED BOX" + + query1 = ( + part_ds.filter(nw.col("p_brand") == var1) + .filter(nw.col("p_container") == var2) + .join(lineitem_ds, how="left", left_on="p_partkey", right_on="l_partkey") + ) + + return ( + query1.group_by("p_partkey") + .agg((0.2 * nw.col("l_quantity").mean()).alias("avg_quantity")) + .select(nw.col("p_partkey").alias("key"), nw.col("avg_quantity")) + .join(query1, left_on="key", right_on="p_partkey") + .filter(nw.col("l_quantity") < nw.col("avg_quantity")) + .select((nw.col("l_extendedprice").sum() / 7.0).round(2).alias("avg_yearly")) + ) diff --git a/tpch/queries/q18.py b/tpch/queries/q18.py new file mode 100644 index 000000000..d3d183176 --- /dev/null +++ b/tpch/queries/q18.py @@ -0,0 +1,31 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(customer_ds: FrameT, lineitem_ds: FrameT, orders_ds: FrameT) -> FrameT: + var1 = 300 + + query1 = ( + lineitem_ds.group_by("l_orderkey") + .agg(nw.col("l_quantity").sum().alias("sum_quantity")) + .filter(nw.col("sum_quantity") > var1) + ) + + return ( + orders_ds.join(query1, left_on="o_orderkey", right_on="l_orderkey", how="semi") + .join(lineitem_ds, left_on="o_orderkey", right_on="l_orderkey") + .join(customer_ds, left_on="o_custkey", right_on="c_custkey") + .group_by("c_name", "o_custkey", "o_orderkey", "o_orderdate", "o_totalprice") + .agg(nw.col("l_quantity").sum().alias("col6")) + .select( + nw.col("c_name"), + nw.col("o_custkey").alias("c_custkey"), + nw.col("o_orderkey"), + nw.col("o_orderdate").alias("o_orderdat"), + nw.col("o_totalprice"), + nw.col("col6"), + ) + .sort(by=["o_totalprice", "o_orderdat"], descending=[True, False]) + .head(100) + ) diff --git a/tpch/queries/q19.py b/tpch/queries/q19.py new file mode 100644 index 000000000..bcab36e9a --- /dev/null +++ b/tpch/queries/q19.py @@ -0,0 +1,39 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query(lineitem_ds: FrameT, part_ds: FrameT) -> FrameT: + return ( + part_ds.join(lineitem_ds, left_on="p_partkey", right_on="l_partkey") + .filter(nw.col("l_shipmode").is_in(["AIR", "AIR REG"])) + .filter(nw.col("l_shipinstruct") == "DELIVER IN PERSON") + .filter( + ( + (nw.col("p_brand") == "Brand#12") + & nw.col("p_container").is_in(["SM CASE", "SM BOX", "SM PACK", "SM PKG"]) + & (nw.col("l_quantity").is_between(1, 11)) + & (nw.col("p_size").is_between(1, 5)) + ) + | ( + (nw.col("p_brand") == "Brand#23") + & nw.col("p_container").is_in( + ["MED BAG", "MED BOX", "MED PKG", "MED PACK"] + ) + & (nw.col("l_quantity").is_between(10, 20)) + & (nw.col("p_size").is_between(1, 10)) + ) + | ( + (nw.col("p_brand") == "Brand#34") + & nw.col("p_container").is_in(["LG CASE", "LG BOX", "LG PACK", "LG PKG"]) + & (nw.col("l_quantity").is_between(20, 30)) + & (nw.col("p_size").is_between(1, 15)) + ) + ) + .select( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))) + .sum() + .round(2) + .alias("revenue") + ) + ) diff --git a/tpch/queries/q20.py b/tpch/queries/q20.py new file mode 100644 index 000000000..d9014f7b8 --- /dev/null +++ b/tpch/queries/q20.py @@ -0,0 +1,43 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + part_ds: FrameT, + partsupp_ds: FrameT, + nation_ds: FrameT, + lineitem_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + var1 = datetime(1994, 1, 1) + var2 = datetime(1995, 1, 1) + var3 = "CANADA" + var4 = "forest" + + query1 = ( + lineitem_ds.filter(nw.col("l_shipdate").is_between(var1, var2, closed="left")) + .group_by("l_partkey", "l_suppkey") + .agg((nw.col("l_quantity").sum()).alias("sum_quantity")) + .with_columns(sum_quantity=nw.col("sum_quantity") * 0.5) + ) + query2 = nation_ds.filter(nw.col("n_name") == var3) + query3 = supplier_ds.join(query2, left_on="s_nationkey", right_on="n_nationkey") + + return ( + part_ds.filter(nw.col("p_name").str.starts_with(var4)) + .select(nw.col("p_partkey").unique()) + .join(partsupp_ds, left_on="p_partkey", right_on="ps_partkey") + .join( + query1, + left_on=["ps_suppkey", "p_partkey"], + right_on=["l_suppkey", "l_partkey"], + ) + .filter(nw.col("ps_availqty") > nw.col("sum_quantity")) + .select(nw.col("ps_suppkey").unique()) + .join(query3, left_on="ps_suppkey", right_on="s_suppkey") + .select("s_name", "s_address") + .sort("s_name") + ) diff --git a/tpch/queries/q21.py b/tpch/queries/q21.py new file mode 100644 index 000000000..d10ff394f --- /dev/null +++ b/tpch/queries/q21.py @@ -0,0 +1,43 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + lineitem: FrameT, + nation: FrameT, + orders: FrameT, + supplier: FrameT, +) -> FrameT: + var1 = "SAUDI ARABIA" + + q1 = ( + lineitem.group_by("l_orderkey") + .agg(nw.len().alias("n_supp_by_order")) + .filter(nw.col("n_supp_by_order") > 1) + .join( + lineitem.filter(nw.col("l_receiptdate") > nw.col("l_commitdate")), + left_on="l_orderkey", + right_on="l_orderkey", + ) + ) + + return ( + q1.group_by("l_orderkey") + .agg(nw.len().alias("n_supp_by_order")) + .join( + q1, + left_on="l_orderkey", + right_on="l_orderkey", + ) + .join(supplier, left_on="l_suppkey", right_on="s_suppkey") + .join(nation, left_on="s_nationkey", right_on="n_nationkey") + .join(orders, left_on="l_orderkey", right_on="o_orderkey") + .filter(nw.col("n_supp_by_order") == 1) + .filter(nw.col("n_name") == var1) + .filter(nw.col("o_orderstatus") == "F") + .group_by("s_name") + .agg(nw.len().alias("numwait")) + .sort(by=["numwait", "s_name"], descending=[True, False]) + .head(100) + ) diff --git a/tpch/queries/q6.py b/tpch/queries/q6.py index 6a9b5c1d2..67f0ac785 100644 --- a/tpch/queries/q6.py +++ b/tpch/queries/q6.py @@ -1,12 +1,8 @@ from datetime import datetime -import pandas as pd - import narwhals as nw from narwhals.typing import FrameT -pd.options.mode.copy_on_write = True - @nw.narwhalify def query(line_item_ds: FrameT) -> FrameT: diff --git a/tpch/queries/q7.py b/tpch/queries/q7.py new file mode 100644 index 000000000..ec0946ac3 --- /dev/null +++ b/tpch/queries/q7.py @@ -0,0 +1,51 @@ +from datetime import datetime + +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + nation_ds: FrameT, + customer_ds: FrameT, + line_item_ds: FrameT, + orders_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + n1 = nation_ds.filter(nw.col("n_name") == "FRANCE") + n2 = nation_ds.filter(nw.col("n_name") == "GERMANY") + + var_1 = datetime(1995, 1, 1) + var_2 = datetime(1996, 12, 31) + + df1 = ( + customer_ds.join(n1, left_on="c_nationkey", right_on="n_nationkey") + .join(orders_ds, left_on="c_custkey", right_on="o_custkey") + .rename({"n_name": "cust_nation"}) + .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + .join(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + .join(n2, left_on="s_nationkey", right_on="n_nationkey") + .rename({"n_name": "supp_nation"}) + ) + + df2 = ( + customer_ds.join(n2, left_on="c_nationkey", right_on="n_nationkey") + .join(orders_ds, left_on="c_custkey", right_on="o_custkey") + .rename({"n_name": "cust_nation"}) + .join(line_item_ds, left_on="o_orderkey", right_on="l_orderkey") + .join(supplier_ds, left_on="l_suppkey", right_on="s_suppkey") + .join(n1, left_on="s_nationkey", right_on="n_nationkey") + .rename({"n_name": "supp_nation"}) + ) + + return ( + nw.concat([df1, df2]) + .filter(nw.col("l_shipdate").is_between(var_1, var_2)) + .with_columns( + (nw.col("l_extendedprice") * (1 - nw.col("l_discount"))).alias("volume") + ) + .with_columns(nw.col("l_shipdate").dt.year().alias("l_year")) + .group_by("supp_nation", "cust_nation", "l_year") + .agg(nw.sum("volume").alias("revenue")) + .sort(by=["supp_nation", "cust_nation", "l_year"]) + ) diff --git a/tpch/queries/q9.py b/tpch/queries/q9.py new file mode 100644 index 000000000..09dff4787 --- /dev/null +++ b/tpch/queries/q9.py @@ -0,0 +1,36 @@ +import narwhals as nw +from narwhals.typing import FrameT + + +@nw.narwhalify +def query( + part_ds: FrameT, + partsupp_ds: FrameT, + nation_ds: FrameT, + lineitem_ds: FrameT, + orders_ds: FrameT, + supplier_ds: FrameT, +) -> FrameT: + return ( + part_ds.join(partsupp_ds, left_on="p_partkey", right_on="ps_partkey") + .join(supplier_ds, left_on="ps_suppkey", right_on="s_suppkey") + .join( + lineitem_ds, + left_on=["p_partkey", "ps_suppkey"], + right_on=["l_partkey", "l_suppkey"], + ) + .join(orders_ds, left_on="l_orderkey", right_on="o_orderkey") + .join(nation_ds, left_on="s_nationkey", right_on="n_nationkey") + .filter(nw.col("p_name").str.contains("green")) + .select( + nw.col("n_name").alias("nation"), + nw.col("o_orderdate").dt.year().alias("o_year"), + ( + nw.col("l_extendedprice") * (1 - nw.col("l_discount")) + - nw.col("ps_supplycost") * nw.col("l_quantity") + ).alias("amount"), + ) + .group_by("nation", "o_year") + .agg(nw.sum("amount").alias("sum_profit")) + .sort(by=["nation", "o_year"], descending=[False, True]) + ) From cb82d26b7d9d6a1aef882aa6fcbda79a612f1223 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:51:36 +0200 Subject: [PATCH 5/5] feat: dask lit with dtype (#909) --- narwhals/_dask/namespace.py | 15 ++++++++++++--- tests/frame/lit_test.py | 6 +----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 89ca372ec..1668ee323 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -12,6 +12,7 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._dask.selectors import DaskSelectorNamespace +from narwhals._dask.utils import reverse_translate_dtype from narwhals._dask.utils import validate_comparand from narwhals._expression_parsing import parse_into_exprs @@ -19,6 +20,7 @@ import dask_expr from narwhals._dask.typing import IntoDaskExpr + from narwhals.dtypes import DType class DaskNamespace: @@ -70,10 +72,17 @@ def col(self, *column_names: str) -> DaskExpr: ) def lit(self, value: Any, dtype: dtypes.DType | None) -> DaskExpr: - # TODO @FBruzzesi: cast to dtype once `narwhals_to_native_dtype` is implemented. - # It should be enough to add `.astype(narwhals_to_native_dtype(dtype))` + def convert_if_dtype( + series: dask_expr.Series, dtype: DType | type[DType] + ) -> dask_expr.Series: + return series.astype(reverse_translate_dtype(dtype)) if dtype else series + return DaskExpr( - lambda df: [df._native_frame.assign(lit=value).loc[:, "lit"]], + lambda df: [ + df._native_frame.assign(lit=value) + .loc[:, "lit"] + .pipe(convert_if_dtype, dtype) + ], depth=0, function_name="lit", root_names=None, diff --git a/tests/frame/lit_test.py b/tests/frame/lit_test.py index 328e4d8e0..e5756e035 100644 --- a/tests/frame/lit_test.py +++ b/tests/frame/lit_test.py @@ -17,11 +17,7 @@ ("dtype", "expected_lit"), [(None, [2, 2, 2]), (nw.String, ["2", "2", "2"]), (nw.Float32, [2.0, 2.0, 2.0])], ) -def test_lit( - constructor: Any, dtype: DType | None, expected_lit: list[Any], request: Any -) -> None: - if "dask" in str(constructor) and dtype == nw.String: - request.applymarker(pytest.mark.xfail) +def test_lit(constructor: Any, dtype: DType | None, expected_lit: list[Any]) -> None: data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]} df_raw = constructor(data) df = nw.from_native(df_raw).lazy()