From 5f91aa17c6cd9274851d219f4b50abe12c0fdfeb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 7 Sep 2024 08:43:20 +0100 Subject: [PATCH] [pre-commit.ci] pre-commit autoupdate (#818) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.7 → v0.6.3](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.7...v0.6.3) - [github.com/pre-commit/mirrors-mypy: v1.11.1 → v1.11.2](https://github.com/pre-commit/mirrors-mypy/compare/v1.11.1...v1.11.2) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * making ruff happy --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edoardo Abati <29585319+EdAbati@users.noreply.github.com> --- .pre-commit-config.yaml | 4 +- pyproject.toml | 9 ++ tests/expr_and_series/arithmetic_test.py | 4 +- tests/expr_and_series/dt/ordinal_day_test.py | 2 +- .../expr_and_series/dt/total_minutes_test.py | 2 +- tests/hypothesis/test_basic_arithmetic.py | 2 +- tests/hypothesis/test_concat.py | 2 +- tests/hypothesis/test_join.py | 6 +- tpch/notebooks/q1/execute.ipynb | 53 +++++----- tpch/notebooks/q10/execute.ipynb | 41 ++++---- tpch/notebooks/q11/execute.ipynb | 44 ++++----- tpch/notebooks/q15/execute.ipynb | 43 ++++---- tpch/notebooks/q17/execute.ipynb | 42 ++++---- tpch/notebooks/q18/execute.ipynb | 41 ++++---- tpch/notebooks/q19/execute.ipynb | 45 ++++----- tpch/notebooks/q2/execute.ipynb | 51 +++++----- tpch/notebooks/q20/execute.ipynb | 47 ++++----- tpch/notebooks/q21/execute.ipynb | 99 +++++++++---------- tpch/notebooks/q3/execute.ipynb | 84 ++++++++-------- tpch/notebooks/q4/execute.ipynb | 79 +++++++-------- tpch/notebooks/q5/execute.ipynb | 73 +++++++------- tpch/notebooks/q6/execute.ipynb | 77 ++++++++------- tpch/notebooks/q7/execute.ipynb | 85 ++++++++-------- tpch/notebooks/q9/execute.ipynb | 42 ++++---- 24 files changed, 501 insertions(+), 476 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 57e766f59..f3a68e7a0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.5.7' + rev: 'v0.6.3' hooks: # Run the formatter. - id: ruff-format @@ -9,7 +9,7 @@ repos: - id: ruff args: [--fix] - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.11.1' + rev: 'v1.11.2' hooks: - id: mypy additional_dependencies: ['polars==1.4.1', 'pytest==8.3.2'] diff --git a/pyproject.toml b/pyproject.toml index b3a2a0c28..c4a10603f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,15 @@ lint.ignore = [ "tpch/tests/*" = ["S101"] "utils/*" = ["S311", "PTH123"] "tpch/execute/*" = ["T201"] +"tpch/notebooks/*" = [ + "ANN001", + "ANN201", + "EM101", + "EXE002", + "PTH123", + "T203", + "TRY003", +] [tool.ruff.lint.pydocstyle] convention = "google" diff --git a/tests/expr_and_series/arithmetic_test.py b/tests/expr_and_series/arithmetic_test.py index 47d3e8ff0..7ff945c80 100644 --- a/tests/expr_and_series/arithmetic_test.py +++ b/tests/expr_and_series/arithmetic_test.py @@ -149,7 +149,7 @@ def test_truediv_same_dims(constructor_eager: Any, request: Any) -> None: compare_dicts({"a": result}, {"a": [2, 1, 1 / 3]}) -@pytest.mark.slow() +@pytest.mark.slow @given( # type: ignore[misc] left=st.integers(-100, 100), right=st.integers(-100, 100), @@ -189,7 +189,7 @@ def test_floordiv(left: int, right: int) -> None: compare_dicts(result, expected) -@pytest.mark.slow() +@pytest.mark.slow @given( # type: ignore[misc] left=st.integers(-100, 100), right=st.integers(-100, 100), diff --git a/tests/expr_and_series/dt/ordinal_day_test.py b/tests/expr_and_series/dt/ordinal_day_test.py index 1cb464259..2681188df 100644 --- a/tests/expr_and_series/dt/ordinal_day_test.py +++ b/tests/expr_and_series/dt/ordinal_day_test.py @@ -17,7 +17,7 @@ parse_version(pd.__version__) < parse_version("2.0.0"), reason="pyarrow dtype not available", ) -@pytest.mark.slow() +@pytest.mark.slow def test_ordinal_day(dates: datetime) -> None: result_pd = nw.from_native(pd.Series([dates]), series_only=True).dt.ordinal_day()[0] result_pdms = nw.from_native( diff --git a/tests/expr_and_series/dt/total_minutes_test.py b/tests/expr_and_series/dt/total_minutes_test.py index f2469e495..bcd664442 100644 --- a/tests/expr_and_series/dt/total_minutes_test.py +++ b/tests/expr_and_series/dt/total_minutes_test.py @@ -22,7 +22,7 @@ parse_version(pd.__version__) < parse_version("2.2.0"), reason="pyarrow dtype not available", ) -@pytest.mark.slow() +@pytest.mark.slow def test_total_minutes(timedeltas: timedelta) -> None: result_pd = nw.from_native( pd.Series([timedeltas]), series_only=True diff --git a/tests/hypothesis/test_basic_arithmetic.py b/tests/hypothesis/test_basic_arithmetic.py index 2ab7bad7b..00818271d 100644 --- a/tests/hypothesis/test_basic_arithmetic.py +++ b/tests/hypothesis/test_basic_arithmetic.py @@ -22,7 +22,7 @@ max_size=3, ), ) # type: ignore[misc] -@pytest.mark.slow() +@pytest.mark.slow def test_mean( integer: st.SearchStrategy[list[int]], floats: st.SearchStrategy[float], diff --git a/tests/hypothesis/test_concat.py b/tests/hypothesis/test_concat.py index 1b1248628..9ae54dbc4 100644 --- a/tests/hypothesis/test_concat.py +++ b/tests/hypothesis/test_concat.py @@ -31,7 +31,7 @@ ), how=st.sampled_from(["horizontal", "vertical"]), ) # type: ignore[misc] -@pytest.mark.slow() +@pytest.mark.slow @pytest.mark.skipif(is_windows(), reason="pyarrow breaking on windows") def test_concat( # pragma: no cover integers: list[int], diff --git a/tests/hypothesis/test_join.py b/tests/hypothesis/test_join.py index ebdb88757..bc1cd735c 100644 --- a/tests/hypothesis/test_join.py +++ b/tests/hypothesis/test_join.py @@ -42,7 +42,7 @@ ) # type: ignore[misc] @pytest.mark.skipif(pl_version < parse_version("0.20.13"), reason="0.0 == -0.0") @pytest.mark.skipif(pd_version < parse_version("2.0.0"), reason="requires pyarrow") -@pytest.mark.slow() +@pytest.mark.slow def test_join( # pragma: no cover integers: st.SearchStrategy[list[int]], other_integers: st.SearchStrategy[list[int]], @@ -88,7 +88,7 @@ def test_join( # pragma: no cover max_size=3, ), ) # type: ignore[misc] -@pytest.mark.slow() +@pytest.mark.slow @pytest.mark.skipif(pd_version < parse_version("2.0.0"), reason="requires pyarrow") def test_cross_join( # pragma: no cover integers: st.SearchStrategy[list[int]], @@ -135,7 +135,7 @@ def test_cross_join( # pragma: no cover st.sampled_from(["a", "b", "d"]), min_size=1, max_size=3, unique=True ), ) -@pytest.mark.slow() +@pytest.mark.slow @pytest.mark.filterwarnings("ignore:the default coalesce behavior") def test_left_join( # pragma: no cover a_left_data: list[int], diff --git a/tpch/notebooks/q1/execute.ipynb b/tpch/notebooks/q1/execute.ipynb index cc6dd4559..de9c52baa 100755 --- a/tpch/notebooks/q1/execute.ipynb +++ b/tpch/notebooks/q1/execute.ipynb @@ -58,10 +58,12 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "@nw.narwhalify\n", "def q1(lineitem_ds: Any) -> Any:\n", " var_1 = datetime(1998, 9, 2)\n", @@ -107,14 +109,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -133,16 +135,18 @@ }, "outputs": [], "source": [ - "import pyarrow.parquet as pq\n", "import dask.dataframe as dd\n", + "import pyarrow.parquet as pq\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'pyarrow': lambda x: pq.read_table(x),\n", - " 'dask': lambda x: dd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"pyarrow\": lambda x: pq.read_table(x),\n", + " \"dask\": lambda x: dd.read_parquet(x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"),\n", "}" ] }, @@ -171,7 +175,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pyarrow'\n", + "tool = \"pyarrow\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -210,7 +214,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(lineitem_ds=fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -249,7 +253,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -288,7 +292,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -327,7 +331,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" @@ -348,7 +352,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'dask'\n", + "tool = \"dask\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q1(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" @@ -370,8 +374,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q10/execute.ipynb b/tpch/notebooks/q10/execute.ipynb index 85ec0f14b..9ff211773 100644 --- a/tpch/notebooks/q10/execute.ipynb +++ b/tpch/notebooks/q10/execute.ipynb @@ -55,22 +55,23 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q10(\n", " customer_ds_raw: Any,\n", " nation_ds_raw: Any,\n", " lineitem_ds_raw: Any,\n", " orders_ds_raw: Any,\n", ") -> Any:\n", - "\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " line_item_ds = nw.from_native(lineitem_ds_raw)\n", " orders_ds = nw.from_native(orders_ds_raw)\n", " customer_ds = nw.from_native(customer_ds_raw)\n", - " \n", + "\n", " var1 = datetime(1993, 10, 1)\n", " var2 = datetime(1994, 1, 1)\n", "\n", @@ -81,8 +82,7 @@ " .filter(nw.col(\"o_orderdate\").is_between(var1, var2, closed=\"left\"))\n", " .filter(nw.col(\"l_returnflag\") == \"R\")\n", " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", - " .alias(\"revenue\")\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", " )\n", " .group_by(\n", " \"c_custkey\",\n", @@ -127,10 +127,10 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "customer = dir_ + 'customer.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "customer = dir_ + \"customer.parquet\"" ] }, { @@ -149,10 +149,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -196,7 +198,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -233,7 +235,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -270,7 +272,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -307,7 +309,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q10(fn(customer), fn(nation), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -327,8 +329,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q11/execute.ipynb b/tpch/notebooks/q11/execute.ipynb index 33951d922..f5bbc0f9c 100644 --- a/tpch/notebooks/q11/execute.ipynb +++ b/tpch/notebooks/q11/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,19 +56,19 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q11(\n", " partsupp_ds_raw: Any,\n", " nation_ds_raw: Any,\n", " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", " supplier_ds = nw.from_native(supplier_ds_raw)\n", "\n", - " \n", " var1 = \"GERMANY\"\n", " var2 = 0.0001\n", "\n", @@ -83,14 +83,9 @@ " )\n", "\n", " q_final = (\n", - " q1.with_columns(\n", - " (nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\"))\n", - " .alias(\"value\")\n", - " )\n", + " q1.with_columns((nw.col(\"ps_supplycost\") * nw.col(\"ps_availqty\")).alias(\"value\"))\n", " .group_by(\"ps_partkey\")\n", - " .agg(\n", - " nw.sum(\"value\")\n", - " )\n", + " .agg(nw.sum(\"value\"))\n", " .join(q2, how=\"cross\")\n", " .filter(nw.col(\"value\") > nw.col(\"tmp\"))\n", " .select(\"ps_partkey\", \"value\")\n", @@ -116,9 +111,9 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -137,10 +132,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -184,7 +181,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -221,7 +218,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -258,7 +255,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -295,7 +292,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q11(fn(partsupp), fn(nation), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -315,8 +312,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] }, { diff --git a/tpch/notebooks/q15/execute.ipynb b/tpch/notebooks/q15/execute.ipynb index 0baf11956..d108a7196 100644 --- a/tpch/notebooks/q15/execute.ipynb +++ b/tpch/notebooks/q15/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -55,32 +55,34 @@ }, "outputs": [], "source": [ + "from datetime import datetime\n", "from typing import Any\n", + "\n", "import narwhals as nw\n", - "from datetime import datetime\n", + "\n", "\n", "def q15(\n", " lineitem_ds_raw: Any,\n", " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " supplier_ds = nw.from_native(supplier_ds_raw)\n", - " \n", + "\n", " var1 = datetime(1996, 1, 1)\n", " var2 = datetime(1996, 4, 1)\n", "\n", " revenue = (\n", " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", " .with_columns(\n", - " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\")))\n", - " .alias(\"total_revenue\")\n", + " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\n", + " \"total_revenue\"\n", + " )\n", " )\n", " .group_by(\"l_suppkey\")\n", " .agg(nw.sum(\"total_revenue\"))\n", " .select(nw.col(\"l_suppkey\").alias(\"supplier_no\"), nw.col(\"total_revenue\"))\n", " )\n", - " \n", + "\n", " result = (\n", " supplier_ds.join(revenue, left_on=\"s_suppkey\", right_on=\"supplier_no\")\n", " .filter(nw.col(\"total_revenue\") == nw.col(\"total_revenue\").max())\n", @@ -108,8 +110,8 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "supplier = dir_ + 'supplier.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"" ] }, { @@ -128,10 +130,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -175,7 +179,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -212,7 +216,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -249,7 +253,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -286,7 +290,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q15(fn(lineitem), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -306,8 +310,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q17/execute.ipynb b/tpch/notebooks/q17/execute.ipynb index b13445d28..4d012f088 100644 --- a/tpch/notebooks/q17/execute.ipynb +++ b/tpch/notebooks/q17/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,25 +56,23 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", - "def q17(\n", - " lineitem_ds_raw: Any,\n", - " part_ds_raw: Any\n", - ") -> Any:\n", "\n", + "def q17(lineitem_ds_raw: Any, part_ds_raw: Any) -> Any:\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " part_ds = nw.from_native(part_ds_raw)\n", - " \n", + "\n", " var1 = \"Brand#23\"\n", " var2 = \"MED BOX\"\n", - " \n", + "\n", " query1 = (\n", " part_ds.filter(nw.col(\"p_brand\") == var1)\n", " .filter(nw.col(\"p_container\") == var2)\n", " .join(lineitem_ds, how=\"left\", left_on=\"p_partkey\", right_on=\"l_partkey\")\n", " )\n", - " \n", + "\n", " final_query = (\n", " query1.group_by(\"p_partkey\")\n", " .agg((0.2 * nw.col(\"l_quantity\").mean()).alias(\"avg_quantity\"))\n", @@ -84,7 +82,6 @@ " .select((nw.col(\"l_extendedprice\").sum() / 7.0).round(2).alias(\"avg_yearly\"))\n", " )\n", "\n", - "\n", " return nw.to_native(final_query)" ] }, @@ -104,8 +101,8 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "part = dir_ + 'part.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "part = dir_ + \"part.parquet\"" ] }, { @@ -124,10 +121,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -171,7 +170,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -208,7 +207,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -245,7 +244,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -282,7 +281,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q17(fn(lineitem), fn(part)).collect()\n", "results[tool] = timings.all_runs" @@ -302,8 +301,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q18/execute.ipynb b/tpch/notebooks/q18/execute.ipynb index c90629e0f..edf635d9e 100644 --- a/tpch/notebooks/q18/execute.ipynb +++ b/tpch/notebooks/q18/execute.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -29,18 +29,15 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", - "def q18(\n", - " customer_ds_raw: Any,\n", - " lineitem_ds_raw: Any,\n", - " orders_ds_raw: Any\n", - ") -> Any:\n", "\n", + "def q18(customer_ds_raw: Any, lineitem_ds_raw: Any, orders_ds_raw: Any) -> Any:\n", " customer_ds = nw.from_native(customer_ds_raw)\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " orders_ds = nw.from_native(orders_ds_raw)\n", - " \n", + "\n", " var1 = 300\n", "\n", " query1 = (\n", @@ -67,7 +64,6 @@ " .head(100)\n", " )\n", "\n", - "\n", " return nw.to_native(q_final)" ] }, @@ -78,9 +74,9 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'" + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"" ] }, { @@ -90,10 +86,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -119,7 +117,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -138,7 +136,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -157,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -176,7 +174,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q18(fn(customer), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -196,8 +194,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q19/execute.ipynb b/tpch/notebooks/q19/execute.ipynb index 8483e06d5..8860cc773 100644 --- a/tpch/notebooks/q19/execute.ipynb +++ b/tpch/notebooks/q19/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,14 +56,11 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", - "def q19(\n", - " lineitem_ds_raw: Any,\n", - " part_ds_raw: Any\n", - " \n", - ") -> Any:\n", "\n", + "def q19(lineitem_ds_raw: Any, part_ds_raw: Any) -> Any:\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " part_ds = nw.from_native(part_ds_raw)\n", "\n", @@ -74,9 +71,7 @@ " .filter(\n", " (\n", " (nw.col(\"p_brand\") == \"Brand#12\")\n", - " & nw.col(\"p_container\").is_in(\n", - " [\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"]\n", - " )\n", + " & nw.col(\"p_container\").is_in([\"SM CASE\", \"SM BOX\", \"SM PACK\", \"SM PKG\"])\n", " & (nw.col(\"l_quantity\").is_between(1, 11))\n", " & (nw.col(\"p_size\").is_between(1, 5))\n", " )\n", @@ -90,9 +85,7 @@ " )\n", " | (\n", " (nw.col(\"p_brand\") == \"Brand#34\")\n", - " & nw.col(\"p_container\").is_in(\n", - " [\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"]\n", - " )\n", + " & nw.col(\"p_container\").is_in([\"LG CASE\", \"LG BOX\", \"LG PACK\", \"LG PKG\"])\n", " & (nw.col(\"l_quantity\").is_between(20, 30))\n", " & (nw.col(\"p_size\").is_between(1, 15))\n", " )\n", @@ -105,7 +98,6 @@ " )\n", " )\n", "\n", - "\n", " return nw.to_native(result)" ] }, @@ -125,8 +117,8 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "part = dir_ + 'part.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "part = dir_ + \"part.parquet\"" ] }, { @@ -145,10 +137,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -192,7 +186,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -229,7 +223,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -266,7 +260,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part))\n", "results[tool] = timings.all_runs" @@ -303,7 +297,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q19(fn(lineitem), fn(part)).collect()\n", "results[tool] = timings.all_runs" @@ -323,8 +317,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q2/execute.ipynb b/tpch/notebooks/q2/execute.ipynb index c05345336..74ba50f2a 100755 --- a/tpch/notebooks/q2/execute.ipynb +++ b/tpch/notebooks/q2/execute.ipynb @@ -69,8 +69,10 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "@nw.narwhalify\n", "def q2(\n", " region_ds: Any,\n", @@ -140,14 +142,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -166,16 +168,18 @@ }, "outputs": [], "source": [ - "import pyarrow.parquet as pq\n", "import dask.dataframe as dd\n", + "import pyarrow.parquet as pq\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'pyarrow': lambda x: pq.read_table(x),\n", - " 'dask': lambda x: dd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"pyarrow\": lambda x: pq.read_table(x),\n", + " \"dask\": lambda x: dd.read_parquet(x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"),\n", "}" ] }, @@ -222,7 +226,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -261,7 +265,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -300,7 +304,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -339,7 +343,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).collect()\n", "results[tool] = timings.all_runs" @@ -360,7 +364,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pyarrow'\n", + "tool = \"pyarrow\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", "results[tool] = timings.all_runs" @@ -381,7 +385,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'dask'\n", + "tool = \"dask\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q2(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp)).compute()\n", "results[tool] = timings.all_runs" @@ -403,8 +407,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q20/execute.ipynb b/tpch/notebooks/q20/execute.ipynb index aecb3a473..a9698c1ad 100644 --- a/tpch/notebooks/q20/execute.ipynb +++ b/tpch/notebooks/q20/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -55,24 +55,25 @@ }, "outputs": [], "source": [ + "from datetime import datetime\n", "from typing import Any\n", + "\n", "import narwhals as nw\n", - "from datetime import datetime\n", + "\n", "\n", "def q20(\n", " part_ds_raw: Any,\n", " partsupp_ds_raw: Any,\n", " nation_ds_raw: Any,\n", " lineitem_ds_raw: Any,\n", - " supplier_ds_raw: Any\n", + " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " part_ds = nw.from_native(part_ds_raw)\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", " lineitem_ds = nw.from_native(lineitem_ds_raw)\n", " supplier_ds = nw.from_native(supplier_ds_raw)\n", - " \n", + "\n", " var1 = datetime(1994, 1, 1)\n", " var2 = datetime(1995, 1, 1)\n", " var3 = \"CANADA\"\n", @@ -82,7 +83,7 @@ " lineitem_ds.filter(nw.col(\"l_shipdate\").is_between(var1, var2, closed=\"left\"))\n", " .group_by(\"l_partkey\", \"l_suppkey\")\n", " .agg((nw.col(\"l_quantity\").sum()).alias(\"sum_quantity\"))\n", - " .with_columns(sum_quantity = nw.col(\"sum_quantity\") * 0.5)\n", + " .with_columns(sum_quantity=nw.col(\"sum_quantity\") * 0.5)\n", " )\n", " query2 = nation_ds.filter(nw.col(\"n_name\") == var3)\n", " query3 = supplier_ds.join(query2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", @@ -103,7 +104,6 @@ " .sort(\"s_name\")\n", " )\n", "\n", - "\n", " return nw.to_native(result)" ] }, @@ -123,11 +123,11 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -146,10 +146,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -193,7 +195,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -230,7 +232,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -267,7 +269,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -304,7 +306,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q20(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -324,8 +326,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q21/execute.ipynb b/tpch/notebooks/q21/execute.ipynb index b51b15dce..af12a424c 100755 --- a/tpch/notebooks/q21/execute.ipynb +++ b/tpch/notebooks/q21/execute.ipynb @@ -36,13 +36,12 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import date\n", - "\n", - "import narwhals as nw\n", "\n", "import pandas as pd\n", "import polars as pl\n", "\n", + "import narwhals as nw\n", + "\n", "pd.options.mode.copy_on_write = True\n", "pd.options.future.infer_string = True" ] @@ -66,10 +65,12 @@ "Q_NUM = 21\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -95,34 +96,28 @@ " orders_raw: Any,\n", " supplier_raw: Any,\n", ") -> Any:\n", - " \n", " lineitem = nw.from_native(lineitem_raw)\n", " nation = nw.from_native(nation_raw)\n", " orders = nw.from_native(orders_raw)\n", " supplier = nw.from_native(supplier_raw)\n", - " \n", + "\n", " var1 = \"SAUDI ARABIA\"\n", - " \n", - " \n", + "\n", " q1 = (\n", " lineitem.group_by(\"l_orderkey\")\n", - "# .agg(nw.col(\"l_suppkey\").len().alias(\"n_supp_by_order\"))\n", " .agg(nw.len().alias(\"n_supp_by_order\"))\n", " .filter(nw.col(\"n_supp_by_order\") > 1)\n", " .join(\n", " lineitem.filter(nw.col(\"l_receiptdate\") > nw.col(\"l_commitdate\")),\n", - "# on=\"l_orderkey\",\n", - " left_on=\"l_orderkey\", right_on=\"l_orderkey\",\n", + " left_on=\"l_orderkey\",\n", + " right_on=\"l_orderkey\",\n", " )\n", " )\n", "\n", " q_final = (\n", " q1.group_by(\"l_orderkey\")\n", - "# .agg(nw.col(\"l_suppkey\").len().alias(\"n_supp_by_order\"))\n", " .agg(nw.len().alias(\"n_supp_by_order\"))\n", - " .join(q1, left_on=\"l_orderkey\", right_on=\"l_orderkey\"\n", - " #on=\"l_orderkey\"\n", - " )\n", + " .join(q1, left_on=\"l_orderkey\", right_on=\"l_orderkey\")\n", " .join(supplier, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", " .join(nation, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", " .join(orders, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", @@ -155,10 +150,10 @@ "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", "\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'" + "lineitem = dir_ + \"lineitem.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"" ] }, { @@ -213,10 +208,15 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" @@ -255,9 +255,14 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" @@ -296,10 +301,15 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw)\n", "results[tool] = timings.all_runs" ] @@ -337,10 +347,15 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "\n", - "lineitem_raw, nation_raw, orders_raw, supplier_raw = fn(lineitem), fn(nation), fn(orders), fn(supplier)\n", + "lineitem_raw, nation_raw, orders_raw, supplier_raw = (\n", + " fn(lineitem),\n", + " fn(nation),\n", + " fn(orders),\n", + " fn(supplier),\n", + ")\n", "timings = %timeit -o -q q21(lineitem_raw, nation_raw, orders_raw, supplier_raw).collect()\n", "results[tool] = timings.all_runs" ] @@ -379,29 +394,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16", - "metadata": { - "papermill": { - "duration": 0.02616, - "end_time": "2024-06-20T09:46:18.666732", - "exception": false, - "start_time": "2024-06-20T09:46:18.640572", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "from pprint import pprint\n", "\n", - "pprint(results)" + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q3/execute.ipynb b/tpch/notebooks/q3/execute.ipynb index 80178cae1..b81135fc3 100755 --- a/tpch/notebooks/q3/execute.ipynb +++ b/tpch/notebooks/q3/execute.ipynb @@ -49,14 +49,15 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import date\n", + "from typing import Any\n", + "\n", "\n", "def q3_pandas_native(\n", " customer_ds: Any,\n", " line_item_ds: Any,\n", " orders_ds: Any,\n", - "):\n", + ") -> Any:\n", " var1 = \"BUILDING\"\n", " var2 = date(1995, 3, 15)\n", "\n", @@ -69,18 +70,15 @@ " jn2 = jn2[jn2[\"l_shipdate\"] > var2]\n", " jn2[\"revenue\"] = jn2.l_extendedprice * (1 - jn2.l_discount)\n", "\n", - " gb = jn2.groupby(\n", - " [\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"], as_index=False\n", - " )\n", + " gb = jn2.groupby([\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"], as_index=False)\n", " agg = gb[\"revenue\"].sum()\n", "\n", " sel = agg.loc[:, [\"o_orderkey\", \"revenue\", \"o_orderdate\", \"o_shippriority\"]]\n", " sel = sel.rename({\"o_orderkey\": \"l_orderkey\"}, axis=\"columns\")\n", "\n", " sorted = sel.sort_values(by=[\"revenue\", \"o_orderdate\"], ascending=[False, True])\n", - " result_df = sorted.head(10)\n", "\n", - " return result_df # type: ignore[no-any-return]" + " return sorted.head(10) # type: ignore[no-any-return]" ] }, { @@ -99,10 +97,12 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q3(\n", " customer_ds_raw: Any,\n", " line_item_ds_raw: Any,\n", @@ -122,7 +122,8 @@ " .filter(\n", " nw.col(\"o_orderdate\") < var_2,\n", " nw.col(\"l_shipdate\") > var_1,\n", - " ).with_columns(\n", + " )\n", + " .with_columns(\n", " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", " )\n", " .group_by([\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"])\n", @@ -150,16 +151,16 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", + "\n", "import ibis\n", "\n", + "\n", "def q3_ibis(\n", " customer: Any,\n", " lineitem: Any,\n", " orders: Any,\n", " *,\n", - " tool,\n", + " tool: str,\n", ") -> Any:\n", " var1 = \"BUILDING\"\n", " var2 = date(1995, 3, 15)\n", @@ -186,9 +187,9 @@ " .order_by(ibis.desc(\"revenue\"), \"o_orderdate\")\n", " .limit(10)\n", " )\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -210,14 +211,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -236,18 +237,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -276,7 +279,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", + "tool = \"pandas[pyarrow][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='pandas')\n", "results[tool] = timings.all_runs" @@ -297,7 +300,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3_ibis(fn(customer), fn(lineitem), fn(orders), tool='polars')\n", "results[tool] = timings.all_runs" @@ -318,10 +321,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -357,7 +360,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -396,7 +399,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -435,7 +438,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -474,7 +477,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q3(fn(customer), fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -496,8 +499,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q4/execute.ipynb b/tpch/notebooks/q4/execute.ipynb index df07c9c5f..b0a55e345 100755 --- a/tpch/notebooks/q4/execute.ipynb +++ b/tpch/notebooks/q4/execute.ipynb @@ -52,6 +52,7 @@ "from datetime import date\n", "from typing import Any\n", "\n", + "\n", "def q4_pandas_native(\n", " line_item_ds: Any,\n", " orders_ds: Any,\n", @@ -72,9 +73,7 @@ " gb = jn.groupby(\"o_orderpriority\", as_index=False)\n", " agg = gb.agg(order_count=pd.NamedAgg(column=\"o_orderkey\", aggfunc=\"count\"))\n", "\n", - " result_df = agg.sort_values([\"o_orderpriority\"])\n", - "\n", - " return result_df # type: ignore[no-any-return]" + " return agg.sort_values([\"o_orderpriority\"]) # type: ignore[no-any-return]" ] }, { @@ -93,10 +92,12 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q4(\n", " lineitem_ds_raw: Any,\n", " orders_ds_raw: Any,\n", @@ -112,7 +113,8 @@ " .filter(\n", " nw.col(\"o_orderdate\").is_between(var_1, var_2, closed=\"left\"),\n", " nw.col(\"l_commitdate\") < nw.col(\"l_receiptdate\"),\n", - " ).unique(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", + " )\n", + " .unique(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", " .group_by(\"o_orderpriority\")\n", " .agg(nw.len().alias(\"order_count\"))\n", " .sort(by=\"o_orderpriority\")\n", @@ -130,15 +132,11 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", + "\n", "import ibis\n", "\n", - "def q4_ibis(\n", - " lineitem: Any,\n", - " orders: Any,\n", - " *,\n", - " tool: str\n", - ") -> Any:\n", + "\n", + "def q4_ibis(lineitem: Any, orders: Any, *, tool: str) -> Any:\n", " var1 = datetime(1993, 7, 1)\n", " var2 = datetime(1993, 10, 1)\n", "\n", @@ -151,9 +149,9 @@ " .agg(order_count=ibis._.count())\n", " .order_by(\"o_orderpriority\")\n", " )\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -175,14 +173,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -201,18 +199,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -241,7 +241,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4_ibis(fn(lineitem), fn(orders), tool='polars')\n", "results[tool] = timings.all_runs" @@ -262,10 +262,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4_pandas_native(fn(lineitem), fn(orders))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -301,7 +301,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -340,7 +340,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -379,7 +379,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders))\n", "results[tool] = timings.all_runs" @@ -418,7 +418,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q4(fn(lineitem), fn(orders)).collect()\n", "results[tool] = timings.all_runs" @@ -440,8 +440,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q5/execute.ipynb b/tpch/notebooks/q5/execute.ipynb index 5f6df9bbc..da0cae78b 100755 --- a/tpch/notebooks/q5/execute.ipynb +++ b/tpch/notebooks/q5/execute.ipynb @@ -49,8 +49,9 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import date\n", + "from typing import Any\n", + "\n", "\n", "def q5_pandas_native(\n", " region_ds: Any,\n", @@ -79,9 +80,8 @@ " jn5[\"revenue\"] = jn5.l_extendedprice * (1.0 - jn5.l_discount)\n", "\n", " gb = jn5.groupby(\"n_name\", as_index=False)[\"revenue\"].sum()\n", - " result_df = gb.sort_values(\"revenue\", ascending=False)\n", "\n", - " return result_df # type: ignore[no-any-return]" + " return gb.sort_values(\"revenue\", ascending=False) # type: ignore[no-any-return]" ] }, { @@ -91,10 +91,12 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q5(\n", " region_ds_raw: Any,\n", " nation_ds_raw: Any,\n", @@ -126,7 +128,7 @@ " )\n", " .filter(\n", " nw.col(\"r_name\") == var_1,\n", - " nw.col(\"o_orderdate\").is_between(var_2, var_3, closed=\"left\")\n", + " nw.col(\"o_orderdate\").is_between(var_2, var_3, closed=\"left\"),\n", " )\n", " .with_columns(\n", " (nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))).alias(\"revenue\")\n", @@ -147,10 +149,10 @@ "outputs": [], "source": [ "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", + "\n", "import ibis\n", "\n", + "\n", "def q5_ibis(\n", " region: Any,\n", " nation: Any,\n", @@ -183,9 +185,9 @@ " .order_by(ibis.desc(\"revenue\"))\n", " )\n", "\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -207,14 +209,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -233,18 +235,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -273,7 +277,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5_ibis(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", "results[tool] = timings.all_runs" @@ -294,10 +298,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5_pandas_native(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -333,7 +337,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -372,7 +376,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -411,7 +415,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -450,7 +454,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q5(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -472,8 +476,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q6/execute.ipynb b/tpch/notebooks/q6/execute.ipynb index b101aa98d..5abcb65f0 100755 --- a/tpch/notebooks/q6/execute.ipynb +++ b/tpch/notebooks/q6/execute.ipynb @@ -50,6 +50,7 @@ "source": [ "from datetime import date\n", "\n", + "\n", "def q6_pandas_native(line_item_ds):\n", " var1 = date(1994, 1, 1)\n", " var2 = date(1995, 1, 1)\n", @@ -66,9 +67,8 @@ " ]\n", "\n", " result_value = (flineitem[\"l_extendedprice\"] * flineitem[\"l_discount\"]).sum()\n", - " result_df = pd.DataFrame({\"revenue\": [result_value]})\n", "\n", - " return result_df" + " return pd.DataFrame({\"revenue\": [result_value]})" ] }, { @@ -87,10 +87,11 @@ }, "outputs": [], "source": [ - "from typing import Any\n", "from datetime import datetime\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q6(line_item_raw) -> None:\n", " var_1 = datetime(1994, 1, 1)\n", " var_2 = datetime(1995, 1, 1)\n", @@ -103,12 +104,11 @@ " nw.col(\"l_shipdate\").is_between(var_1, var_2, closed=\"left\"),\n", " nw.col(\"l_discount\").is_between(0.05, 0.07),\n", " nw.col(\"l_quantity\") < var_3,\n", - " ).with_columns(\n", - " (nw.col(\"l_extendedprice\") * nw.col(\"l_discount\")).alias(\"revenue\")\n", " )\n", + " .with_columns((nw.col(\"l_extendedprice\") * nw.col(\"l_discount\")).alias(\"revenue\"))\n", " .select(nw.sum(\"revenue\"))\n", " )\n", - " return nw.to_native(result)\n" + " return nw.to_native(result)" ] }, { @@ -118,10 +118,6 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", - "\n", "def q6_ibis(lineitem, *, tool: str) -> None:\n", " var1 = datetime(1994, 1, 1)\n", " var2 = datetime(1995, 1, 1)\n", @@ -138,12 +134,12 @@ " .mutate(revenue=ibis._[\"l_extendedprice\"] * (ibis._[\"l_discount\"]))\n", " .agg(revenue=ibis._[\"revenue\"].sum())\n", " )\n", - " \n", - " if tool == 'pandas':\n", + "\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", - " raise ValueError(\"expected pandas or polars\")\n" + " raise ValueError(\"expected pandas or polars\")" ] }, { @@ -163,14 +159,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -195,12 +191,16 @@ "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -229,7 +229,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", + "tool = \"pandas[pyarrow][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='pandas')\n", "results[tool] = timings.all_runs" @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6_ibis(fn(lineitem), tool='polars')\n", "results[tool] = timings.all_runs" @@ -271,10 +271,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6_pandas_native(fn(lineitem))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -310,7 +310,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -349,7 +349,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -388,7 +388,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem))\n", "results[tool] = timings.all_runs" @@ -427,7 +427,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q6(fn(lineitem)).collect()\n", "results[tool] = timings.all_runs" @@ -449,8 +449,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q7/execute.ipynb b/tpch/notebooks/q7/execute.ipynb index 1213043b0..8711d7505 100755 --- a/tpch/notebooks/q7/execute.ipynb +++ b/tpch/notebooks/q7/execute.ipynb @@ -49,10 +49,13 @@ "metadata": {}, "outputs": [], "source": [ + "from datetime import date\n", + "from datetime import datetime\n", "from typing import Any\n", - "from datetime import datetime, date\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q7_pandas_native(\n", " nation_ds,\n", " customer_ds,\n", @@ -96,9 +99,7 @@ " gb = total.groupby([\"supp_nation\", \"cust_nation\", \"l_year\"], as_index=False)\n", " agg = gb.agg(revenue=pd.NamedAgg(column=\"volume\", aggfunc=\"sum\"))\n", "\n", - " result_df = agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", - "\n", - " return result_df # type: ignore[no-any-return]" + " return agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"]) # type: ignore[no-any-return]" ] }, { @@ -117,10 +118,6 @@ }, "outputs": [], "source": [ - "from typing import Any\n", - "from datetime import datetime\n", - "import narwhals as nw\n", - "\n", "def q7(\n", " nation_ds,\n", " customer_ds,\n", @@ -171,7 +168,7 @@ " .agg(nw.sum(\"volume\").alias(\"revenue\"))\n", " .sort(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", " )\n", - " return nw.to_native(result)\n" + " return nw.to_native(result)" ] }, { @@ -181,18 +178,11 @@ "metadata": {}, "outputs": [], "source": [ - "from typing import Any\n", - "from datetime import datetime\n", "import ibis\n", "\n", + "\n", "def q7_ibis(\n", - " nation: Any,\n", - " customer: Any,\n", - " lineitem: Any,\n", - " orders: Any,\n", - " supplier: Any,\n", - " *,\n", - " tool: str\n", + " nation: Any, customer: Any, lineitem: Any, orders: Any, supplier: Any, *, tool: str\n", ") -> None:\n", " var1 = \"FRANCE\"\n", " var2 = \"GERMANY\"\n", @@ -234,9 +224,9 @@ " .order_by(\"supp_nation\", \"cust_nation\", \"l_year\")\n", " )\n", "\n", - " if tool == 'pandas':\n", + " if tool == \"pandas\":\n", " return q_final.to_pandas()\n", - " if tool == 'polars':\n", + " if tool == \"polars\":\n", " return q_final.to_polars()\n", " raise ValueError(\"expected pandas or polars\")" ] @@ -258,14 +248,14 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "region = dir_ + 'region.parquet'\n", - "nation = dir_ + 'nation.parquet'\n", - "customer = dir_ + 'customer.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "region = dir_ + \"region.parquet\"\n", + "nation = dir_ + \"nation.parquet\"\n", + "customer = dir_ + \"customer.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -284,18 +274,20 @@ }, "outputs": [], "source": [ - "import ibis\n", - "\n", "con_pd = ibis.pandas.connect()\n", "con_pl = ibis.polars.connect()\n", "\n", "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'pandas[pyarrow][ibis]': lambda x: con_pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", - " 'polars[lazy][ibis]': lambda x: con_pl.read_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"pandas[pyarrow][ibis]\": lambda x: con_pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", + " \"polars[lazy][ibis]\": lambda x: con_pl.read_parquet(x),\n", "}" ] }, @@ -324,7 +316,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow][ibis]'\n", + "tool = \"pandas[pyarrow][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='pandas')\n", "results[tool] = timings.all_runs" @@ -345,7 +337,7 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'polars[lazy][ibis]'\n", + "tool = \"polars[lazy][ibis]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7_ibis(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier), tool='polars')\n", "results[tool] = timings.all_runs" @@ -366,10 +358,10 @@ "metadata": {}, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", - "results[tool+'[native]'] = timings.all_runs" + "results[tool + \"[native]\"] = timings.all_runs" ] }, { @@ -405,7 +397,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -444,7 +436,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -483,7 +475,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -522,7 +514,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q7(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -544,8 +536,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ], diff --git a/tpch/notebooks/q9/execute.ipynb b/tpch/notebooks/q9/execute.ipynb index 86417e180..802799a01 100644 --- a/tpch/notebooks/q9/execute.ipynb +++ b/tpch/notebooks/q9/execute.ipynb @@ -15,7 +15,7 @@ }, "outputs": [], "source": [ - "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals " + "!pip uninstall apache-beam -y && pip install -U pandas polars pyarrow narwhals" ] }, { @@ -56,8 +56,10 @@ "outputs": [], "source": [ "from typing import Any\n", + "\n", "import narwhals as nw\n", "\n", + "\n", "def q9(\n", " part_ds_raw: Any,\n", " partsupp_ds_raw: Any,\n", @@ -66,7 +68,6 @@ " orders_ds_raw: Any,\n", " supplier_ds_raw: Any,\n", ") -> Any:\n", - "\n", " part_ds = nw.from_native(part_ds_raw)\n", " nation_ds = nw.from_native(nation_ds_raw)\n", " partsupp_ds = nw.from_native(partsupp_ds_raw)\n", @@ -91,7 +92,7 @@ " (\n", " nw.col(\"l_extendedprice\") * (1 - nw.col(\"l_discount\"))\n", " - nw.col(\"ps_supplycost\") * nw.col(\"l_quantity\")\n", - " ).alias(\"amount\")\n", + " ).alias(\"amount\"),\n", " )\n", " .group_by(\"nation\", \"o_year\")\n", " .agg(nw.sum(\"amount\").alias(\"sum_profit\"))\n", @@ -117,12 +118,12 @@ "outputs": [], "source": [ "dir_ = \"/kaggle/input/tpc-h-data-parquet-s-2/\"\n", - "nation = dir_ + 'nation.parquet'\n", - "lineitem = dir_ + 'lineitem.parquet'\n", - "orders = dir_ + 'orders.parquet'\n", - "supplier = dir_ + 'supplier.parquet'\n", - "part = dir_ + 'part.parquet'\n", - "partsupp = dir_ + 'partsupp.parquet'" + "nation = dir_ + \"nation.parquet\"\n", + "lineitem = dir_ + \"lineitem.parquet\"\n", + "orders = dir_ + \"orders.parquet\"\n", + "supplier = dir_ + \"supplier.parquet\"\n", + "part = dir_ + \"part.parquet\"\n", + "partsupp = dir_ + \"partsupp.parquet\"" ] }, { @@ -141,10 +142,12 @@ "outputs": [], "source": [ "IO_FUNCS = {\n", - " 'pandas': lambda x: pd.read_parquet(x, engine='pyarrow'),\n", - " 'pandas[pyarrow]': lambda x: pd.read_parquet(x, engine='pyarrow', dtype_backend='pyarrow'),\n", - " 'polars[eager]': lambda x: pl.read_parquet(x),\n", - " 'polars[lazy]': lambda x: pl.scan_parquet(x),\n", + " \"pandas\": lambda x: pd.read_parquet(x, engine=\"pyarrow\"),\n", + " \"pandas[pyarrow]\": lambda x: pd.read_parquet(\n", + " x, engine=\"pyarrow\", dtype_backend=\"pyarrow\"\n", + " ),\n", + " \"polars[eager]\": lambda x: pl.read_parquet(x),\n", + " \"polars[lazy]\": lambda x: pl.scan_parquet(x),\n", "}" ] }, @@ -188,7 +191,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas'\n", + "tool = \"pandas\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -225,7 +228,7 @@ }, "outputs": [], "source": [ - "tool = 'pandas[pyarrow]'\n", + "tool = \"pandas[pyarrow]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -262,7 +265,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[eager]'\n", + "tool = \"polars[eager]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier))\n", "results[tool] = timings.all_runs" @@ -299,7 +302,7 @@ }, "outputs": [], "source": [ - "tool = 'polars[lazy]'\n", + "tool = \"polars[lazy]\"\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o -q q9(fn(part), fn(partsupp), fn(nation), fn(lineitem), fn(orders), fn(supplier)).collect()\n", "results[tool] = timings.all_runs" @@ -319,8 +322,9 @@ "outputs": [], "source": [ "import json\n", - "with open('results.json', 'w') as fd:\n", - " json.dump(results, fd)\n" + "\n", + "with open(\"results.json\", \"w\") as fd:\n", + " json.dump(results, fd)" ] } ],