From 7d7da411aca15582104baf84eda65adf354e6ba2 Mon Sep 17 00:00:00 2001 From: ritchie Date: Sun, 2 Jun 2024 14:19:54 +0200 Subject: [PATCH 1/2] add exists -> semi in q4 --- queries/dask/q4.py | 6 ++---- queries/polars/q4.py | 9 ++++++--- requirements.in | 1 + 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/queries/dask/q4.py b/queries/dask/q4.py index 7bc85a3..b3b3e79 100644 --- a/queries/dask/q4.py +++ b/queries/dask/q4.py @@ -26,12 +26,10 @@ def query() -> pd.DataFrame: var1 = date(1993, 7, 1) var2 = date(1993, 10, 1) - jn = line_item_ds.merge(orders_ds, left_on="l_orderkey", right_on="o_orderkey") + exists = line_item_ds[line_item_ds["l_commitdate"] < line_item_ds["l_receiptdate"]] + jn = orders_ds.merge(exists, left_on="o_orderkey", right_on="l_orderkey", how="leftsemi") jn = jn[(jn["o_orderdate"] >= var1) & (jn["o_orderdate"] < var2)] - jn = jn[jn["l_commitdate"] < jn["l_receiptdate"]] - - jn = jn.drop_duplicates(subset=["o_orderpriority", "l_orderkey"]) gb = jn.groupby("o_orderpriority") agg = gb.agg( diff --git a/queries/polars/q4.py b/queries/polars/q4.py index ae1d6b9..e80c61c 100644 --- a/queries/polars/q4.py +++ b/queries/polars/q4.py @@ -15,10 +15,13 @@ def q() -> None: var2 = date(1993, 10, 1) q_final = ( - lineitem.join(orders, left_on="l_orderkey", right_on="o_orderkey") + # SQL exists translates to semi join in Polars API + orders.join(( + lineitem.filter( + pl.col("l_commitdate") < pl.col("l_receiptdate") + ) + ), left_on="o_orderkey", right_on="l_orderkey", how="semi") .filter(pl.col("o_orderdate").is_between(var1, var2, closed="left")) - .filter(pl.col("l_commitdate") < pl.col("l_receiptdate")) - .unique(subset=["o_orderpriority", "l_orderkey"]) .group_by("o_orderpriority") .agg(pl.len().alias("order_count")) .sort("o_orderpriority") diff --git a/requirements.in b/requirements.in index 1fbe6cd..8ab5049 100644 --- a/requirements.in +++ b/requirements.in @@ -1,4 +1,5 @@ dask[dataframe] +dask-expr duckdb modin[ray] pandas>=2.0 From 6cc4fd6f1cca09023cc9b2eb6d0daabe774cce9e Mon Sep 17 00:00:00 2001 From: ritchie Date: Sun, 2 Jun 2024 14:22:37 +0200 Subject: [PATCH 2/2] fmt --- queries/dask/q4.py | 8 ++++++-- queries/polars/q4.py | 11 ++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/queries/dask/q4.py b/queries/dask/q4.py index b3b3e79..2b4f28e 100644 --- a/queries/dask/q4.py +++ b/queries/dask/q4.py @@ -26,9 +26,13 @@ def query() -> pd.DataFrame: var1 = date(1993, 7, 1) var2 = date(1993, 10, 1) - exists = line_item_ds[line_item_ds["l_commitdate"] < line_item_ds["l_receiptdate"]] + exists = line_item_ds[ + line_item_ds["l_commitdate"] < line_item_ds["l_receiptdate"] + ] - jn = orders_ds.merge(exists, left_on="o_orderkey", right_on="l_orderkey", how="leftsemi") + jn = orders_ds.merge( + exists, left_on="o_orderkey", right_on="l_orderkey", how="leftsemi" + ) jn = jn[(jn["o_orderdate"] >= var1) & (jn["o_orderdate"] < var2)] gb = jn.groupby("o_orderpriority") diff --git a/queries/polars/q4.py b/queries/polars/q4.py index e80c61c..b250b07 100644 --- a/queries/polars/q4.py +++ b/queries/polars/q4.py @@ -16,11 +16,12 @@ def q() -> None: q_final = ( # SQL exists translates to semi join in Polars API - orders.join(( - lineitem.filter( - pl.col("l_commitdate") < pl.col("l_receiptdate") - ) - ), left_on="o_orderkey", right_on="l_orderkey", how="semi") + orders.join( + (lineitem.filter(pl.col("l_commitdate") < pl.col("l_receiptdate"))), + left_on="o_orderkey", + right_on="l_orderkey", + how="semi", + ) .filter(pl.col("o_orderdate").is_between(var1, var2, closed="left")) .group_by("o_orderpriority") .agg(pl.len().alias("order_count"))