Skip to content

Commit

Permalink
add datafusion to parity tests
Browse files Browse the repository at this point in the history
  • Loading branch information
tokoko committed Aug 22, 2024
1 parent 58c3250 commit 411766e
Show file tree
Hide file tree
Showing 28 changed files with 142 additions and 60 deletions.
5 changes: 5 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
FROM mcr.microsoft.com/vscode/devcontainers/python:3.10-buster
USER vscode
RUN curl -sSL https://install.python-poetry.org | python -
RUN /home/vscode/.local/bin/poetry config virtualenvs.in-project true
USER root
24 changes: 24 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"name": "ibis-substrait-devcontainer",
"build": {
"context": "..",
"dockerfile": "Dockerfile"
},

// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
"ghcr.io/devcontainers/features/nix:1": {}
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "poetry install"

// Configure tool-specific properties.
// "customizations": {},

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
2 changes: 1 addition & 1 deletion ibis_substrait/compiler/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,7 @@ def _translate_join_type(join_kind: ops.JoinKind) -> stalg.JoinRel.JoinType.V:
"left": stalg.JoinRel.JoinType.JOIN_TYPE_LEFT,
"right": stalg.JoinRel.JoinType.JOIN_TYPE_RIGHT,
"outer": stalg.JoinRel.JoinType.JOIN_TYPE_OUTER,
"semi": stalg.JoinRel.JoinType.JOIN_TYPE_SEMI,
"semi": stalg.JoinRel.JoinType.JOIN_TYPE_LEFT_SEMI,
# "asof",
# "anti",
# "any_inner",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1285,7 +1285,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1032,7 +1032,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1358,7 +1358,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1085,7 +1085,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -704,7 +704,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -773,7 +773,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -774,7 +774,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1426,7 +1426,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1245,7 +1245,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,7 @@
}
],
"version": {
"minorNumber": 52,
"minorNumber": 54,
"producer": "ibis-substrait"
}
}
46 changes: 40 additions & 6 deletions ibis_substrait/tests/compiler/test_parity.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,38 @@ def run_query_duckdb_substrait(expr, datasets, compiler):
return result.fetch_arrow_table()


def run_query_datafusion(expr, datasets, compiler):
import pyarrow
import datafusion.substrait

connection = datafusion.SessionContext()

for k, v in datasets.items(): # noqa: B007
connection.deregister_table(k)
connection.register_record_batches(k, [v.to_batches()])

plan = compiler.compile(expr)

plan_data = plan.SerializeToString()
substrait_plan = datafusion.substrait.serde.deserialize_bytes(plan_data)
logical_plan = datafusion.substrait.consumer.from_substrait_plan(
connection, substrait_plan
)

df = connection.create_dataframe_from_logical_plan(logical_plan)
for column_number, column_name in enumerate(df.schema().names):
df = df.with_column_renamed(
column_name, plan.relations[0].root.names[column_number]
)
return df.to_arrow_table()


def run_parity_tests(expr, datasets, compiler, engines=None):
if engines is None:
engines = ["acero"] # duckdb_substrait disabled because can't run on windows
engines = [
"acero",
"datafusion",
] # duckdb_substrait disabled because can't run on windows
res_duckdb = sort_pyarrow_table(run_query_duckdb(expr, datasets))
if "acero" in engines:
res_acero = sort_pyarrow_table(run_query_acero(expr, datasets, compiler))
Expand All @@ -90,6 +119,12 @@ def run_parity_tests(expr, datasets, compiler, engines=None):
)
assert res_duckdb_substrait.equals(res_duckdb)

if "datafusion" in engines:
res_datafusion = sort_pyarrow_table(
run_query_datafusion(expr, datasets, compiler)
)
assert res_datafusion.equals(res_duckdb)


orders_raw = [
("order_id", "int64", [1, 2, 3, 4]),
Expand Down Expand Up @@ -176,7 +211,7 @@ def test_filter_groupby():
)

compiler = SubstraitCompiler()
run_parity_tests(grouped_table, datasets, compiler=compiler)
run_parity_tests(grouped_table, datasets, compiler=compiler, engines=["acero"])


def test_filter_groupby_count_distinct():
Expand All @@ -200,7 +235,7 @@ def test_aggregate_having():
)

compiler = SubstraitCompiler()
run_parity_tests(expr, datasets, compiler=compiler)
run_parity_tests(expr, datasets, compiler=compiler, engines=["acero"])


def test_inner_join_chain():
Expand All @@ -219,14 +254,13 @@ def test_union():
run_parity_tests(expr, datasets, compiler=compiler)


# TODO acero doesn't seem to support this, maybe run duckdb on both sides?
def test_window():
expr = orders.select(
orders["order_total"].mean().over(ibis.window(group_by="fk_store_id"))
)

compiler = SubstraitCompiler()
run_parity_tests(expr, datasets, compiler=compiler, engines=[])
run_parity_tests(expr, datasets, compiler=compiler, engines=["datafusion"])


def test_is_in():
Expand All @@ -240,4 +274,4 @@ def test_scalar_subquery():
expr = orders.filter(orders["order_total"] == orders["order_total"].max())

compiler = SubstraitCompiler()
run_parity_tests(expr, datasets, compiler=compiler, engines=[])
run_parity_tests(expr, datasets, compiler=compiler, engines=["datafusion"])
Loading

0 comments on commit 411766e

Please sign in to comment.