Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH#2256 Introduce a query optimizer concept #2257

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions rdflib/plugins/sparql/_contrib/valuesToTheLeftOfTheJoins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Move a VALUES clause to the left of the join.
This is usually beneficial because the VALUES clause typically yields far fewer
solutions than the other operand of the join.
"""

from typing import Any

from rdflib.plugins.sparql.sparql import Query


class ValuesToTheLeftOfTheJoin:
    """Query translator that puts VALUES operands on the left side of joins.

    The algebra is walked through the ``p``, ``p1`` and ``p2`` children of each
    node. Whenever a ``Join`` has a ``ToMultiSet`` node as its right operand and
    something else as its left operand, the two operands are swapped.
    """

    @classmethod
    def translate(cls, query: Query) -> Query:
        """Rewrite ``query.algebra`` in place and return the same query object.

        :param query: the translated query whose algebra should be optimized.
        :returns: ``query`` itself, with its ``algebra`` attribute reassigned.
        """
        query.algebra = cls._optimize_node(query.algebra)
        return query

    @classmethod
    def _optimize_node(cls, cv: Any) -> Any:
        """Recursively optimize the algebra node ``cv`` and return it.

        The node is modified in place. It must expose ``name``, ``p``, ``p1``
        and ``p2`` as attributes (yielding ``None`` when absent) and support a
        mapping-style ``update``.

        :param cv: the algebra node to optimize.
        :returns: ``cv`` itself.
        """
        if cv.name == "Join":
            # Optimize the operands first so that joins nested below this one
            # are rewritten too, whether or not this join gets swapped.
            left = cls._optimize_node(cv.p1)
            right = cls._optimize_node(cv.p2)
            if left.name != "ToMultiSet" and right.name == "ToMultiSet":
                left, right = right, left
            # Keyword form is required: a mapping's update() accepts at most
            # one positional argument, so update(left, right) raises TypeError.
            cv.update(p1=left, p2=right)
        elif cv.p is not None:
            cv.update(p=cls._optimize_node(cv.p))
        elif cv.p1 is not None:
            cv.update(p1=cls._optimize_node(cv.p1))
            if cv.p2 is not None:
                cv.update(p2=cls._optimize_node(cv.p2))
        return cv
23 changes: 23 additions & 0 deletions rdflib/plugins/sparql/optimizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from __future__ import annotations

"""
This module contains the standard optimizers for SPARQL queries.

"""
import re
from typing import Any, Callable

from rdflib import Literal
from rdflib.plugins.sparql.algebra import CompValue, Expr, Join, Values
from rdflib.plugins.sparql.operators import Builtin_CONTAINS, Builtin_REGEX
from rdflib.plugins.sparql.sparql import Query

"""
An interface for optimizers that transform a query algebra into a version that
is hopefully faster to evaluate.
"""


class SPARQLOptimizer:
    """Base optimizer whose ``optimize`` hands the query back unchanged."""

    def optimize(self, query: Query) -> Query:
        """Return ``query`` as received; this implementation rewrites nothing.

        :param query: the query to optimize.
        :returns: the very same ``query`` object.
        """
        return query
11 changes: 9 additions & 2 deletions rdflib/plugins/sparql/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from __future__ import annotations

from typing import Any, Mapping, Optional, Union
from typing import Any, Callable, List, Mapping, Optional, Union

from rdflib.graph import Graph
from rdflib.plugins.sparql.algebra import translateQuery, translateUpdate
Expand Down Expand Up @@ -106,9 +106,13 @@ def update(
return evalUpdate(self.graph, strOrQuery, initBindings)


# Signature of a query translator: a callable that takes a Query and returns a
# Query. SPARQLProcessor.query applies each configured translator in turn.
_QueryTranslatorType = Callable[[Query], Query]


class SPARQLProcessor(Processor):
def __init__(self, graph):
def __init__(self, graph, translators: Optional[List[_QueryTranslatorType]] = None):
self.graph = graph
self.translators = translators

# NOTE on type error: this is because the super type constructor does not
# accept base argument and the position of the DEBUG argument is
Expand Down Expand Up @@ -144,4 +148,7 @@ def query( # type: ignore[override]
if isinstance(strOrQuery, str):
strOrQuery = translateQuery(parseQuery(strOrQuery), base, initNs)

# ``translators`` defaults to None in __init__; fall back to an empty tuple
# so a processor built without translators still evaluates queries instead
# of raising "TypeError: 'NoneType' object is not iterable".
for translator in self.translators or ():
    strOrQuery = translator(strOrQuery)

return evalQuery(self.graph, strOrQuery, initBindings, base)
71 changes: 71 additions & 0 deletions test/test_sparql/test_contrib_query_translators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from rdflib import Graph
from rdflib.plugins.sparql._contrib.valuesToTheLeftOfTheJoins import (
ValuesToTheLeftOfTheJoin,
)
from rdflib.plugins.sparql.parser import *

# from rdflib.plugins.sparql.processor import prepareQuery
from rdflib.plugins.sparql.processor import parseQuery, translateQuery

# Un-optimized input for test_values_to_left: the VALUES clause follows the
# triple pattern.
query_slow = """
PREFIX ex:<https://example.org/>

SELECT ?x {
?x ?y ?z .
VALUES (?x) {
(ex:1)
(ex:2)
(ex:3)
}
}
"""

# Expected shape for test_values_to_left: the same query with the VALUES clause
# placed before the triple pattern.
query_fast = """
PREFIX ex:<https://example.org/>

SELECT ?x {
VALUES (?x) {
(ex:1)
(ex:2)
(ex:3)
}
?x ?y ?z .
}
"""

# Not referenced by any test visible in this module.
# NOTE(review): the first regex() argument is the quoted string "?z", not the
# variable ?z -- confirm this is intended before using the fixture.
query_regex = """
PREFIX ex:<https://example.org/>

SELECT ?x {
?x ?y ?z .
FILTER(regex("?z", "hi"))
}
"""

# Not referenced by any test visible in this module.
# NOTE(review): the first contains() argument is the quoted string "?z", not
# the variable ?z -- confirm this is intended before using the fixture.
query_contains = """
PREFIX ex:<https://example.org/>

SELECT ?x {
?x ?y ?z .
FILTER(contains("?z", "hi"))
}
"""


def test_values_to_left():
    """Check that the translator turns the slow query's algebra into the fast one's."""
    slow = _prepare_query(query_slow)
    fast = _prepare_query(query_fast)

    # Precondition: compare the algebras, which is what the translator rewrites
    # and what the final assertion checks. Comparing the two query wrapper
    # objects directly would hold for any pair of distinct objects unless the
    # query class defines its own equality, so it would prove nothing.
    assert slow.algebra != fast.algebra

    optimized = ValuesToTheLeftOfTheJoin.translate(slow)

    assert optimized.algebra == fast.algebra


def _prepare_query(str_or_query):
    """Parse ``str_or_query`` and translate it, with no base and no namespaces.

    :param str_or_query: the SPARQL text handed to ``parseQuery``.
    :returns: whatever ``translateQuery`` produces for the parse result.
    """
    return translateQuery(parseQuery(str_or_query), None, {})


# Run the VALUES-reordering test directly when this module is executed as a
# script.
if __name__ == "__main__":
    test_values_to_left()
Loading