Skip to content

Commit

Permalink
bugfix(parser): allow multiple or conditions in or and and clauses (
Browse files Browse the repository at this point in the history
#4769)

### Context

Even though `or(1, 1, 1, 1)` is valid snql, the clickhouse formatter chokes on it because there's an assumption in [other parts of the code](https://github.com/getsentry/snuba/blob/master/snuba/query/conditions.py#L237-L243) which assumes that ands and ors have two arguments. Breaking that assumption causes an infinite recursion error

### Solution

Rather than changing the matching infrastructure and making sure all the query processors using that assumption work afterwards, I just massaged the AST post-parsing to make sure that all `or` and `and` functions had at most two arguments
  • Loading branch information
volokluev authored Oct 5, 2023
1 parent 431fc7c commit 0ccfe59
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 1 deletion.
37 changes: 36 additions & 1 deletion snuba/query/snql/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,33 @@ def transform(exp: Expression) -> Expression:
query.transform_expressions(transform)


def _treeify_or_and_conditions(
query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:
"""
look for expressions like or(a, b, c) and turn them into or(a, or(b, c))
and(a, b, c) and turn them into and(a, and(b, c))
even though clickhouse sql supports arbitrary amount of arguments there are other parts of the
codebase which assume `or` and `and` have two arguments
Adding this post-process step is easier than changing the rest of the query pipeline
"""

def transform(exp: Expression) -> Expression:
if not isinstance(exp, FunctionCall):
return exp

if exp.function_name == "and":
return combine_and_conditions(exp.parameters)
elif exp.function_name == "or":
return combine_or_conditions(exp.parameters)
else:
return exp

query.transform_expressions(transform)


DATETIME_MATCH = FunctionCallMatch(
StringMatch("toDateTime"), (Param("date_string", LiteralMatch(AnyMatch(str))),)
)
Expand Down Expand Up @@ -1242,6 +1269,7 @@ def validate_lambda(exp: Lambda) -> None:
def _replace_time_condition(
query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:

condition = query.get_condition()
top_level = (
get_first_level_and_conditions(condition) if condition is not None else []
Expand Down Expand Up @@ -1433,8 +1461,8 @@ def _post_process(
# custom processors can be partials instead of functions but partials don't
# have the __name__ attribute set automatically (and we don't set it manually)
description = getattr(func, "__name__", "custom")

with sentry_sdk.start_span(op="processor", description=description):

if settings and settings.get_dry_run():
with explain_meta.with_query_differ("snql_parsing", description, query):
func(query)
Expand Down Expand Up @@ -1478,12 +1506,19 @@ def parse_snql_query(
custom_processing: Optional[CustomProcessors] = None,
settings: QuerySettings | None = None,
) -> Tuple[Union[CompositeQuery[QueryEntity], LogicalQuery], str]:

with sentry_sdk.start_span(op="parser", description="parse_snql_query_initial"):
query = parse_snql_query_initial(body)

if settings and settings.get_dry_run():
explain_meta.set_original_ast(str(query))

# NOTE (volo): The anonymizer that runs after this function call chokes on
# OR and AND clauses with multiple parameters so we have to treeify them
# before we run the anonymizer and the rest of the post processors
with sentry_sdk.start_span(op="processor", description="treeify_conditions"):
_post_process(query, [_treeify_or_and_conditions], settings)

with sentry_sdk.start_span(op="parser", description="anonymize_snql_query"):
snql_anonymized = format_snql_anonymized(query).get_sql()

Expand Down
28 changes: 28 additions & 0 deletions tests/query/parser/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,3 +1227,31 @@ def test_circular_aliases() -> None:
""",
get_dataset("events"),
)


def test_treeify() -> None:
query = """MATCH (replays)
SELECT replay_id BY replay_id
WHERE project_id IN array(4552673527463954) AND timestamp < toDateTime('2023-09-22T18:18:10.891157') AND timestamp >= toDateTime('2023-06-24T18:18:10.891157')
HAVING or(1, 1, 1, 1) != 0 LIMIT 10
"""
query_ast, _ = parse_snql_query(query, get_dataset("replays"))
having = query_ast.get_having()
expected = binary_condition(
ConditionFunctions.NEQ,
binary_condition(
BooleanFunctions.OR,
Literal(None, 1),
binary_condition(
BooleanFunctions.OR,
Literal(None, 1),
binary_condition(
BooleanFunctions.OR,
Literal(None, 1),
Literal(None, 1),
),
),
),
Literal(None, 0),
)
assert having == expected

0 comments on commit 0ccfe59

Please sign in to comment.