feat(mapping-optimizer): Support in operator for mapping optimizer #5685

Merged
156 changes: 121 additions & 35 deletions snuba/query/processors/physical/mapping_optimizer.py
@@ -1,6 +1,6 @@
import logging
from enum import Enum
from typing import Optional, Tuple
from typing import Optional, Tuple, cast

from snuba import environment
from snuba.clickhouse.query import Query
@@ -13,6 +13,7 @@
)
from snuba.query.conditions import (
BooleanFunctions,
ConditionFunctions,
combine_and_conditions,
combine_or_conditions,
get_first_level_and_conditions,
@@ -90,9 +91,8 @@ def __init__(
self.__killswitch = killswitch
self.__value_subcolumn_name = value_subcolumn_name

# TODO: Add the support for IN conditions.
self.__optimizable_pattern = FunctionCall(
function_name=String("equals"),
self.__equals_condition_pattern = FunctionCall(
function_name=String(ConditionFunctions.EQ),
parameters=(
Or(
[
@@ -106,6 +106,19 @@
Param("right_hand_side", Literal(Any(str))),
),
)
self.__in_condition_pattern = FunctionCall(
function_name=String(ConditionFunctions.IN),
parameters=(
mapping_pattern,
Param(
"right_hand_side",
FunctionCall(
Or([String("array"), String("tuple")]),
all_parameters=Literal(),
),
),
),
)
self.__tag_exists_patterns = [
FunctionCall(
function_name=String("notEquals"),
@@ -155,20 +168,39 @@ def __classify_combined_conditions(self, condition: Expression) -> ConditionClass:

def __classify_condition(self, condition: Expression) -> ConditionClass:
# Expects this to be an individual condition
match = self.__optimizable_pattern.match(condition)
equals_condition_match = self.__equals_condition_pattern.match(condition)
in_condition_match = (
self.__in_condition_pattern.match(condition)
if equals_condition_match is None
else None
)
if (
match is not None
and match.string(KEY_COL_MAPPING_PARAM) == f"{self.__column_name}.key"
equals_condition_match is not None
and equals_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
rhs = match.expression("right_hand_side")
rhs = equals_condition_match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
return (
ConditionClass.NOT_OPTIMIZABLE
# ifNull(tags[asd], '') = '' is not optimizable.
if rhs.value == ""
else ConditionClass.OPTIMIZABLE
)
elif match is None:
if (
in_condition_match is not None
and in_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
rhs = in_condition_match.expression("right_hand_side")
assert isinstance(rhs, FunctionExpr)

params = rhs.parameters
for param in params:
assert isinstance(param, LiteralExpr)

return ConditionClass.OPTIMIZABLE
elif equals_condition_match is None and in_condition_match is None:
# If this condition is not matching an optimizable condition,
# check that it does not reference the optimizable column.
# If it does, it means we should not optimize this query.
@@ -183,35 +215,89 @@ def __classify_condition(self, condition: Expression) -> ConditionClass:
return ConditionClass.IRRELEVANT

def __replace_with_hash(self, condition: Expression) -> Expression:
match = self.__optimizable_pattern.match(condition)
equals_condition_match = self.__equals_condition_pattern.match(condition)
in_condition_match = (
self.__in_condition_pattern.match(condition)
if equals_condition_match is None
else None
)
if (
match is None
or match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key"
equals_condition_match is not None
and equals_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
return condition
rhs = match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
key = match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

return FunctionExpr(
alias=condition.alias,
function_name="has",
parameters=(
Column(
alias=None,
table_name=match.optional_string(TABLE_MAPPING_PARAM),
column_name=self.__hash_map_name,
rhs = equals_condition_match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
key = equals_condition_match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

return FunctionExpr(
alias=condition.alias,
function_name="has",
parameters=(
Column(
alias=None,
table_name=equals_condition_match.optional_string(
TABLE_MAPPING_PARAM
),
column_name=self.__hash_map_name,
),
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
),
),
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
)
elif (
in_condition_match is not None
and in_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
key = in_condition_match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

rhs = in_condition_match.expression("right_hand_side")
assert isinstance(rhs, FunctionExpr)
for param in rhs.parameters:
assert isinstance(param, LiteralExpr)
params = cast(list[LiteralExpr], rhs.parameters)

return FunctionExpr(
alias=condition.alias,
function_name="hasAny",
parameters=(
Column(
alias=None,
table_name=in_condition_match.optional_string(
TABLE_MAPPING_PARAM
),
column_name=self.__hash_map_name,
),
FunctionExpr(
alias=rhs.alias,
function_name="array",
parameters=tuple(
[
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(
LiteralExpr(None, f"{key}={lit.value}"),
),
)
for lit in params
]
),
),
),
),
)
)

return condition

def _get_condition_without_redundant_checks(
self, condition: Expression, query: Query
@@ -265,7 +351,7 @@ def _get_condition_without_redundant_checks(
if tag_exist_match:
matched_tag_exists_conditions[condition_id] = tag_exist_match
if not tag_exist_match:
eq_match = self.__optimizable_pattern.match(cond)
eq_match = self.__equals_condition_pattern.match(cond)
Member Author:

@volokluev would you happen to know if I need to implement this removal of redundant checks for IN conditions?

Member:

You could, but it's not strictly necessary. I'm not sure how often we get those cases with IN conditions. Definitely something that can be added later.

Member Author:

I think this is less common on the older datasets, but it is more likely to happen with the spans dataset, since sentry_tags contains some more commonly used columns.

The example I ran into was with environment. For a 24-hour period, the query read more than 48 GiB of data, and after applying this optimization it read less than 24 GiB. On 7-day periods, the query was already timing out, so this optimization should already be helpful.

Member:

Oh yes, absolutely. Merge the PR. I was saying that the redundant-clause optimization is probably not going to be as applicable to IN clauses.

if eq_match:
tag_eq_match_keys.add(eq_match.scalar(KEY_MAPPING_PARAM))
useful_conditions = []
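The thread above is about why this rewrite pays off: Snuba materializes a hash of every "key=value" tag pair into the _tags_hash_map column, so an equality check can be answered with has(...) over those hashes, and this change extends the same trick to IN conditions via hasAny(...). Below is a minimal, self-contained sketch of the idea in plain Python; it is illustrative only, with Python's built-in hash and a set standing in for ClickHouse's cityHash64 and the materialized array column, and none of the names are Snuba code.

# Illustrative sketch only, not Snuba code: Python's built-in hash() and a set
# stand in for ClickHouse's cityHash64 and the materialized _tags_hash_map
# array column.

def build_tags_hash_map(tags: dict[str, str]) -> set[int]:
    # Precomputed at write time: one hash per "key=value" pair.
    return {hash(f"{key}={value}") for key, value in tags.items()}

def tag_value_in(tags_hash_map: set[int], key: str, values: list[str]) -> bool:
    # Rough analogue of hasAny(_tags_hash_map, [cityHash64('key=v1'), ...]):
    # membership is decided from the precomputed hashes alone, so the raw tag
    # values never have to be read.
    return any(hash(f"{key}={value}") in tags_hash_map for value in values)

row_hashes = build_tags_hash_map({"environment": "prod", "release": "1.2.3"})
assert tag_value_in(row_hashes, "environment", ["prod", "canary"])
assert not tag_value_in(row_hashes, "environment", ["staging"])

The existing equals branch is the single-value case of the same pattern (has with one cityHash64 value); the new IN branch generalizes it to a list of values, which is why the test cases below expect hasAny over an array of cityHash64 calls.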
70 changes: 70 additions & 0 deletions tests/query/processors/test_mapping_optimizer.py
@@ -278,6 +278,76 @@
nested_condition("tags", "my_tag", ConditionFunctions.EQ, "a"),
id="Non optimizable having",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"tuple",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, "c"),
),
),
),
),
FunctionCall(
None,
"hasAny",
(
column("_tags_hash_map", True),
FunctionCall(
None,
"array",
(
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)),
),
),
),
),
id="Optimizable IN condition using tuple",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, "c"),
),
),
),
),
FunctionCall(
None,
"hasAny",
(
column("_tags_hash_map", True),
FunctionCall(
None,
"array",
(
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)),
),
),
),
),
id="Optimizable IN condition using array",
),
]

