Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(mapping-optimizer): Support in operator for mapping optimizer #5691

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ jobs:
tests/sentry/search/events \
tests/sentry/event_manager \
tests/sentry/api/endpoints/test_organization_profiling_functions.py \
tests/snuba/api/endpoints/test_organization_events_stats_mep.py \
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A test in here was failing in the last attempt

tests/snuba/test_snql_snuba.py \
tests/snuba/test_metrics_layer.py \
-vv --cov . --cov-report="xml:.artifacts/snuba.coverage.xml"
Expand Down
160 changes: 125 additions & 35 deletions snuba/query/processors/physical/mapping_optimizer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from enum import Enum
from typing import Optional, Tuple
from typing import Optional, Tuple, cast

from snuba import environment
from snuba.clickhouse.query import Query
Expand All @@ -13,6 +13,7 @@
)
from snuba.query.conditions import (
BooleanFunctions,
ConditionFunctions,
combine_and_conditions,
combine_or_conditions,
get_first_level_and_conditions,
Expand Down Expand Up @@ -90,9 +91,8 @@ def __init__(
self.__killswitch = killswitch
self.__value_subcolumn_name = value_subcolumn_name

# TODO: Add the support for IN conditions.
self.__optimizable_pattern = FunctionCall(
function_name=String("equals"),
self.__equals_condition_pattern = FunctionCall(
function_name=String(ConditionFunctions.EQ),
parameters=(
Or(
[
Expand All @@ -106,6 +106,19 @@ def __init__(
Param("right_hand_side", Literal(Any(str))),
),
)
self.__in_condition_pattern = FunctionCall(
function_name=String(ConditionFunctions.IN),
parameters=(
mapping_pattern,
Param(
"right_hand_side",
FunctionCall(
Or([String("array"), String("tuple")]),
all_parameters=Literal(),
),
),
),
)
self.__tag_exists_patterns = [
FunctionCall(
function_name=String("notEquals"),
Expand Down Expand Up @@ -155,20 +168,43 @@ def __classify_combined_conditions(self, condition: Expression) -> ConditionClas

def __classify_condition(self, condition: Expression) -> ConditionClass:
# Expects this to be an individual condition
match = self.__optimizable_pattern.match(condition)
equals_condition_match = self.__equals_condition_pattern.match(condition)
in_condition_match = (
self.__in_condition_pattern.match(condition)
if equals_condition_match is None
else None
)
if (
match is not None
and match.string(KEY_COL_MAPPING_PARAM) == f"{self.__column_name}.key"
equals_condition_match is not None
and equals_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
rhs = match.expression("right_hand_side")
rhs = equals_condition_match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
return (
ConditionClass.NOT_OPTIMIZABLE
# ifNull(tags[asd], '') = '' is not optimizable.
if rhs.value == ""
else ConditionClass.OPTIMIZABLE
)
elif match is None:
if (
in_condition_match is not None
and in_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
rhs = in_condition_match.expression("right_hand_side")
assert isinstance(rhs, FunctionExpr)

params = rhs.parameters
for param in params:
assert isinstance(param, LiteralExpr)

# tags[foo] IN array('') is not optimizable
if param.value == "":
return ConditionClass.NOT_OPTIMIZABLE
Comment on lines +202 to +204
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was the cause of the test failure. Due to the way we query for tags, even if foo is not a tag on the row, it has a value of '' and won't exist in the tags hash map.

To handle this, we just mark these cases are not optimizable. and run the query as is.


return ConditionClass.OPTIMIZABLE
elif equals_condition_match is None and in_condition_match is None:
# If this condition is not matching an optimizable condition,
# check that it does not reference the optimizable column.
# If it does, it means we should not optimize this query.
Expand All @@ -183,35 +219,89 @@ def __classify_condition(self, condition: Expression) -> ConditionClass:
return ConditionClass.IRRELEVANT

def __replace_with_hash(self, condition: Expression) -> Expression:
match = self.__optimizable_pattern.match(condition)
equals_condition_match = self.__equals_condition_pattern.match(condition)
in_condition_match = (
self.__in_condition_pattern.match(condition)
if equals_condition_match is None
else None
)
if (
match is None
or match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key"
equals_condition_match is not None
and equals_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
return condition
rhs = match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
key = match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

return FunctionExpr(
alias=condition.alias,
function_name="has",
parameters=(
Column(
alias=None,
table_name=match.optional_string(TABLE_MAPPING_PARAM),
column_name=self.__hash_map_name,
rhs = equals_condition_match.expression("right_hand_side")
assert isinstance(rhs, LiteralExpr)
key = equals_condition_match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

return FunctionExpr(
alias=condition.alias,
function_name="has",
parameters=(
Column(
alias=None,
table_name=equals_condition_match.optional_string(
TABLE_MAPPING_PARAM
),
column_name=self.__hash_map_name,
),
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
),
),
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(LiteralExpr(None, f"{key}={rhs.value}"),),
)
elif (
in_condition_match is not None
and in_condition_match.string(KEY_COL_MAPPING_PARAM)
== f"{self.__column_name}.key"
):
key = in_condition_match.scalar(KEY_MAPPING_PARAM)
assert isinstance(key, (str, int))
if isinstance(key, str):
key = key.translate(ESCAPE_TRANSLATION)

rhs = in_condition_match.expression("right_hand_side")
assert isinstance(rhs, FunctionExpr)
for param in rhs.parameters:
assert isinstance(param, LiteralExpr)
params = cast(list[LiteralExpr], rhs.parameters)

return FunctionExpr(
alias=condition.alias,
function_name="hasAny",
parameters=(
Column(
alias=None,
table_name=in_condition_match.optional_string(
TABLE_MAPPING_PARAM
),
column_name=self.__hash_map_name,
),
FunctionExpr(
alias=rhs.alias,
function_name="array",
parameters=tuple(
[
FunctionExpr(
alias=None,
function_name="cityHash64",
parameters=(
LiteralExpr(None, f"{key}={lit.value}"),
),
)
for lit in params
]
),
),
),
),
)
)

return condition

def _get_condition_without_redundant_checks(
self, condition: Expression, query: Query
Expand Down Expand Up @@ -265,7 +355,7 @@ def _get_condition_without_redundant_checks(
if tag_exist_match:
matched_tag_exists_conditions[condition_id] = tag_exist_match
if not tag_exist_match:
eq_match = self.__optimizable_pattern.match(cond)
eq_match = self.__equals_condition_pattern.match(cond)
if eq_match:
tag_eq_match_keys.add(eq_match.scalar(KEY_MAPPING_PARAM))
useful_conditions = []
Expand Down
102 changes: 102 additions & 0 deletions tests/query/processors/test_mapping_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,108 @@
nested_condition("tags", "my_tag", ConditionFunctions.EQ, "a"),
id="Non optimizable having",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"tuple",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, "c"),
),
),
),
),
FunctionCall(
None,
"hasAny",
(
column("_tags_hash_map", True),
FunctionCall(
None,
"array",
(
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)),
),
),
),
),
id="Optimizable IN condition using tuple",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, "c"),
),
),
),
),
FunctionCall(
None,
"hasAny",
(
column("_tags_hash_map", True),
FunctionCall(
None,
"array",
(
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)),
FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)),
),
),
),
),
id="Optimizable IN condition using array",
),
pytest.param(
build_query(
selected_columns=[column("event_id")],
condition=binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, ""),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new test with an empty value in the list to capture this failure case

),
),
),
),
binary_condition(
ConditionFunctions.IN,
nested_expression("tags", "my_tag"),
FunctionCall(
None,
"array",
(
Literal(None, "a"),
Literal(None, "b"),
Literal(None, ""),
),
),
),
id="Non optimizable IN condition with empty value",
),
]


Expand Down
Loading