From 6cb2a7204748c16fb9011965626c7c1b208d4fc3 Mon Sep 17 00:00:00 2001 From: volokluev <3169433+volokluev@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:28:00 -0800 Subject: [PATCH 1/2] fix(inc984): scrub the correct bucket for sentry.user.ip (#6692) the `user.ip` field is sent in the `sentry_tags` dictionary which is prefixed with `sentry` automatically in the message processor. thus, a different attribute column has to be scrubbed --- snuba/manual_jobs/scrub_ips_from_eap_spans.py | 2 +- tests/manual_jobs/test_scrub_ips_from_eap_spans.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/snuba/manual_jobs/scrub_ips_from_eap_spans.py b/snuba/manual_jobs/scrub_ips_from_eap_spans.py index 9fd415f97d..88a43b4167 100644 --- a/snuba/manual_jobs/scrub_ips_from_eap_spans.py +++ b/snuba/manual_jobs/scrub_ips_from_eap_spans.py @@ -26,7 +26,7 @@ def _get_query(self, cluster_name: str | None) -> str: on_cluster = f"ON CLUSTER '{cluster_name}'" if cluster_name else "" return f"""ALTER TABLE eap_spans_2_local {on_cluster} -UPDATE `attr_str_1` = mapApply((k, v) -> (k, if(k = 'user.ip', 'scrubbed', v)), `attr_str_1`) +UPDATE `attr_str_14` = mapApply((k, v) -> (k, if(k = 'sentry.user.ip', 'scrubbed', v)), `attr_str_14`) WHERE organization_id IN [{organization_ids}] AND _sort_timestamp >= toDateTime('{start_datetime}') AND _sort_timestamp < toDateTime('{end_datetime}')""" diff --git a/tests/manual_jobs/test_scrub_ips_from_eap_spans.py b/tests/manual_jobs/test_scrub_ips_from_eap_spans.py index a3914b8961..019761f866 100644 --- a/tests/manual_jobs/test_scrub_ips_from_eap_spans.py +++ b/tests/manual_jobs/test_scrub_ips_from_eap_spans.py @@ -111,7 +111,7 @@ def test_generate_query() -> None: job._get_query(None) == """ALTER TABLE eap_spans_2_local -UPDATE `attr_str_1` = mapApply((k, v) -> (k, if(k = 'user.ip', 'scrubbed', v)), `attr_str_1`) +UPDATE `attr_str_14` = mapApply((k, v) -> (k, if(k = 'sentry.user.ip', 'scrubbed', v)), `attr_str_14`) WHERE 
organization_id IN [1,3,5,6] AND _sort_timestamp >= toDateTime('2024-12-01T00:00:00') AND _sort_timestamp < toDateTime('2024-12-10T00:00:00')""" @@ -175,6 +175,7 @@ def _gen_message( "transaction.method": "POST", "transaction.op": "http.server", "user": "ip:127.0.0.1", + "user.ip": _USER_IP, }, "span_id": "123456781234567D", "tags": { @@ -185,7 +186,6 @@ def _gen_message( "relay_protocol_version": "3", "relay_use_post_or_schedule": "True", "relay_use_post_or_schedule_rejected": "version", - "user.ip": _USER_IP, "spans_over_limit": "False", "server_name": "blah", "color": random.choice(["red", "green", "blue"]), @@ -219,12 +219,16 @@ def _generate_request( ) ), columns=[ - Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="user.ip")) + Column( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.user.ip") + ) ], order_by=[ TraceItemTableRequest.OrderBy( column=Column( - key=AttributeKey(type=AttributeKey.TYPE_STRING, name="user.ip") + key=AttributeKey( + type=AttributeKey.TYPE_STRING, name="sentry.user.ip" + ) ) ) ], @@ -235,7 +239,7 @@ def _generate_expected_response(ip: str) -> TraceItemTableResponse: return TraceItemTableResponse( column_values=[ TraceItemColumnValues( - attribute_name="user.ip", + attribute_name="sentry.user.ip", results=[AttributeValue(val_str=ip) for _ in range(20)], ) ], From 4535ce01c4dc9928354698bd19beb6fe1d5101c5 Mon Sep 17 00:00:00 2001 From: xurui-c <159840875+xurui-c@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:05:12 -0800 Subject: [PATCH 2/2] fix(inc984): scrub the correct bucket for sentry.user (#6693) the `user` field is sent in the sentry_tags dictionary which is prefixed with sentry automatically in the message processor.
thus, a different attribute column has to be scrubbed Co-authored-by: Rachel Chen --- snuba/manual_jobs/scrub_users_from_eap_spans.py | 2 +- tests/manual_jobs/test_scrub_users_from_eap_spans.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/snuba/manual_jobs/scrub_users_from_eap_spans.py b/snuba/manual_jobs/scrub_users_from_eap_spans.py index 043c85af54..89989a67a5 100644 --- a/snuba/manual_jobs/scrub_users_from_eap_spans.py +++ b/snuba/manual_jobs/scrub_users_from_eap_spans.py @@ -29,7 +29,7 @@ def _get_query(self, cluster_name: str | None) -> str: on_cluster = f"ON CLUSTER '{cluster_name}'" if cluster_name else "" return f"""ALTER TABLE eap_spans_2_local {on_cluster} -UPDATE `attr_str_2` = mapApply((k, v) -> (k, if(k = 'user' AND startsWith(v, '{_IP_PREFIX}') AND (isIPv4String(substring(v, 4)) OR isIPv6String(substring(v, 4))), 'ip:scrubbed', v)), `attr_str_2`) +UPDATE `attr_str_11` = mapApply((k, v) -> (k, if(k = 'sentry.user' AND startsWith(v, '{_IP_PREFIX}') AND (isIPv4String(substring(v, 4)) OR isIPv6String(substring(v, 4))), 'ip:scrubbed', v)), `attr_str_11`) WHERE organization_id IN [{organization_ids}] AND _sort_timestamp >= toDateTime('{start_datetime}') AND _sort_timestamp < toDateTime('{end_datetime}')""" diff --git a/tests/manual_jobs/test_scrub_users_from_eap_spans.py b/tests/manual_jobs/test_scrub_users_from_eap_spans.py index e5cf6c104d..fc40acab0b 100644 --- a/tests/manual_jobs/test_scrub_users_from_eap_spans.py +++ b/tests/manual_jobs/test_scrub_users_from_eap_spans.py @@ -111,7 +111,7 @@ def test_generate_query() -> None: job._get_query(None) == """ALTER TABLE eap_spans_2_local -UPDATE `attr_str_2` = mapApply((k, v) -> (k, if(k = 'user' AND startsWith(v, 'ip:') AND (isIPv4String(substring(v, 4)) OR isIPv6String(substring(v, 4))), 'ip:scrubbed', v)), `attr_str_2`) +UPDATE `attr_str_11` = mapApply((k, v) -> (k, if(k = 'sentry.user' AND startsWith(v, 'ip:') AND (isIPv4String(substring(v, 4)) OR 
isIPv6String(substring(v, 4))), 'ip:scrubbed', v)), `attr_str_11`) WHERE organization_id IN [1,3,5,6] AND _sort_timestamp >= toDateTime('2024-12-01T00:00:00') AND _sort_timestamp < toDateTime('2024-12-10T00:00:00')""" @@ -185,8 +185,6 @@ def _gen_message( "relay_protocol_version": "3", "relay_use_post_or_schedule": "True", "relay_use_post_or_schedule_rejected": "version", - "user.ip": "192.168.0.45", - "user": user, "spans_over_limit": "False", "server_name": "blah", "color": random.choice(["red", "green", "blue"]), @@ -219,11 +217,13 @@ def _generate_request( key=AttributeKey(type=AttributeKey.TYPE_STRING, name="color") ) ), - columns=[Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="user"))], + columns=[ + Column(key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.user")) + ], order_by=[ TraceItemTableRequest.OrderBy( column=Column( - key=AttributeKey(type=AttributeKey.TYPE_STRING, name="user") + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.user") ) ) ], @@ -234,7 +234,7 @@ def _generate_expected_response(user: str) -> TraceItemTableResponse: return TraceItemTableResponse( column_values=[ TraceItemColumnValues( - attribute_name="user", + attribute_name="sentry.user", results=[AttributeValue(val_str=user) for _ in range(20)], ) ],