From 1e1bfc427e054c6516edb5898659ee1600bd648c Mon Sep 17 00:00:00 2001 From: Josh Devlin Date: Tue, 19 Sep 2023 03:44:37 +1000 Subject: [PATCH] Redact values from logs due 'duplicate key' error (#773) * Add failing test * Fix model test naming * Add redaction looping logic * Apply suggestions from code review Co-authored-by: Doug Beatty <44704949+dbeatty10@users.noreply.github.com> * Add changelog * Fix test case * Rename test class * Colocate redaction tests * Ignore if dbt run passes or fails * Materialize as a table instead of a view to trigger an error * Expect the run to fail with a specific error message * Reverse order of dict, assert that sensitive data is replaced * Add newline --------- Co-authored-by: Doug Beatty <44704949+dbeatty10@users.noreply.github.com> Co-authored-by: Mike Alfare <13974384+mikealfare@users.noreply.github.com> (cherry picked from commit 9fa8de24332c14afd5aa505c94535fbd7753ec48) --- .../unreleased/Features-20230915-091507.yaml | 6 ++++ dbt/adapters/snowflake/connections.py | 11 +++++-- .../test_duplicate_key_not_in_exceptions.py | 33 +++++++++++++++++++ .../test_row_values_not_in_exceptions.py | 1 + 4 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 .changes/unreleased/Features-20230915-091507.yaml create mode 100644 tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py rename tests/functional/{row_values_unlogged => redact_log_values}/test_row_values_not_in_exceptions.py (94%) diff --git a/.changes/unreleased/Features-20230915-091507.yaml b/.changes/unreleased/Features-20230915-091507.yaml new file mode 100644 index 000000000..7a5fc3a54 --- /dev/null +++ b/.changes/unreleased/Features-20230915-091507.yaml @@ -0,0 +1,6 @@ +kind: Features +body: Redact cases where raw data can be leaked logs +time: 2023-09-15T09:15:07.430443+10:00 +custom: + Author: jaypeedevlin + Issue: "772" diff --git a/dbt/adapters/snowflake/connections.py b/dbt/adapters/snowflake/connections.py index 1078b9f73..cf2253f09 100644 --- a/dbt/adapters/snowflake/connections.py +++ b/dbt/adapters/snowflake/connections.py @@ -48,7 +48,11 @@ logger = AdapterLogger("Snowflake") _TOKEN_REQUEST_URL = "https://{}.snowflakecomputing.com/oauth/token-request" -ROW_VALUE_REGEX = re.compile(r"Row Values: \[(.|\n)*\]") + +ERROR_REDACTION_PATTERNS = { + re.compile(r"Row Values: \[(.|\n)*\]"): "Row Values: [redacted]", + re.compile(r"Duplicate field key '(.|\n)*'"): "Duplicate field key '[redacted]'", +} @dataclass @@ -271,13 +275,14 @@ def exception_handler(self, sql): try: yield except snowflake.connector.errors.ProgrammingError as e: - unscrubbed_msg = str(e) + msg = str(e) # A class of Snowflake errors -- such as a failure from attempting to merge # duplicate rows -- includes row values in the error message, i.e. # [12345, "col_a_value", "col_b_value", etc...]. We don't want to log potentially # sensitive user data. - msg = re.sub(ROW_VALUE_REGEX, "Row Values: [redacted]", unscrubbed_msg) + for regex_pattern, replacement_message in ERROR_REDACTION_PATTERNS.items(): + msg = re.sub(regex_pattern, replacement_message, msg) logger.debug("Snowflake query id: {}".format(e.sfqid)) logger.debug("Snowflake error: {}".format(msg)) diff --git a/tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py b/tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py new file mode 100644 index 000000000..e32f08c15 --- /dev/null +++ b/tests/functional/redact_log_values/test_duplicate_key_not_in_exceptions.py @@ -0,0 +1,33 @@ +import pytest + +from dbt.tests.util import ( + run_dbt, +) + +_MODELS__view = """ +{{ config( + materialized='table', +) }} + +with dupes as ( + select 'foo' as key, 1 as value + union all + select 'foo' as key, 2 as value +) + +select + object_agg(key, value) as agg +from dupes +""" + + +class TestDuplicateKeyNotInExceptions: + @pytest.fixture(scope="class") + def models(self): + return {"model.sql": _MODELS__view} + + def test_row_values_were_scrubbed_from_duplicate_merge_exception(self, project): + result = run_dbt(["run", "-s", "model"], expect_pass=False) + assert len(result) == 1 + assert "Duplicate field key '[redacted]'" in result[0].message + assert "'foo'" not in result[0].message diff --git a/tests/functional/row_values_unlogged/test_row_values_not_in_exceptions.py b/tests/functional/redact_log_values/test_row_values_not_in_exceptions.py similarity index 94% rename from tests/functional/row_values_unlogged/test_row_values_not_in_exceptions.py rename to tests/functional/redact_log_values/test_row_values_not_in_exceptions.py index 409b18ae3..ec4fe2348 100644 --- a/tests/functional/row_values_unlogged/test_row_values_not_in_exceptions.py +++ b/tests/functional/redact_log_values/test_row_values_not_in_exceptions.py @@ -31,3 +31,4 @@ def test_row_values_were_scrubbed_from_duplicate_merge_exception(self, project): result = run_dbt(["run", "-s", "model"], expect_pass=False) assert len(result) == 1 assert "Row Values: [redacted]" in result[0].message + assert "'one'" not in result[0].message