Skip to content

Commit

Permalink
Redact values from logs due to 'duplicate key' error (#773)
Browse files Browse the repository at this point in the history
* Add failing test

* Fix model test naming

* Add redaction looping logic

* Apply suggestions from code review

Co-authored-by: Doug Beatty <[email protected]>

* Add changelog

* Fix test case

* Rename test class

* Colocate redaction tests

* Ignore if dbt run passes or fails

* Materialize as a table instead of a view to trigger an error

* Expect the run to fail with a specific error message

* Reverse order of dict, assert that sensitive data is replaced

* Add newline

---------

Co-authored-by: Doug Beatty <[email protected]>
Co-authored-by: Mike Alfare <[email protected]>
(cherry picked from commit 9fa8de2)
  • Loading branch information
jaypeedevlin authored and dataders committed Sep 19, 2023
1 parent 57a71c0 commit 1e1bfc4
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 3 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20230915-091507.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Redact cases where raw data can be leaked in logs
time: 2023-09-15T09:15:07.430443+10:00
custom:
Author: jaypeedevlin
Issue: "772"
11 changes: 8 additions & 3 deletions dbt/adapters/snowflake/connections.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,11 @@

# Module-level logger, constructed with the name "Snowflake".
logger = AdapterLogger("Snowflake")
# URL template for the OAuth token-request endpoint. The "{}" placeholder is
# presumably filled with the account identifier -- confirm against the caller.
_TOKEN_REQUEST_URL = "https://{}.snowflakecomputing.com/oauth/token-request"
# Matches a "Row Values: [...]" span of an error message. "(.|\n)*" allows the
# bracketed list to span several lines, and the greedy repeat extends the match
# to the last "]" in the text.
ROW_VALUE_REGEX = re.compile(r"Row Values: \[(.|\n)*\]")

# Compiled patterns for error-message fragments that can echo raw user data,
# each mapped to the placeholder text substituted for a match.
#
# ``re.DOTALL`` lets ``.`` match newlines, so values that span several lines are
# covered without a repeated capturing alternation such as ``(.|\n)*``.
# The repeat is greedy: a match extends to the last closing delimiter in the
# message, so a value that itself contains "]" or "'" is not partially leaked.
ERROR_REDACTION_PATTERNS = {
    re.compile(r"Row Values: \[.*\]", re.DOTALL): "Row Values: [redacted]",
    re.compile(r"Duplicate field key '.*'", re.DOTALL): "Duplicate field key '[redacted]'",
}


@dataclass
Expand Down Expand Up @@ -271,13 +275,14 @@ def exception_handler(self, sql):
try:
yield
except snowflake.connector.errors.ProgrammingError as e:
unscrubbed_msg = str(e)
msg = str(e)

# A class of Snowflake errors -- such as a failure from attempting to merge
# duplicate rows -- includes row values in the error message, i.e.
# [12345, "col_a_value", "col_b_value", etc...]. We don't want to log potentially
# sensitive user data.
msg = re.sub(ROW_VALUE_REGEX, "Row Values: [redacted]", unscrubbed_msg)
for regex_pattern, replacement_message in ERROR_REDACTION_PATTERNS.items():
msg = re.sub(regex_pattern, replacement_message, msg)

logger.debug("Snowflake query id: {}".format(e.sfqid))
logger.debug("Snowflake error: {}".format(msg))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pytest

from dbt.tests.util import (
run_dbt,
)

# Model SQL used by the duplicate-key redaction test below. Two rows share the
# key 'foo' and are combined with object_agg; the test runs this model with
# expect_pass=False and checks the failure message for a redacted
# "Duplicate field key" fragment.
# NOTE: despite the "view" suffix in the name, the model is configured with
# materialized='table'.
_MODELS__view = """
{{ config(
materialized='table',
) }}
with dupes as (
select 'foo' as key, 1 as value
union all
select 'foo' as key, 2 as value
)
select
object_agg(key, value) as agg
from dupes
"""


class TestDuplicateKeyNotInExceptions:
    """Check that a duplicate-key failure does not expose the offending key."""

    @pytest.fixture(scope="class")
    def models(self):
        """Supply the single model that the test runs."""
        project_models = {}
        project_models["model.sql"] = _MODELS__view
        return project_models

    def test_row_values_were_scrubbed_from_duplicate_merge_exception(self, project):
        """Run the model expecting failure and inspect the reported message."""
        run_results = run_dbt(["run", "-s", "model"], expect_pass=False)
        assert len(run_results) == 1

        # The key must appear only in its redacted form.
        failure_message = run_results[0].message
        assert "Duplicate field key '[redacted]'" in failure_message
        assert "'foo'" not in failure_message
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ def test_row_values_were_scrubbed_from_duplicate_merge_exception(self, project):
result = run_dbt(["run", "-s", "model"], expect_pass=False)
assert len(result) == 1
assert "Row Values: [redacted]" in result[0].message
assert "'one'" not in result[0].message

0 comments on commit 1e1bfc4

Please sign in to comment.