Skip to content
This repository has been archived by the owner on May 17, 2024. It is now read-only.

Evolve dbt data diff output #857

Merged
merged 47 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
34e037d
first draft
Jan 24, 2024
21b1131
style fixes by ruff
sungchun12 Jan 24, 2024
a7b575d
past tense consistency
Jan 24, 2024
ce9c743
working draft of new table
Jan 24, 2024
8925474
style fixes by ruff
sungchun12 Jan 24, 2024
5e9b55d
dbt diffs work, cloud broken for now
Jan 24, 2024
ff99881
remove cached git repos
Jan 24, 2024
8e79b92
efficient naming
Jan 24, 2024
b84fa26
add type changed count
Jan 24, 2024
e222316
reorder for priority on prod changes
Jan 24, 2024
47e4b0e
tabulate value diffs
Jan 24, 2024
9e89281
style fixes by ruff
sungchun12 Jan 24, 2024
2d3398e
less horizontal space needed
Jan 24, 2024
52949d4
leo's feedback
Jan 25, 2024
6d1d265
center align values
Jan 25, 2024
3f23314
consistent formatting
Jan 25, 2024
887ff7e
shorter name same meaning
Jan 25, 2024
076d0e2
row counts and diff values working
Jan 25, 2024
c6157b3
deps impacts works now
Jan 25, 2024
4cff08f
default val
Jan 25, 2024
4c65ceb
more readable
Jan 25, 2024
1c246ce
add primary key used
Jan 26, 2024
9154995
add model specific CI configs
Jan 26, 2024
0afe71c
consistency
Jan 26, 2024
026b4be
conditional headers
Jan 26, 2024
e880caf
style fixes by ruff
sungchun12 Jan 26, 2024
9e4acb0
cleaner implementation
Jan 29, 2024
e5009db
more cleaning
Jan 29, 2024
99fe5d0
consistent format
Jan 29, 2024
a51cff7
fix unchanged calc
Jan 29, 2024
9d9f53d
remove prints
Jan 29, 2024
7677d1a
default value
Jan 29, 2024
eabe47a
draft up tests
Jan 29, 2024
c5d2f80
a couple more tests
Jan 29, 2024
f52c3a3
new version
Jan 29, 2024
edfbf4f
passing tests
Jan 29, 2024
6134ade
style fixes by ruff
sungchun12 Jan 29, 2024
8e9223d
util unit test
Jan 30, 2024
41dcb27
add unit tests
Jan 30, 2024
99f88da
test the templates
Jan 30, 2024
163b3f2
fix type hints
Jan 30, 2024
a7cb6a5
real test no mocking
Jan 31, 2024
248678f
update tests with all the new outputs
Jan 31, 2024
ecadd5a
add more validations for mock
Jan 31, 2024
fc3f341
fix json bug
Jan 31, 2024
4563beb
Merge branch 'master' into evolve-dbt-output
dlawin Feb 6, 2024
6057923
Merge branch 'master' into evolve-dbt-output
sungchun12 Feb 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions data_diff/cloud/datafold_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,18 +144,22 @@ class TSummaryResultSchemaStats(pydantic.BaseModel):
exclusive_columns: Tuple[List[str], List[str]]


class TSummaryResultDependencyDetails(pydantic.BaseModel):
deps: Dict[str, List[Dict]]


class TCloudApiDataDiffSummaryResult(pydantic.BaseModel):
status: str
pks: Optional[TSummaryResultPrimaryKeyStats]
values: Optional[TSummaryResultValueStats]
schema_: Optional[TSummaryResultSchemaStats]
dependencies: Optional[Dict[str, Any]]
deps: Optional[TSummaryResultDependencyDetails]

@classmethod
def from_orm(cls, obj: Any) -> Self:
pks = TSummaryResultPrimaryKeyStats(**obj["pks"]) if "pks" in obj else None
values = TSummaryResultValueStats(**obj["values"]) if "values" in obj else None
deps = obj["deps"] if "deps" in obj else None
deps = TSummaryResultDependencyDetails(**obj["dependencies"]) if "dependencies" in obj else None
schema = TSummaryResultSchemaStats(**obj["schema"]) if "schema" in obj else None
return cls(
status=obj["status"],
Expand Down
68 changes: 52 additions & 16 deletions data_diff/dbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,23 @@ def _local_diff(
k for k, v in table2_columns.items() if k in table1_columns and v.data_type != table1_columns[k].data_type
}

if columns_added:
diff_output_str += columns_added_template(columns_added)
diff_output_str += f"Primary Keys: {diff_vars.primary_keys} \n"

if diff_vars.where_filter:
diff_output_str += f"Where Filter: '{str(diff_vars.where_filter)}' \n"

if diff_vars.include_columns:
diff_output_str += f"Included Columns: {diff_vars.include_columns} \n"

if diff_vars.exclude_columns:
diff_output_str += f"Excluded Columns: {diff_vars.exclude_columns} \n"

if columns_removed:
diff_output_str += columns_removed_template(columns_removed)

if columns_added:
diff_output_str += columns_added_template(columns_added)

if columns_type_changed:
diff_output_str += columns_type_changed_template(columns_type_changed)
column_set = column_set.difference(columns_type_changed)
Expand Down Expand Up @@ -455,32 +466,57 @@ def _cloud_diff(
rows_removed_count = diff_results.pks.exclusives[0]

rows_updated = diff_results.values.rows_with_differences
total_rows = diff_results.values.total_rows
rows_unchanged = int(total_rows) - int(rows_updated)
total_rows_table1 = diff_results.pks.total_rows[0]
total_rows_table2 = diff_results.pks.total_rows[1]
total_rows_diff = total_rows_table2 - total_rows_table1

rows_unchanged = int(total_rows_table1) - int(rows_updated) - int(rows_removed_count)
sungchun12 marked this conversation as resolved.
Show resolved Hide resolved
diff_percent_list = {
x.column_name: str(x.match) + "%" for x in diff_results.values.columns_diff_stats if x.match != 100.0
x.column_name: f"{str(round(100.00 - x.match, 2))}%"
for x in diff_results.values.columns_diff_stats
if x.match != 100.0
}
columns_added = diff_results.schema_.exclusive_columns[1]
columns_removed = diff_results.schema_.exclusive_columns[0]
columns_added = set(diff_results.schema_.exclusive_columns[1])
columns_removed = set(diff_results.schema_.exclusive_columns[0])
column_type_changes = diff_results.schema_.column_type_differs

if columns_added:
diff_output_str += columns_added_template(columns_added)
diff_output_str += f"Primary Keys: {diff_vars.primary_keys} \n"
if diff_vars.where_filter:
diff_output_str += f"Where Filter: '{str(diff_vars.where_filter)}' \n"

if diff_vars.include_columns:
diff_output_str += f"Included Columns: {diff_vars.include_columns} \n"

if diff_vars.exclude_columns:
diff_output_str += f"Excluded Columns: {diff_vars.exclude_columns} \n"

if columns_removed:
diff_output_str += columns_removed_template(columns_removed)

if columns_added:
diff_output_str += columns_added_template(columns_added)

if column_type_changes:
diff_output_str += columns_type_changed_template(column_type_changes)

deps_impacts = {
key: len(value) + sum(len(item.get("BiHtSync", [])) for item in value) if key == "hightouch" else len(value)
for key, value in diff_results.deps.deps.items()
}

if any([rows_added_count, rows_removed_count, rows_updated]):
diff_output = dbt_diff_string_template(
rows_added_count,
rows_removed_count,
rows_updated,
str(rows_unchanged),
diff_percent_list,
"Value Match Percent:",
total_rows_table1=total_rows_table1,
total_rows_table2=total_rows_table2,
total_rows_diff=total_rows_diff,
rows_added=rows_added_count,
rows_removed=rows_removed_count,
rows_updated=rows_updated,
rows_unchanged=str(rows_unchanged),
deps_impacts=deps_impacts,
is_cloud=True,
extra_info_dict=diff_percent_list,
extra_info_str="Value Changed:",
)
diff_output_str += f"\n{diff_url}\n {diff_output} \n"
rich.print(diff_output_str)
Expand Down Expand Up @@ -524,7 +560,7 @@ def _cloud_diff(


def _diff_output_base(dev_path: str, prod_path: str) -> str:
return f"\n[green]{prod_path} <> {dev_path}[/] \n"
return f"\n[blue]{prod_path}[/] <> [green]{dev_path}[/] \n"


def _initialize_events(dbt_user_id: Optional[str], dbt_version: Optional[str], dbt_project_id: Optional[str]) -> None:
Expand Down
17 changes: 11 additions & 6 deletions data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,19 @@ def _get_stats(self, is_dbt: bool = False) -> DiffStats:
def get_stats_string(self, is_dbt: bool = False):
diff_stats = self._get_stats(is_dbt)

total_rows_diff = diff_stats.table2_count - diff_stats.table1_count

if is_dbt:
string_output = dbt_diff_string_template(
diff_stats.diff_by_sign["+"],
diff_stats.diff_by_sign["-"],
diff_stats.diff_by_sign["!"],
diff_stats.unchanged,
diff_stats.extra_column_diffs,
"Values Updated:",
total_rows_table1=diff_stats.table1_count,
total_rows_table2=diff_stats.table2_count,
total_rows_diff=total_rows_diff,
rows_added=diff_stats.diff_by_sign["+"],
rows_removed=diff_stats.diff_by_sign["-"],
rows_updated=diff_stats.diff_by_sign["!"],
rows_unchanged=diff_stats.unchanged,
extra_info_dict=diff_stats.extra_column_diffs,
extra_info_str="[u]Values Changed[/u]",
)

else:
Expand Down
66 changes: 53 additions & 13 deletions data_diff/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,19 +459,59 @@ def __repr__(self) -> str:


def dbt_diff_string_template(
rows_added: str, rows_removed: str, rows_updated: str, rows_unchanged: str, extra_info_dict: Dict, extra_info_str
total_rows_table1: int,
total_rows_table2: int,
total_rows_diff: str,
rows_added: str,
rows_removed: str,
rows_updated: str,
rows_unchanged: str,
extra_info_dict: Dict,
extra_info_str: str,
is_cloud: Optional[bool] = False,
deps_impacts: Optional[Dict] = None,
) -> str:
string_output = f"\n{tabulate([[rows_added, rows_removed]], headers=['Rows Added', 'Rows Removed'])}"
# main table
main_rows = [
["Total", total_rows_table1, "", f"{total_rows_table2} [{diff_int_dynamic_color_template(total_rows_diff)}]"],
["Added", "", diff_int_dynamic_color_template(rows_added), ""],
["Removed", "", diff_int_dynamic_color_template(-rows_removed), ""],
["Different", "", rows_updated, ""],
["Unchanged", "", rows_unchanged, ""],
]

main_headers = ["rows", "PROD", "<>", "DEV"]
main_table = tabulate(main_rows, headers=main_headers)

# diffs table
diffs_rows = sorted(list(extra_info_dict.items()))

diffs_headers = ["columns", "% diff values" if is_cloud else "# diff values"]
diffs_table = tabulate(diffs_rows, headers=diffs_headers)

# deps impacts table
deps_impacts_table = ""
if deps_impacts:
deps_impacts_rows = list(deps_impacts.items())
deps_impacts_headers = ["deps", "# data assets"]
deps_impacts_table = f"\n\n{tabulate(deps_impacts_rows, headers=deps_impacts_headers)}"

# combine all tables
string_output = f"\n{main_table}\n\n{diffs_table}{deps_impacts_table}"

string_output += f"\n\nUpdated Rows: {rows_updated}\n"
string_output += f"Unchanged Rows: {rows_unchanged}\n\n"
return string_output

string_output += extra_info_str

for k, v in extra_info_dict.items():
string_output += f"\n{k}: {v}"
def diff_int_dynamic_color_template(diff_value: int) -> str:
if not isinstance(diff_value, int):
return diff_value

return string_output
if diff_value > 0:
return f"[green]+{diff_value}[/]"
elif diff_value < 0:
return f"[red]{diff_value}[/]"
else:
return "0"


def _jsons_equiv(a: str, b: str):
Expand All @@ -498,18 +538,18 @@ def diffs_are_equiv_jsons(diff: list, json_cols: dict):
return match, overriden_diff_cols


def columns_removed_template(columns_removed) -> str:
columns_removed_str = f"Column(s) removed: {columns_removed}\n"
def columns_removed_template(columns_removed: set) -> str:
columns_removed_str = f"[red]Columns removed [-{len(columns_removed)}]:[/] [blue]{columns_removed}[/]\n"
return columns_removed_str


def columns_added_template(columns_added) -> str:
columns_added_str = f"Column(s) added: {columns_added}\n"
def columns_added_template(columns_added: set) -> str:
columns_added_str = f"[green]Columns added [+{len(columns_added)}]: {columns_added}[/]\n"
return columns_added_str


def columns_type_changed_template(columns_type_changed) -> str:
columns_type_changed_str = f"Type change: {columns_type_changed}\n"
columns_type_changed_str = f"Type changed [{len(columns_type_changed)}]: [green]{columns_type_changed}[/]\n"
return columns_type_changed_str


Expand Down
2 changes: 1 addition & 1 deletion data_diff/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.1"
__version__ = "0.11.0"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "data-diff"
version = "0.10.1"
version = "0.11.0"
description = "Command-line tool and Python library to efficiently diff rows across two different databases."
authors = ["Datafold <[email protected]>"]
license = "MIT"
Expand Down
83 changes: 73 additions & 10 deletions tests/test_dbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,70 @@ def test_integration_basic_dbt(self):
)

# assertions for the diff that exists in tests/dbt_artifacts/jaffle_shop.duckdb
orders_expected_output = """
jaffle_shop.prod.orders <> jaffle_shop.dev.orders
Primary Keys: ['order_id']

rows PROD <> DEV
--------- ------ ------------- ------------------
Total 10 20 [+10]
Added +10
Removed 0
Different 0
Unchanged 10

columns # diff values
-------------------- ---------------
amount 0
bank_transfer_amount 0
coupon_amount 0
credit_card_amount 0
customer_id 0
gift_card_amount 0
order_date 0
status 0
"""

stg_payments_expected_output = """
jaffle_shop.prod.stg_payments <> jaffle_shop.dev.stg_payments
Primary Keys: ['payment_id']
No row differences
"""

stg_customers_expected_output = """
jaffle_shop.prod.stg_customers <> jaffle_shop.dev.stg_customers
Primary Keys: ['customer_id']
No row differences
"""

stg_orders_expected_output = """
jaffle_shop.prod.stg_orders <> jaffle_shop.dev.stg_orders
Primary Keys: ['order_id']
No row differences
"""

customers_expected_output = """
jaffle_shop.prod.customers <> jaffle_shop.dev.customers
Primary Keys: ['customer_id']
No row differences
"""

expected_outputs = [
orders_expected_output,
stg_payments_expected_output,
stg_customers_expected_output,
stg_orders_expected_output,
customers_expected_output,
]

if test_project_path == artifacts_path:
diff_string = b"".join(diff).decode("utf-8")
# 5 diffs were ran
assert diff_string.count("<>") == 5
# 4 with no diffs
assert diff_string.count("No row differences") == 4
# 1 with a diff
assert diff_string.count(" Rows Added Rows Removed") == 1
actual_output_stripped = b"".join(diff).decode("utf-8").strip().replace(" ", "")

for expected_output in expected_outputs:
expected_output_stripped = "".join(line.strip() for line in expected_output.split("\n")).replace(
" ", ""
)
assert expected_output_stripped in actual_output_stripped

@unittest.skipIf(
not os.environ.get("MOTHERDUCK_TOKEN"),
Expand All @@ -61,12 +117,19 @@ def test_integration_motherduck_dbt(self):
# assertions for the diff that exists in tests/dbt_artifacts/jaffle_shop.duckdb
if test_project_path == artifacts_path:
diff_string = b"".join(diff).decode("utf-8")
# 5 diffs were ran
assert diff_string.count("<>") == 5
# 4 with no diffs
assert diff_string.count("No row differences") == 4
# 1 with a diff
assert diff_string.count(" Rows Added Rows Removed") == 1
assert diff_string.count("PROD") == 1
assert diff_string.count("DEV") == 1
assert diff_string.count("Primary Keys") == 5
assert diff_string.count("Where Filter") == 0
assert diff_string.count("Type Changed") == 0
assert diff_string.count("Total") == 1
assert diff_string.count("Added") == 1
assert diff_string.count("Removed") == 1
assert diff_string.count("Different") == 1
assert diff_string.count("Unchanged") == 1

def test_integration_cloud_dbt(self):
project_dir = os.environ.get("DATA_DIFF_DBT_PROJ")
Expand Down
Loading
Loading