Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 3b0b7cc

Browse files
authored
Merge branch 'master' into 859
2 parents f4df144 + cac665b commit 3b0b7cc

File tree

14 files changed

+353
-62
lines changed

14 files changed

+353
-62
lines changed

data_diff/cloud/datafold_api.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,18 +144,22 @@ class TSummaryResultSchemaStats(pydantic.BaseModel):
144144
exclusive_columns: Tuple[List[str], List[str]]
145145

146146

147+
class TSummaryResultDependencyDetails(pydantic.BaseModel):
148+
deps: Dict[str, List[Dict]]
149+
150+
147151
class TCloudApiDataDiffSummaryResult(pydantic.BaseModel):
148152
status: str
149153
pks: Optional[TSummaryResultPrimaryKeyStats]
150154
values: Optional[TSummaryResultValueStats]
151155
schema_: Optional[TSummaryResultSchemaStats]
152-
dependencies: Optional[Dict[str, Any]]
156+
deps: Optional[TSummaryResultDependencyDetails]
153157

154158
@classmethod
155159
def from_orm(cls, obj: Any) -> Self:
156160
pks = TSummaryResultPrimaryKeyStats(**obj["pks"]) if "pks" in obj else None
157161
values = TSummaryResultValueStats(**obj["values"]) if "values" in obj else None
158-
deps = obj["deps"] if "deps" in obj else None
162+
deps = TSummaryResultDependencyDetails(**obj["dependencies"]) if "dependencies" in obj else None
159163
schema = TSummaryResultSchemaStats(**obj["schema"]) if "schema" in obj else None
160164
return cls(
161165
status=obj["status"],

data_diff/databases/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,6 +1046,7 @@ def query_table_schema(self, path: DbPath) -> Dict[str, RawColumnInfo]:
10461046
accessing the schema using a SQL query.
10471047
"""
10481048
rows = self.query(self.select_table_schema(path), list, log_message=path)
1049+
10491050
if not rows:
10501051
raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
10511052

@@ -1060,6 +1061,7 @@ def query_table_schema(self, path: DbPath) -> Dict[str, RawColumnInfo]:
10601061
)
10611062
for r in rows
10621063
}
1064+
10631065
assert len(d) == len(rows)
10641066
return d
10651067

data_diff/databases/duckdb.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,12 +167,16 @@ def select_table_schema(self, path: DbPath) -> str:
167167
database, schema, table = self._normalize_table_path(path)
168168

169169
info_schema_path = ["information_schema", "columns"]
170+
170171
if database:
171172
info_schema_path.insert(0, database)
173+
dynamic_database_clause = f"'{database}'"
174+
else:
175+
dynamic_database_clause = "current_catalog()"
172176

173177
return (
174178
f"SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale FROM {'.'.join(info_schema_path)} "
175-
f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
179+
f"WHERE table_name = '{table}' AND table_schema = '{schema}' and table_catalog = {dynamic_database_clause}"
176180
)
177181

178182
def _normalize_table_path(self, path: DbPath) -> DbPath:

data_diff/dbt.py

Lines changed: 57 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -287,12 +287,23 @@ def _local_diff(
287287
k for k, v in table2_columns.items() if k in table1_columns and v.data_type != table1_columns[k].data_type
288288
}
289289

290-
if columns_added:
291-
diff_output_str += columns_added_template(columns_added)
290+
diff_output_str += f"Primary Keys: {diff_vars.primary_keys} \n"
291+
292+
if diff_vars.where_filter:
293+
diff_output_str += f"Where Filter: '{str(diff_vars.where_filter)}' \n"
294+
295+
if diff_vars.include_columns:
296+
diff_output_str += f"Included Columns: {diff_vars.include_columns} \n"
297+
298+
if diff_vars.exclude_columns:
299+
diff_output_str += f"Excluded Columns: {diff_vars.exclude_columns} \n"
292300

293301
if columns_removed:
294302
diff_output_str += columns_removed_template(columns_removed)
295303

304+
if columns_added:
305+
diff_output_str += columns_added_template(columns_added)
306+
296307
if columns_type_changed:
297308
diff_output_str += columns_type_changed_template(columns_type_changed)
298309
column_set = column_set.difference(columns_type_changed)
@@ -330,13 +341,14 @@ def _local_diff(
330341
return
331342

332343
dataset1_columns = [
333-
(name, type_, table1.database.dialect.parse_type(table1.table_path, name, type_, *other))
334-
for (name, type_, *other) in table1_columns.values()
344+
(info.column_name, info.data_type, table1.database.dialect.parse_type(table1.table_path, info))
345+
for info in table1_columns.values()
335346
]
336347
dataset2_columns = [
337-
(name, type_, table2.database.dialect.parse_type(table2.table_path, name, type_, *other))
338-
for (name, type_, *other) in table2_columns.values()
348+
(info.column_name, info.data_type, table2.database.dialect.parse_type(table2.table_path, info))
349+
for info in table2_columns.values()
339350
]
351+
340352
print(
341353
json.dumps(
342354
jsonify(
@@ -436,32 +448,57 @@ def _cloud_diff(
436448
rows_removed_count = diff_results.pks.exclusives[0]
437449

438450
rows_updated = diff_results.values.rows_with_differences
439-
total_rows = diff_results.values.total_rows
440-
rows_unchanged = int(total_rows) - int(rows_updated)
451+
total_rows_table1 = diff_results.pks.total_rows[0]
452+
total_rows_table2 = diff_results.pks.total_rows[1]
453+
total_rows_diff = total_rows_table2 - total_rows_table1
454+
455+
rows_unchanged = int(total_rows_table1) - int(rows_updated) - int(rows_removed_count)
441456
diff_percent_list = {
442-
x.column_name: str(x.match) + "%" for x in diff_results.values.columns_diff_stats if x.match != 100.0
457+
x.column_name: f"{str(round(100.00 - x.match, 2))}%"
458+
for x in diff_results.values.columns_diff_stats
459+
if x.match != 100.0
443460
}
444-
columns_added = diff_results.schema_.exclusive_columns[1]
445-
columns_removed = diff_results.schema_.exclusive_columns[0]
461+
columns_added = set(diff_results.schema_.exclusive_columns[1])
462+
columns_removed = set(diff_results.schema_.exclusive_columns[0])
446463
column_type_changes = diff_results.schema_.column_type_differs
447464

448-
if columns_added:
449-
diff_output_str += columns_added_template(columns_added)
465+
diff_output_str += f"Primary Keys: {diff_vars.primary_keys} \n"
466+
if diff_vars.where_filter:
467+
diff_output_str += f"Where Filter: '{str(diff_vars.where_filter)}' \n"
468+
469+
if diff_vars.include_columns:
470+
diff_output_str += f"Included Columns: {diff_vars.include_columns} \n"
471+
472+
if diff_vars.exclude_columns:
473+
diff_output_str += f"Excluded Columns: {diff_vars.exclude_columns} \n"
450474

451475
if columns_removed:
452476
diff_output_str += columns_removed_template(columns_removed)
453477

478+
if columns_added:
479+
diff_output_str += columns_added_template(columns_added)
480+
454481
if column_type_changes:
455482
diff_output_str += columns_type_changed_template(column_type_changes)
456483

484+
deps_impacts = {
485+
key: len(value) + sum(len(item.get("BiHtSync", [])) for item in value) if key == "hightouch" else len(value)
486+
for key, value in diff_results.deps.deps.items()
487+
}
488+
457489
if any([rows_added_count, rows_removed_count, rows_updated]):
458490
diff_output = dbt_diff_string_template(
459-
rows_added_count,
460-
rows_removed_count,
461-
rows_updated,
462-
str(rows_unchanged),
463-
diff_percent_list,
464-
"Value Match Percent:",
491+
total_rows_table1=total_rows_table1,
492+
total_rows_table2=total_rows_table2,
493+
total_rows_diff=total_rows_diff,
494+
rows_added=rows_added_count,
495+
rows_removed=rows_removed_count,
496+
rows_updated=rows_updated,
497+
rows_unchanged=str(rows_unchanged),
498+
deps_impacts=deps_impacts,
499+
is_cloud=True,
500+
extra_info_dict=diff_percent_list,
501+
extra_info_str="Value Changed:",
465502
)
466503
diff_output_str += f"\n{diff_url}\n {diff_output} \n"
467504
rich.print(diff_output_str)
@@ -505,7 +542,7 @@ def _cloud_diff(
505542

506543

507544
def _diff_output_base(dev_path: str, prod_path: str) -> str:
    """Build the header line that names the two relations being compared.

    The production path is wrapped in ``[blue]`` markup and the development
    path in ``[green]`` markup, joined by ``<>``; the result starts and ends
    with a newline.
    """
    header = f"\n[blue]{prod_path}[/] <> [green]{dev_path}[/] \n"
    return header
509546

510547

511548
def _initialize_events(dbt_user_id: Optional[str], dbt_version: Optional[str], dbt_project_id: Optional[str]) -> None:

data_diff/diff_tables.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,14 +138,19 @@ def _get_stats(self, is_dbt: bool = False) -> DiffStats:
138138
def get_stats_string(self, is_dbt: bool = False):
139139
diff_stats = self._get_stats(is_dbt)
140140

141+
total_rows_diff = diff_stats.table2_count - diff_stats.table1_count
142+
141143
if is_dbt:
142144
string_output = dbt_diff_string_template(
143-
diff_stats.diff_by_sign["+"],
144-
diff_stats.diff_by_sign["-"],
145-
diff_stats.diff_by_sign["!"],
146-
diff_stats.unchanged,
147-
diff_stats.extra_column_diffs,
148-
"Values Updated:",
145+
total_rows_table1=diff_stats.table1_count,
146+
total_rows_table2=diff_stats.table2_count,
147+
total_rows_diff=total_rows_diff,
148+
rows_added=diff_stats.diff_by_sign["+"],
149+
rows_removed=diff_stats.diff_by_sign["-"],
150+
rows_updated=diff_stats.diff_by_sign["!"],
151+
rows_unchanged=diff_stats.unchanged,
152+
extra_info_dict=diff_stats.extra_column_diffs,
153+
extra_info_str="[u]Values Changed[/u]",
149154
)
150155

151156
else:

data_diff/utils.py

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -459,19 +459,59 @@ def __repr__(self) -> str:
459459

460460

461461
def dbt_diff_string_template(
    total_rows_table1: int,
    total_rows_table2: int,
    total_rows_diff: int,
    rows_added: int,
    rows_removed: int,
    rows_updated: int,
    rows_unchanged: int,
    extra_info_dict: Dict,
    extra_info_str: str,
    is_cloud: Optional[bool] = False,
    deps_impacts: Optional[Dict] = None,
) -> str:
    """Render the dbt diff summary as a series of text tables.

    Args:
        total_rows_table1: Row count shown in the "PROD" column.
        total_rows_table2: Row count shown in the "DEV" column.
        total_rows_diff: Signed row-count delta shown in brackets next to the
            "DEV" total.
        rows_added: Number of added rows (shown as a coloured positive delta).
        rows_removed: Number of removed rows (shown as a coloured negative
            delta).
        rows_updated: Number of rows reported as "Different".
        rows_unchanged: Number of rows reported as "Unchanged".
        extra_info_dict: Per-column values for the second table; its items
            are sorted before rendering.
        extra_info_str: Accepted so existing callers keep working; it is not
            used in the rendered output.
        is_cloud: Selects the second table's value header: "% diff values"
            when truthy, "# diff values" otherwise.
        deps_impacts: Optional mapping rendered as a third "deps" table; the
            table is omitted when this is ``None`` or empty.

    Returns:
        The main table, the per-column table and (optionally) the deps table,
        separated by blank lines and prefixed with a newline.
    """
    # diff_int_dynamic_color_template passes non-int values through untouched,
    # so only negate real integers here; unary minus on anything else (e.g. a
    # pre-formatted string) would raise TypeError before the helper is reached.
    removed_delta = -rows_removed if isinstance(rows_removed, int) else rows_removed

    # main table
    main_rows = [
        ["Total", total_rows_table1, "", f"{total_rows_table2} [{diff_int_dynamic_color_template(total_rows_diff)}]"],
        ["Added", "", diff_int_dynamic_color_template(rows_added), ""],
        ["Removed", "", diff_int_dynamic_color_template(removed_delta), ""],
        ["Different", "", rows_updated, ""],
        ["Unchanged", "", rows_unchanged, ""],
    ]
    main_headers = ["rows", "PROD", "<>", "DEV"]
    main_table = tabulate(main_rows, headers=main_headers)

    # diffs table (sorted() already returns a list, no extra list() needed)
    diffs_rows = sorted(extra_info_dict.items())
    diffs_headers = ["columns", "% diff values" if is_cloud else "# diff values"]
    diffs_table = tabulate(diffs_rows, headers=diffs_headers)

    # deps impacts table
    deps_impacts_table = ""
    if deps_impacts:
        deps_impacts_rows = list(deps_impacts.items())
        deps_impacts_headers = ["deps", "# data assets"]
        deps_impacts_table = f"\n\n{tabulate(deps_impacts_rows, headers=deps_impacts_headers)}"

    # combine all tables
    return f"\n{main_table}\n\n{diffs_table}{deps_impacts_table}"
468503

469-
string_output += extra_info_str
470504

471-
for k, v in extra_info_dict.items():
472-
string_output += f"\n{k}: {v}"
505+
def diff_int_dynamic_color_template(diff_value: int) -> str:
    """Format a signed integer delta with colour markup.

    Positive values become ``[green]+N[/]``, negative values ``[red]N[/]``
    and zero the plain string ``"0"``. Anything that is not an ``int`` is
    returned unchanged.
    """
    if isinstance(diff_value, int):
        if diff_value == 0:
            return "0"
        if diff_value > 0:
            return f"[green]+{diff_value}[/]"
        return f"[red]{diff_value}[/]"

    # Non-integer input is handed back as-is.
    return diff_value
475515

476516

477517
def _jsons_equiv(a: str, b: str):
@@ -498,18 +538,18 @@ def diffs_are_equiv_jsons(diff: list, json_cols: dict):
498538
return match, overriden_diff_cols
499539

500540

501-
def columns_removed_template(columns_removed: set) -> str:
    """Return the "Columns removed" line: a red label with the count, then the columns in blue."""
    return f"[red]Columns removed [-{len(columns_removed)}]:[/] [blue]{columns_removed}[/]\n"
504544

505545

506-
def columns_added_template(columns_added: set) -> str:
    """Return the "Columns added" line: the count and the columns, all wrapped in green markup."""
    return f"[green]Columns added [+{len(columns_added)}]: {columns_added}[/]\n"
509549

510550

511551
def columns_type_changed_template(columns_type_changed) -> str:
    """Return the "Type changed" line: a plain label with the count, then the columns in green."""
    return f"Type changed [{len(columns_type_changed)}]: [green]{columns_type_changed}[/]\n"
514554

515555

data_diff/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.10.1"
1+
__version__ = "0.11.0"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "data-diff"
3-
version = "0.10.1"
3+
version = "0.11.0"
44
description = "Command-line tool and Python library to efficiently diff rows across two different databases."
55
authors = ["Datafold <[email protected]>"]
66
license = "MIT"
0 Bytes
Binary file not shown.

tests/dbt_artifacts/target/manifest.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)