|
142 | 142 |
|
143 | 143 | @dataclass
|
144 | 144 | class DBTSourceReport(StaleEntityRemovalSourceReport):
|
145 |
| - sql_statements_parsed: int = 0 |
146 |
| - sql_statements_table_error: int = 0 |
147 |
| - sql_statements_column_error: int = 0 |
148 |
| - sql_parser_detach_ctes_failures: LossyList[str] = field(default_factory=LossyList) |
149 | 145 | sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
|
| 146 | + sql_parser_parse_failures: int = 0 |
| 147 | + sql_parser_detach_ctes_failures: int = 0 |
| 148 | + sql_parser_table_errors: int = 0 |
| 149 | + sql_parser_column_errors: int = 0 |
| 150 | + sql_parser_successes: int = 0 |
| 151 | + |
| 152 | + sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList) |
| 153 | + sql_parser_detach_ctes_failures_list: LossyList[str] = field( |
| 154 | + default_factory=LossyList |
| 155 | + ) |
150 | 156 |
|
151 | 157 | in_manifest_but_missing_catalog: LossyList[str] = field(default_factory=LossyList)
|
152 | 158 |
|
def get_fake_ephemeral_table_name(self) -> str:
    """Return the synthetic, lowercase FQN that stands in for this ephemeral model.

    Ephemeral dbt models have no real table; lineage code needs a stable
    placeholder name for them instead.
    """
    assert self.is_ephemeral_model()

    # Mirrors the construction in get_db_fqn, but with a marker table name.
    parts = [self.database, self.schema, f"__datahub__dbt__ephemeral__{self.name}"]
    fake_fqn = self._join_parts(parts).lower()
    # Strip quoting characters so the name is comparable across dialects.
    return fake_fqn.replace('"', "")
565 | 572 |
|
566 | 573 | def get_urn_for_upstream_lineage(
|
567 | 574 | self,
|
@@ -819,9 +826,10 @@ def get_column_type(
|
819 | 826 |
|
820 | 827 | # if still not found, report the warning
|
821 | 828 | if TypeClass is None:
|
822 |
| - report.report_warning( |
823 |
| - dataset_name, f"unable to map type {column_type} to metadata schema" |
824 |
| - ) |
| 829 | + if column_type: |
| 830 | + report.report_warning( |
| 831 | + dataset_name, f"unable to map type {column_type} to metadata schema" |
| 832 | + ) |
825 | 833 | TypeClass = NullTypeClass
|
826 | 834 |
|
827 | 835 | return SchemaFieldDataType(type=TypeClass())
|
@@ -1041,15 +1049,16 @@ def _infer_schemas_and_update_cll( # noqa: C901
|
1041 | 1049 |
|
1042 | 1050 | # Iterate over the dbt nodes in topological order.
|
1043 | 1051 | # This ensures that we process upstream nodes before downstream nodes.
|
1044 |
| - for dbt_name in topological_sort( |
| 1052 | + node_order = topological_sort( |
1045 | 1053 | list(all_nodes_map.keys()),
|
1046 | 1054 | edges=list(
|
1047 | 1055 | (upstream, node.dbt_name)
|
1048 | 1056 | for node in all_nodes_map.values()
|
1049 | 1057 | for upstream in node.upstream_nodes
|
1050 | 1058 | if upstream in all_nodes_map
|
1051 | 1059 | ),
|
1052 |
| - ): |
| 1060 | + ) |
| 1061 | + for dbt_name in node_order: |
1053 | 1062 | node = all_nodes_map[dbt_name]
|
1054 | 1063 | logger.debug(f"Processing CLL/schemas for {node.dbt_name}")
|
1055 | 1064 |
|
@@ -1119,55 +1128,26 @@ def _infer_schemas_and_update_cll( # noqa: C901
|
1119 | 1128 |
|
1120 | 1129 | # Run sql parser to infer the schema + generate column lineage.
|
1121 | 1130 | sql_result = None
|
1122 |
| - if node.node_type in {"source", "test"}: |
| 1131 | + if node.node_type in {"source", "test", "seed"}: |
1123 | 1132 | # For sources, we generate CLL as a 1:1 mapping.
|
1124 |
| - # We don't support CLL for tests (assertions). |
| 1133 | + # We don't support CLL for tests (assertions) or seeds. |
1125 | 1134 | pass
|
1126 | 1135 | elif node.compiled_code:
|
1127 |
| - try: |
1128 |
| - # Add CTE stops based on the upstreams list. |
1129 |
| - cte_mapping = { |
1130 |
| - cte_name: upstream_node.get_fake_ephemeral_table_name() |
1131 |
| - for upstream_node in [ |
1132 |
| - all_nodes_map[upstream_node_name] |
1133 |
| - for upstream_node_name in node.upstream_nodes |
1134 |
| - if upstream_node_name in all_nodes_map |
1135 |
| - ] |
1136 |
| - if upstream_node.is_ephemeral_model() |
1137 |
| - for cte_name in _get_dbt_cte_names( |
1138 |
| - upstream_node.name, schema_resolver.platform |
1139 |
| - ) |
1140 |
| - } |
1141 |
| - preprocessed_sql = detach_ctes( |
1142 |
| - parse_statements_and_pick( |
1143 |
| - node.compiled_code, |
1144 |
| - platform=schema_resolver.platform, |
1145 |
| - ), |
1146 |
| - platform=schema_resolver.platform, |
1147 |
| - cte_mapping=cte_mapping, |
1148 |
| - ) |
1149 |
| - except Exception as e: |
1150 |
| - self.report.sql_parser_detach_ctes_failures.append(node.dbt_name) |
1151 |
| - logger.debug( |
1152 |
| - f"Failed to detach CTEs from compiled code. {node.dbt_name} will not have column lineage." |
1153 |
| - ) |
1154 |
| - sql_result = SqlParsingResult.make_from_error(e) |
1155 |
| - else: |
1156 |
| - sql_result = sqlglot_lineage( |
1157 |
| - preprocessed_sql, schema_resolver=schema_resolver |
| 1136 | + # Add CTE stops based on the upstreams list. |
| 1137 | + cte_mapping = { |
| 1138 | + cte_name: upstream_node.get_fake_ephemeral_table_name() |
| 1139 | + for upstream_node in [ |
| 1140 | + all_nodes_map[upstream_node_name] |
| 1141 | + for upstream_node_name in node.upstream_nodes |
| 1142 | + if upstream_node_name in all_nodes_map |
| 1143 | + ] |
| 1144 | + if upstream_node.is_ephemeral_model() |
| 1145 | + for cte_name in _get_dbt_cte_names( |
| 1146 | + upstream_node.name, schema_resolver.platform |
1158 | 1147 | )
|
1159 |
| - if sql_result.debug_info.error: |
1160 |
| - self.report.sql_statements_table_error += 1 |
1161 |
| - logger.info( |
1162 |
| - f"Failed to parse compiled code for {node.dbt_name}: {sql_result.debug_info.error}" |
1163 |
| - ) |
1164 |
| - elif sql_result.debug_info.column_error: |
1165 |
| - self.report.sql_statements_column_error += 1 |
1166 |
| - logger.info( |
1167 |
| - f"Failed to generate CLL for {node.dbt_name}: {sql_result.debug_info.column_error}" |
1168 |
| - ) |
1169 |
| - else: |
1170 |
| - self.report.sql_statements_parsed += 1 |
| 1148 | + } |
| 1149 | + |
| 1150 | + sql_result = self._parse_cll(node, cte_mapping, schema_resolver) |
1171 | 1151 | else:
|
1172 | 1152 | self.report.sql_parser_skipped_missing_code.append(node.dbt_name)
|
1173 | 1153 |
|
@@ -1212,6 +1192,56 @@ def _infer_schemas_and_update_cll( # noqa: C901
|
1212 | 1192 | if inferred_schema_fields:
|
1213 | 1193 | node.columns_setdefault(inferred_schema_fields)
|
1214 | 1194 |
|
def _parse_cll(
    self,
    node: DBTNode,
    cte_mapping: Dict[str, str],
    schema_resolver: SchemaResolver,
) -> SqlParsingResult:
    """Parse a node's compiled SQL and derive column-level lineage.

    Runs three stages — statement picking, CTE detachment, and sqlglot
    lineage — bumping the matching success/failure counters on
    ``self.report`` for each. Never raises: stage failures are converted
    into an error-carrying :class:`SqlParsingResult`.
    """
    assert node.compiled_code is not None

    # Stage 1: split the compiled code into statements and pick the main one.
    try:
        statement = parse_statements_and_pick(
            node.compiled_code,
            platform=schema_resolver.platform,
        )
    except Exception as e:
        logger.debug(
            f"Failed to parse compiled code. {node.dbt_name} will not have column lineage."
        )
        self.report.sql_parser_parse_failures += 1
        self.report.sql_parser_parse_failures_list.append(node.dbt_name)
        return SqlParsingResult.make_from_error(e)

    # Stage 2: swap references to ephemeral-model CTEs for their fake table names.
    try:
        rewritten_sql = detach_ctes(
            statement,
            platform=schema_resolver.platform,
            cte_mapping=cte_mapping,
        )
    except Exception as e:
        self.report.sql_parser_detach_ctes_failures += 1
        self.report.sql_parser_detach_ctes_failures_list.append(node.dbt_name)
        logger.debug(
            f"Failed to detach CTEs from compiled code. {node.dbt_name} will not have column lineage."
        )
        return SqlParsingResult.make_from_error(e)

    # Stage 3: run the lineage parser and bucket the outcome into report counters.
    result = sqlglot_lineage(rewritten_sql, schema_resolver=schema_resolver)
    debug = result.debug_info
    if debug.table_error:
        # Table-level failure means no lineage at all was produced.
        self.report.sql_parser_table_errors += 1
        logger.info(
            f"Failed to generate any CLL lineage for {node.dbt_name}: {debug.error}"
        )
    elif debug.column_error:
        # Table lineage worked but column-level lineage did not.
        self.report.sql_parser_column_errors += 1
        logger.info(
            f"Failed to generate CLL for {node.dbt_name}: {debug.column_error}"
        )
    else:
        self.report.sql_parser_successes += 1
    return result
| 1244 | + |
1215 | 1245 | def create_dbt_platform_mces(
|
1216 | 1246 | self,
|
1217 | 1247 | dbt_nodes: List[DBTNode],
|
|
0 commit comments