52
52
PreparsedQuery ,
53
53
SqlAggregatorReport ,
54
54
SqlParsingAggregator ,
55
+ TableRename ,
56
+ TableSwap ,
55
57
)
56
58
from datahub .sql_parsing .sql_parsing_common import QueryType
57
59
from datahub .sql_parsing .sqlglot_lineage import (
@@ -116,6 +118,8 @@ class SnowflakeQueriesExtractorReport(Report):
116
118
audit_log_load_timer : PerfTimer = dataclasses .field (default_factory = PerfTimer )
117
119
sql_aggregator : Optional [SqlAggregatorReport ] = None
118
120
121
+ num_ddl_queries_dropped : int = 0
122
+
119
123
120
124
@dataclass
121
125
class SnowflakeQueriesSourceReport (SourceReport ):
@@ -225,7 +229,9 @@ def get_workunits_internal(
225
229
audit_log_file = self .local_temp_path / "audit_log.sqlite"
226
230
use_cached_audit_log = audit_log_file .exists ()
227
231
228
- queries : FileBackedList [Union [KnownLineageMapping , PreparsedQuery ]]
232
+ queries : FileBackedList [
233
+ Union [KnownLineageMapping , PreparsedQuery , TableRename , TableSwap ]
234
+ ]
229
235
if use_cached_audit_log :
230
236
logger .info ("Using cached audit log" )
231
237
shared_connection = ConnectionWrapper (audit_log_file )
@@ -235,7 +241,7 @@ def get_workunits_internal(
235
241
236
242
shared_connection = ConnectionWrapper (audit_log_file )
237
243
queries = FileBackedList (shared_connection )
238
- entry : Union [KnownLineageMapping , PreparsedQuery ]
244
+ entry : Union [KnownLineageMapping , PreparsedQuery , TableRename , TableSwap ]
239
245
240
246
with self .report .copy_history_fetch_timer :
241
247
for entry in self .fetch_copy_history ():
@@ -296,7 +302,7 @@ def fetch_copy_history(self) -> Iterable[KnownLineageMapping]:
296
302
297
303
def fetch_query_log (
298
304
self ,
299
- ) -> Iterable [PreparsedQuery ]:
305
+ ) -> Iterable [Union [ PreparsedQuery , TableRename , TableSwap ] ]:
300
306
query_log_query = _build_enriched_query_log_query (
301
307
start_time = self .config .window .start_time ,
302
308
end_time = self .config .window .end_time ,
@@ -324,12 +330,16 @@ def fetch_query_log(
324
330
exc = e ,
325
331
)
326
332
else :
327
- yield entry
333
+ if entry :
334
+ yield entry
328
335
329
- def _parse_audit_log_row (self , row : Dict [str , Any ]) -> PreparsedQuery :
336
+ def _parse_audit_log_row (
337
+ self , row : Dict [str , Any ]
338
+ ) -> Optional [Union [TableRename , TableSwap , PreparsedQuery ]]:
330
339
json_fields = {
331
340
"DIRECT_OBJECTS_ACCESSED" ,
332
341
"OBJECTS_MODIFIED" ,
342
+ "OBJECT_MODIFIED_BY_DDL" ,
333
343
}
334
344
335
345
res = {}
@@ -341,6 +351,17 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery:
341
351
342
352
direct_objects_accessed = res ["direct_objects_accessed" ]
343
353
objects_modified = res ["objects_modified" ]
354
+ object_modified_by_ddl = res ["object_modified_by_ddl" ]
355
+
356
+ if object_modified_by_ddl and not objects_modified :
357
+ ddl_entry : Optional [Union [TableRename , TableSwap ]] = None
358
+ with self .structured_reporter .report_exc (
359
+ "Error fetching ddl lineage from Snowflake"
360
+ ):
361
+ ddl_entry = self .parse_ddl_query (
362
+ res ["query_text" ], object_modified_by_ddl
363
+ )
364
+ return ddl_entry
344
365
345
366
upstreams = []
346
367
column_usage = {}
@@ -437,6 +458,45 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery:
437
458
)
438
459
return entry
439
460
461
+ def parse_ddl_query (
462
+ self , query : str , object_modified_by_ddl : dict
463
+ ) -> Optional [Union [TableRename , TableSwap ]]:
464
+ if object_modified_by_ddl [
465
+ "operationType"
466
+ ] == "ALTER" and object_modified_by_ddl ["properties" ].get ("swapTargetName" ):
467
+ urn1 = self .identifiers .gen_dataset_urn (
468
+ self .identifiers .get_dataset_identifier_from_qualified_name (
469
+ object_modified_by_ddl ["objectName" ]
470
+ )
471
+ )
472
+
473
+ urn2 = self .identifiers .gen_dataset_urn (
474
+ self .identifiers .get_dataset_identifier_from_qualified_name (
475
+ object_modified_by_ddl ["properties" ]["swapTargetName" ]["value" ]
476
+ )
477
+ )
478
+
479
+ return TableSwap (urn1 , urn2 , query )
480
+ elif object_modified_by_ddl [
481
+ "operationType"
482
+ ] == "RENAME_TABLE" and object_modified_by_ddl ["properties" ].get ("objectName" ):
483
+ original_un = self .identifiers .gen_dataset_urn (
484
+ self .identifiers .get_dataset_identifier_from_qualified_name (
485
+ object_modified_by_ddl ["objectName" ]
486
+ )
487
+ )
488
+
489
+ new_urn = self .identifiers .gen_dataset_urn (
490
+ self .identifiers .get_dataset_identifier_from_qualified_name (
491
+ object_modified_by_ddl ["properties" ]["objectName" ]["value" ]
492
+ )
493
+ )
494
+
495
+ return TableRename (original_un , new_urn , query )
496
+ else :
497
+ self .report .num_ddl_queries_dropped += 1
498
+ return None
499
+
440
500
    def close(self) -> None:
        """Release all resources registered on the extractor's exit stack
        (e.g. the shared audit-log connection)."""
        self._exit_stack.close()
442
502
@@ -542,6 +602,7 @@ def _build_enriched_query_log_query(
542
602
user_name,
543
603
direct_objects_accessed,
544
604
objects_modified,
605
+ object_modified_by_ddl
545
606
FROM
546
607
snowflake.account_usage.access_history
547
608
WHERE
@@ -563,8 +624,9 @@ def _build_enriched_query_log_query(
563
624
) as direct_objects_accessed,
564
625
-- TODO: Drop the columns.baseSources subfield.
565
626
FILTER(objects_modified, o -> o:objectDomain IN { SnowflakeQuery .ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER } ) as objects_modified,
627
+ case when object_modified_by_ddl:objectDomain IN { SnowflakeQuery .ACCESS_HISTORY_TABLE_VIEW_DOMAINS_FILTER } then object_modified_by_ddl else null end as object_modified_by_ddl
566
628
FROM raw_access_history
567
- WHERE ( array_size(direct_objects_accessed) > 0 or array_size(objects_modified) > 0 )
629
+ WHERE ( array_size(direct_objects_accessed) > 0 or array_size(objects_modified) > 0 or object_modified_by_ddl is not null )
568
630
)
569
631
, query_access_history AS (
570
632
SELECT
@@ -586,6 +648,7 @@ def _build_enriched_query_log_query(
586
648
q.role_name AS "ROLE_NAME",
587
649
a.direct_objects_accessed,
588
650
a.objects_modified,
651
+ a.object_modified_by_ddl
589
652
FROM deduplicated_queries q
590
653
JOIN filtered_access_history a USING (query_id)
591
654
)
0 commit comments