@@ -286,6 +286,8 @@ def _gen_refs(metadata: sa.schema.MetaData) -> Iterator[Tuple[str, str, int, Dic
286
286
run = _get_table (metadata , "run" )
287
287
collection = _get_table (metadata , "collection" )
288
288
289
+ dataset_ids = set ()
290
+
289
291
tag_tables = [table for table in metadata .tables .values () if table .name .startswith ("dataset_tags_" )]
290
292
for table in tag_tables :
291
293
@@ -331,7 +333,29 @@ def _gen_refs(metadata: sa.schema.MetaData) -> Iterator[Tuple[str, str, int, Dic
331
333
dstype_name = row [col_dataset_type_name ]
332
334
dataset_id = row [col_dataset_id ]
333
335
dataId = dict ((col .name , row [col ]) for col in dim_cols )
336
+ dataset_ids .add (dataset_id )
337
+
338
+ yield run_name , dstype_name , dataset_id , dataId
334
339
340
+ # Also look at removed datasets that are only known to datastore.
341
+ removed_ids = set ()
342
+ for table_name in ("file_datastore_records" , "dataset_location_trash" ):
343
+ table = _get_table (metadata , table_name )
344
+ col_dataset_id = table .columns ["dataset_id" ]
345
+ sql = sa .select (col_dataset_id ).select_from (table )
346
+ _LOG .debug ("sql: %s" , sql )
347
+ result = metadata .bind .execute (sql )
348
+ for row in result :
349
+ dataset_id = row [col_dataset_id ]
350
+ if dataset_id not in dataset_ids :
351
+ removed_ids .add (dataset_id )
352
+ if removed_ids :
353
+ _LOG .debug ("found %s removed datasets" , len (removed_ids ))
354
+ # Run name and dataset type name can be anything that is non-raw.
355
+ run_name = ""
356
+ dstype_name = ""
357
+ dataId = {}
358
+ for dataset_id in removed_ids :
335
359
yield run_name , dstype_name , dataset_id , dataId
336
360
337
361
@@ -399,13 +423,13 @@ def _fill_uuid_column(table: sa.schema.Table, map_table: sa.schema.Table) -> Non
399
423
sql = table .update ().values (
400
424
id_uuid = sa .select ([map_table .columns .uuid ]).where (
401
425
map_table .columns .id == table .columns .id
402
- )
426
+ ). scalar_subquery ()
403
427
)
404
428
else :
405
429
sql = table .update ().values (
406
430
dataset_id_uuid = sa .select ([map_table .columns .uuid ]).where (
407
431
map_table .columns .id == table .columns .dataset_id
408
- )
432
+ ). scalar_subquery ()
409
433
)
410
434
op .get_bind ().execute (sql )
411
435
_LOG .debug ("Filled uuids in table %r" , table .name )
0 commit comments