Skip to content

Commit

Permalink
fix: Move 'grouped_tables' into _retrieve_tables (#430)
Browse files (browse the repository at this point in the history)
Signed-off-by: xuans <[email protected]>
  • Loading branch information
xuan616 authored Dec 18, 2020
1 parent 7157c24 commit 26a0d0a
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 8 deletions.
1 change: 0 additions & 1 deletion databuilder/extractor/base_bigquery_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class BaseBigQueryExtractor(Extractor):
DEFAULT_PAGE_SIZE = 300
NUM_RETRIES = 3
DATE_LENGTH = 8
- SHARDED_TABLE_KEY_FORMAT = '{dataset_id}/{table_id}'

def init(self, conf: ConfigTree) -> None:
# should use key_path, or cred_key if the former doesn't exist
Expand Down
12 changes: 5 additions & 7 deletions databuilder/extractor/bigquery_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ class BigQueryMetadataExtractor(BaseBigQueryExtractor):

def init(self, conf: ConfigTree) -> None:
BaseBigQueryExtractor.init(self, conf)
- self.grouped_tables: Set[str] = set([])
self.iter = iter(self._iterate_over_tables())

def _retrieve_tables(self, dataset: DatasetRef) -> Any:
+ grouped_tables: Set[str] = set([])
+
for page in self._page_table_list_results(dataset):
if 'tables' not in page:
continue
Expand All @@ -47,16 +48,13 @@ def _retrieve_tables(self, dataset: DatasetRef) -> Any:
# If the last eight characters are digits, we assume the table is of a table date range type
# and then we only need one schema definition
table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
- table_id = table_prefix
- sharded_table_key = BigQueryMetadataExtractor.SHARDED_TABLE_KEY_FORMAT.format(
- dataset_id=tableRef['datasetId'],
- table_id=table_id)
- if sharded_table_key in self.grouped_tables:
+ if table_prefix in grouped_tables:
# If one table in the date range is processed, then ignore other ones
# (it adds too much metadata)
continue

- self.grouped_tables.add(sharded_table_key)
+ table_id = table_prefix
+ grouped_tables.add(table_prefix)

table = self.bigquery_service.tables().get(
projectId=tableRef['projectId'],
Expand Down

0 comments on commit 26a0d0a

Please sign in to comment.