diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
index cbe1f6eb978247..d3d8ed62cd364c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py
@@ -292,6 +292,11 @@ def get_datasets_for_project_id(
                         if hasattr(d, "_properties") and isinstance(d._properties, dict)
                         else None
                     ),
+                    # TODO: Fetching the dataset description individually hurts overall performance when the number of datasets is high (hundreds); we should instead fetch descriptions in batch for all datasets.
+                    # TODO: Since we already call get_dataset for each dataset, we could also consume and publish other fields, such as created, modified, etc.
+                    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+                    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+                    comment=self.bq_client.get_dataset(d.reference).description,
                 )
                 for d in datasets
             ]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
index 1cace31a5e4c49..8ea414c43abb79 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py
@@ -296,6 +296,7 @@ def gen_dataset_containers(
         self,
         dataset: str,
         project_id: str,
+        description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
     ) -> Iterable[MetadataWorkUnit]:
@@ -336,6 +337,7 @@ def gen_dataset_containers(
             domain_config=self.config.domain,
             schema_container_key=schema_container_key,
             database_container_key=database_container_key,
+            description=description,
             external_url=(
                 BQ_EXTERNAL_DATASET_URL_TEMPLATE.format(
                     project=project_id, dataset=dataset
@@ -471,14 +473,15 @@ def _process_schema(
 
         if self.config.include_schema_metadata:
             yield from self.gen_dataset_containers(
-                dataset_name,
-                project_id,
-                bigquery_dataset.labels,
-                (
+                dataset=dataset_name,
+                project_id=project_id,
+                tags=bigquery_dataset.labels,
+                extra_properties=(
                     {"location": bigquery_dataset.location}
                     if bigquery_dataset.location
                     else None
                 ),
+                description=bigquery_dataset.comment,
             )
 
         columns = None