diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py index 545e13f2da3ad8..2214f91e9128a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_api.py @@ -490,6 +490,8 @@ def community_get_formatted_tables( if table.get("TABLE_SCHEMA") == schemas.get("original_path"): dataset_list.append( { + "TABLE_SCHEMA": "[" + ", ".join(schemas.get("formatted_path") + [ + table.get("TABLE_NAME")]) + "]", "TABLE_NAME": table.get("TABLE_NAME"), "COLUMNS": column_dictionary.get( table.get('FULL_TABLE_PATH') @@ -527,12 +529,12 @@ def get_pattern_condition( return "" if isinstance(patterns, str): - patterns = [patterns] + patterns = [patterns.upper()] if ".*" in patterns and allow: return "" - patterns = [p for p in patterns if p != ".*"] + patterns = [p.upper() for p in patterns if p != ".*"] if not patterns: return "" @@ -541,9 +543,9 @@ def get_pattern_condition( return f"AND {operator}({field}, '{pattern_str}')" schema_field = ( - "CONCAT(REPLACE(REPLACE(REPLACE(V.PATH, ', ', '.'), '[', ''), ']', ''))" + "CONCAT(REPLACE(REPLACE(REPLACE(UPPER(TABLE_SCHEMA), ', ', '.'), '[', ''), ']', ''))" ) - table_field = "V.TABLE_NAME" + table_field = "UPPER(TABLE_NAME)" schema_condition = get_pattern_condition(self.allow_schema_pattern, schema_field) table_condition = get_pattern_condition(self.allow_dataset_pattern, table_field) @@ -907,6 +909,7 @@ def __init__( self.resource_name = dataset_details.get("TABLE_NAME") self.path = dataset_details.get("TABLE_SCHEMA")[1:-1].split(", ")[:-1] self.location_id = dataset_details.get("LOCATION_ID") + #Protect against null columns returned if dataset_details.get("COLUMNS")[0].name: self.columns = dataset_details.get("COLUMNS") @@ -923,9 +926,9 @@ def __init__( DremioEdition.ENTERPRISE, DremioEdition.CLOUD, ): + self.created = dataset_details.get("CREATED") self.owner = dataset_details.get("OWNER") self.owner_type = dataset_details.get("OWNER_TYPE") - self.created = dataset_details.get("CREATED") self.format_type = dataset_details.get("FORMAT_TYPE") self.description = api_operations.get_description_for_resource( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_aspects.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_aspects.py index 049efd08863ae6..6806a3f78fc6d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_aspects.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_aspects.py @@ -227,23 +227,34 @@ def _create_container_properties(self, container: DremioContainer) -> ContainerP env=self.env ) - def _create_browse_paths(self, entity) -> BrowsePathsV2Class: + def _create_browse_paths(self, entity) -> Optional[BrowsePathsV2Class]: + paths = [] + + if self.platform_instance: + paths.append( + BrowsePathEntryClass( + id=self.platform_instance, + ) + ) + if hasattr(entity, 'path') and entity.path: - return BrowsePathsV2Class( - path=[ + for browse_path_level in range(len(entity.path)): + paths.append( BrowsePathEntryClass( id=entity.path[browse_path_level], urn=self.get_container_urn( name=entity.container_name if hasattr(entity, 'container_name') else "", path=entity.path[:browse_path_level+1], ), - ) for browse_path_level in range(len(entity.path)) - ] - ) - return BrowsePathsV2Class(path=[]) + ) + ) - def _create_container_class(self, entity) -> ContainerClass: - if hasattr(entity, 'path') and entity.path: + if paths: + return BrowsePathsV2Class(path=paths) + return None + + def _create_container_class(self, entity) -> Optional[ContainerClass]: + if hasattr(entity, "path") and entity.path: return ContainerClass( container=self.get_container_urn( path=entity.path, @@ -269,8 +280,10 @@ def _create_dataset_properties(self, dataset: DremioDataset) -> DatasetPropertie externalUrl=self._create_external_url(dataset=dataset), created=TimeStampClass( time=round( - datetime.strptime(dataset.created, '%Y-%m-%d %H:%M:%S.%f').timestamp() * 1000 - ), + datetime.strptime( + dataset.created, '%Y-%m-%d %H:%M:%S.%f' + ).timestamp() * 1000 + ) if hasattr(dataset, "created") else 0, ), ) @@ -289,16 +302,19 @@ def _create_external_url(self, dataset: DremioDataset) -> str: return f"{self.base_url}/{container_type}/{dataset_url_path}\"{dataset.resource_name}\"" - def _create_ownership(self, dataset: DremioDataset) -> OwnershipClass: - owner = make_user_urn(dataset.owner) if dataset.owner_type == "USER" else make_group_urn(dataset.owner) - return OwnershipClass( - owners=[ - OwnerClass( - owner=owner, - type=OwnershipTypeClass.TECHNICAL_OWNER, - ) - ] - ) + def _create_ownership(self, dataset: DremioDataset) -> Optional[OwnershipClass]: + if hasattr(dataset, "owner"): + owner = make_user_urn(dataset.owner) if dataset.owner_type == "USER" else make_group_urn(dataset.owner) + return OwnershipClass( + owners=[ + OwnerClass( + owner=owner, + type=OwnershipTypeClass.TECHNICAL_OWNER, + ) + ] + ) + else: + return None def _create_glossary_terms( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py index 86cb1cc124884a..4f783e37e85dc5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_sql_queries.py @@ -1,15 +1,17 @@ class DremioSQLQueries: QUERY_DATASETS_CE = """ + SELECT * FROM + ( SELECT T.TABLE_SCHEMA, T.TABLE_NAME, - CONCAT(PATH, '.', T.TABLE_NAME) AS FULL_TABLE_PATH + CONCAT(T.TABLE_SCHEMA, '.', T.TABLE_NAME) AS FULL_TABLE_PATH, V.VIEW_DEFINITION, C.COLUMN_NAME, C.IS_NULLABLE, C.DATA_TYPE, C.COLUMN_SIZE - FROM + FROM INFORMATION_SCHEMA."TABLES" T LEFT JOIN INFORMATION_SCHEMA.VIEWS V ON V.TABLE_CATALOG = T.TABLE_CATALOG @@ -20,17 +22,21 @@ class DremioSQLQueries: AND C.TABLE_SCHEMA = T.TABLE_SCHEMA AND C.TABLE_NAME = T.TABLE_NAME WHERE - T.TYPE NOT IN ('SYSTEM_TABLE') + T.TABLE_TYPE NOT IN ('SYSTEM_TABLE') + ) + WHERE 1=1 {schema_pattern} {table_pattern} {deny_schema_pattern} {deny_table_pattern} ORDER BY - T.TABLE_SCHEMA ASC, - T.TABLE_NAME ASC + TABLE_SCHEMA ASC, + TABLE_NAME ASC """ QUERY_DATASETS_EE = """ + SELECT * FROM + ( SELECT RESOURCE_ID, V.TABLE_NAME, @@ -117,110 +123,116 @@ class DremioSQLQueries: CONCAT(C.TABLE_SCHEMA, '.', C.TABLE_NAME) WHERE V.TYPE NOT IN ('SYSTEM_TABLE') + ) + WHERE 1=1 {schema_pattern} {table_pattern} {deny_schema_pattern} {deny_table_pattern} ORDER BY - V.PATH ASC, - V.TABLE_NAME ASC + TABLE_SCHEMA ASC, + TABLE_NAME ASC """ QUERY_DATASETS_CLOUD = """ + SELECT * FROM + ( + SELECT + RESOURCE_ID, + V.TABLE_NAME, + OWNER, + PATH AS TABLE_SCHEMA, + CONCAT(REPLACE(REPLACE( + CONCAT(REPLACE(REPLACE( + REPLACE(V.PATH, ', ', '.'), + '[', ''), ']', '' + )) AS FULL_TABLE_PATH, + OWNER_TYPE, + LOCATION_ID, + VIEW_DEFINITION, + FORMAT_TYPE, + COLUMN_NAME, + ORDINAL_POSITION, + IS_NULLABLE, + DATA_TYPE, + COLUMN_SIZE, + CREATED + FROM + (SELECT + VIEW_ID AS RESOURCE_ID, + VIEW_NAME AS TABLE_NAME, + PATH, + CASE + WHEN LENGTH(SCHEMA_ID) = 0 THEN SPACE_ID + ELSE SCHEMA_ID + END AS LOCATION_ID, + OWNER_ID, + SQL_DEFINITION AS VIEW_DEFINITION, + '' AS FORMAT_TYPE, + CREATED, + TYPE + FROM + SYS.PROJECT.VIEWS + UNION ALL SELECT - RESOURCE_ID, - V.TABLE_NAME, - OWNER, - PATH AS TABLE_SCHEMA, - CONCAT(REPLACE(REPLACE( - CONCAT(REPLACE(REPLACE( - REPLACE(V.PATH, ', ', '.'), - '[', ''), ']', '' - )) AS FULL_TABLE_PATH, - OWNER_TYPE, - LOCATION_ID, - VIEW_DEFINITION, + TABLE_ID AS RESOURCE_ID, + TABLE_NAME, + PATH, + CASE + WHEN LENGTH(SCHEMA_ID) = 0 THEN SOURCE_ID + ELSE SCHEMA_ID + END AS LOCATION_ID, + OWNER_ID, + NULL AS VIEW_DEFINITION, FORMAT_TYPE, - COLUMN_NAME, - ORDINAL_POSITION, - IS_NULLABLE, - DATA_TYPE, - COLUMN_SIZE, - CREATED + CREATED, + TYPE FROM - (SELECT - VIEW_ID AS RESOURCE_ID, - VIEW_NAME AS TABLE_NAME, - PATH, - CASE - WHEN LENGTH(SCHEMA_ID) = 0 THEN SPACE_ID - ELSE SCHEMA_ID - END AS LOCATION_ID, - OWNER_ID, - SQL_DEFINITION AS VIEW_DEFINITION, - '' AS FORMAT_TYPE, - CREATED, - TYPE - FROM - SYS.PROJECT.VIEWS - UNION ALL - SELECT - TABLE_ID AS RESOURCE_ID, - TABLE_NAME, - PATH, - CASE - WHEN LENGTH(SCHEMA_ID) = 0 THEN SOURCE_ID - ELSE SCHEMA_ID - END AS LOCATION_ID, - OWNER_ID, - NULL AS VIEW_DEFINITION, - FORMAT_TYPE, - CREATED, - TYPE - FROM - SYS.PROJECT."TABLES" - ) V - LEFT JOIN - (SELECT - USER_ID AS ID, - USER_NAME AS "OWNER", - 'USER' AS OWNER_TYPE - FROM - SYS.ORGANIZATION.USERS - UNION ALL - SELECT - ROLE_ID AS ID, - ROLE_NAME AS "OWNER", - 'GROUP' AS OWNER_TYPE - FROM - SYS.ORGANIZATION.ROLES - ) U - ON - V.OWNER_ID = U.ID - LEFT JOIN - (SELECT - TABLE_SCHEMA, - TABLE_NAME, - COLUMN_NAME, - ORDINAL_POSITION, - IS_NULLABLE, - DATA_TYPE, - COLUMN_SIZE - FROM - INFORMATION_SCHEMA.COLUMNS - ) C - ON - CONCAT(REPLACE(REPLACE(REPLACE(V.PATH, ', ', '.'), '[', ''), ']', '')) = - CONCAT(C.TABLE_SCHEMA, '.', C.TABLE_NAME) - WHERE - V.TYPE NOT IN ('SYSTEM_TABLE') - {schema_pattern} - {table_pattern} - {deny_schema_pattern} - {deny_table_pattern} - ORDER BY - V.PATH ASC, - V.TABLE_NAME ASC + SYS.PROJECT."TABLES" + ) V + LEFT JOIN + (SELECT + USER_ID AS ID, + USER_NAME AS "OWNER", + 'USER' AS OWNER_TYPE + FROM + SYS.ORGANIZATION.USERS + UNION ALL + SELECT + ROLE_ID AS ID, + ROLE_NAME AS "OWNER", + 'GROUP' AS OWNER_TYPE + FROM + SYS.ORGANIZATION.ROLES + ) U + ON + V.OWNER_ID = U.ID + LEFT JOIN + (SELECT + TABLE_SCHEMA, + TABLE_NAME, + COLUMN_NAME, + ORDINAL_POSITION, + IS_NULLABLE, + DATA_TYPE, + COLUMN_SIZE + FROM + INFORMATION_SCHEMA.COLUMNS + ) C + ON + CONCAT(REPLACE(REPLACE(REPLACE(V.PATH, ', ', '.'), '[', ''), ']', '')) = + CONCAT(C.TABLE_SCHEMA, '.', C.TABLE_NAME) + WHERE + V.TYPE NOT IN ('SYSTEM_TABLE') + ) + WHERE 1=1 + {schema_pattern} + {table_pattern} + {deny_schema_pattern} + {deny_table_pattern} + ORDER BY + TABLE_SCHEMA ASC, + TABLE_NAME ASC """ QUERY_ALL_JOBS = """