diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index dbcc7da8467035..1e5fec96867316 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -25,7 +25,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #11484 - Rest API authorization enabled by default - #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases. - #11619 - schema field/column paths can no longer be empty strings -- #11619 - schema field/column paths can no longer be duplicated within the schema +- #11619 - schema field/column paths can no longer be duplicated within the schema +- #11570 - The `DatahubClientConfig`'s server field no longer defaults to `http://localhost:8080`. Be sure to explicitly set this. +- #11570 - If a `datahub_api` is explicitly passed to a stateful ingestion config provider, it will be used. We previously ignored it if the pipeline context also had a graph object. ### Potential Downtime diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index df9d0fc423fcfa..a2cf159dd12f6e 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -78,12 +78,6 @@ class Constant: # Default config constants DEFAULT_DATAHUB_REST_URL = "http://localhost:8080" - # Environment variable contants - DATAHUB_REST_URL = "DATAHUB_REST_URL" - DATAHUB_ENV = "DATAHUB_ENV" - DATAHUB_PLATFORM_INSTANCE = "DATAHUB_PLATFORM_INSTANCE" - DAGSTER_UI_URL = "DAGSTER_UI_URL" - # Datahub inputs/outputs constant DATAHUB_INPUTS = "datahub.inputs" DATAHUB_OUTPUTS = "datahub.outputs" @@ -154,7 +148,6 @@ class DatasetLineage(NamedTuple): class DatahubDagsterSourceConfig(DatasetSourceConfigMixin): datahub_client_config: DatahubClientConfig = pydantic.Field( - default=DatahubClientConfig(), description="Datahub client config", ) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py index 4633014222d054..ebb2c82d952b1f 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/sensors/datahub_sensors.py @@ -1,5 +1,6 @@ import os import traceback +import warnings from collections import defaultdict from types import ModuleType from typing import Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Union @@ -38,7 +39,7 @@ from dagster._core.events import DagsterEventType, HandledOutputData, LoadedInputData from dagster._core.execution.stats import RunStepKeyStatsSnapshot from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.metadata.schema_classes import SubTypesClass from datahub.sql_parsing.sqlglot_lineage import ( SqlParsingResult, @@ -47,6 +48,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub_dagster_plugin.client.dagster_generator import ( + Constant, DagsterEnvironment, DagsterGenerator, DatahubDagsterSourceConfig, @@ -182,7 +184,17 @@ def __init__( if config: self.config = config else: - self.config = DatahubDagsterSourceConfig() + # This is a temporary warning for backwards compatibility. Eventually, we'll remove this + # branch and make the config required. + warnings.warn( + "Using the default DataHub client config is deprecated. Pass in a config object explicitly.", + stacklevel=2, + ) + self.config = DatahubDagsterSourceConfig( + datahub_client_config=DatahubClientConfig( + server=Constant.DEFAULT_DATAHUB_REST_URL + ) + ) self.graph = DataHubGraph( self.config.datahub_client_config, ) diff --git a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py index 8f58fa469a7d96..2097922c151366 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py +++ b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py @@ -117,7 +117,7 @@ class DataProduct(ConfigModel): @pydantic.validator("assets", each_item=True) def assets_must_be_urns(cls, v: str) -> str: try: - Urn.create_from_string(v) + Urn.from_string(v) except Exception as e: raise ValueError(f"asset {v} is not an urn: {e}") from e diff --git a/metadata-ingestion/src/datahub/ingestion/graph/config.py b/metadata-ingestion/src/datahub/ingestion/graph/config.py index cf0ec45b71458c..5f269e14e1a4af 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/config.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/config.py @@ -8,7 +8,7 @@ class DatahubClientConfig(ConfigModel): # TODO: Having a default for the server doesn't make a ton of sense. This should be handled # by callers / the CLI, but the actual client should not have any magic. - server: str = "http://localhost:8080" + server: str token: Optional[str] = None timeout_sec: Optional[int] = None retry_status_codes: Optional[List[int]] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py index 442abb3aaf4cf8..8f4a53ffc3ed58 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py @@ -17,7 +17,7 @@ class DatahubIngestionStateProviderConfig(IngestionCheckpointingProviderConfig): - datahub_api: DatahubClientConfig = DatahubClientConfig() + datahub_api: Optional[DatahubClientConfig] = None class DatahubIngestionCheckpointingProvider(IngestionCheckpointingProviderBase): @@ -31,8 +31,8 @@ def __init__( self.graph = graph if not self._is_server_stateful_ingestion_capable(): raise ConfigurationError( - "Datahub server is not capable of supporting stateful ingestion." - " Please consider upgrading to the latest server version to use this feature." + "Datahub server is not capable of supporting stateful ingestion. " + "Please consider upgrading to the latest server version to use this feature." ) @classmethod @@ -40,11 +40,15 @@ def create( cls, config_dict: Dict[str, Any], ctx: PipelineContext ) -> "DatahubIngestionCheckpointingProvider": config = DatahubIngestionStateProviderConfig.parse_obj(config_dict) - if ctx.graph: - # Use the pipeline-level graph if set + if config.datahub_api is not None: + return cls(DataHubGraph(config.datahub_api)) + elif ctx.graph: + # Use the pipeline-level graph if set. return cls(ctx.graph) else: - return cls(DataHubGraph(config.datahub_api)) + raise ValueError( + "A graph instance is required. Either pass one in the pipeline context, or set it explicitly in the stateful ingestion provider config." + ) def _is_server_stateful_ingestion_capable(self) -> bool: server_config = self.graph.get_config() if self.graph else None diff --git a/metadata-ingestion/tests/conftest.py b/metadata-ingestion/tests/conftest.py index d0716e34ee2b6f..db025e7f806c06 100644 --- a/metadata-ingestion/tests/conftest.py +++ b/metadata-ingestion/tests/conftest.py @@ -25,7 +25,10 @@ docker_compose_command, docker_compose_runner, ) -from tests.test_helpers.state_helpers import mock_datahub_graph # noqa: F401,E402 +from tests.test_helpers.state_helpers import ( # noqa: F401,E402 + mock_datahub_graph, + mock_datahub_graph_instance, +) try: # See https://github.com/spulec/freezegun/issues/98#issuecomment-590553475. diff --git a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py index b6e1aca4d4fedb..73b90df65c04fe 100644 --- a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py +++ b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py @@ -3,7 +3,6 @@ import pytest from freezegun import freeze_time -from datahub.ingestion.graph.client import DatahubClientConfig from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.metadata import business_glossary from tests.test_helpers import mce_helpers @@ -41,7 +40,12 @@ def get_default_recipe( @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_glossary_ingest( - mock_datahub_graph, pytestconfig, tmp_path, mock_time, enable_auto_id, golden_file + mock_datahub_graph_instance, + pytestconfig, + tmp_path, + mock_time, + enable_auto_id, + golden_file, ): test_resources_dir = pytestconfig.rootpath / "tests/integration/business-glossary" @@ -55,9 +59,7 @@ def test_glossary_ingest( enable_auto_id=enable_auto_id, ) ) - pipeline.ctx.graph = mock_datahub_graph( - DatahubClientConfig() - ) # Mock to resolve domain + pipeline.ctx.graph = mock_datahub_graph_instance pipeline.run() pipeline.raise_from_status() diff --git a/metadata-ingestion/tests/test_helpers/state_helpers.py b/metadata-ingestion/tests/test_helpers/state_helpers.py index 76f2ab283790fb..f68aef742fc730 100644 --- a/metadata-ingestion/tests/test_helpers/state_helpers.py +++ b/metadata-ingestion/tests/test_helpers/state_helpers.py @@ -1,5 +1,5 @@ import types -from typing import Any, Dict, Optional, Type, cast +from typing import Any, Callable, Dict, Optional, Type, cast from unittest.mock import MagicMock, create_autospec import pytest @@ -10,6 +10,7 @@ IngestionCheckpointingProviderBase, ) from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.graph.config import DatahubClientConfig from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState @@ -101,6 +102,13 @@ def monkey_patch_get_latest_timeseries_value( return mock_datahub_graph_ctx.mock_graph +@pytest.fixture +def mock_datahub_graph_instance( + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph] +) -> DataHubGraph: + return mock_datahub_graph(DatahubClientConfig(server="http://fake.domain.local")) + + def get_current_checkpoint_from_pipeline( pipeline: Pipeline, ) -> Optional[Checkpoint[GenericCheckpointState]]: diff --git a/metadata-ingestion/tests/unit/graph/test_client.py b/metadata-ingestion/tests/unit/graph/test_client.py index faed1f51b29aa1..16795ef8c7f814 100644 --- a/metadata-ingestion/tests/unit/graph/test_client.py +++ b/metadata-ingestion/tests/unit/graph/test_client.py @@ -11,7 +11,7 @@ @patch("datahub.emitter.rest_emitter.DataHubRestEmitter.test_connection") def test_get_aspect(mock_test_connection): mock_test_connection.return_value = {} - graph = DataHubGraph(DatahubClientConfig()) + graph = DataHubGraph(DatahubClientConfig(server="http://fake-domain.local")) user_urn = "urn:li:corpuser:foo" with patch("requests.Session.get") as mock_get: mock_response = Mock() diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py index be6efd3e121ff1..85c86f8d205d9a 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/state/test_redundant_run_skip_handler.py @@ -12,17 +12,11 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StatefulStaleMetadataRemovalConfig, ) -from datahub.ingestion.source.state.stateful_ingestion_base import ( - DynamicTypedStateProviderConfig, -) from datahub.ingestion.source.state.usage_common_state import ( BaseTimeWindowCheckpointState, ) from datahub.utilities.time import datetime_to_ts_millis -GMS_PORT = 8080 -GMS_SERVER = f"http://localhost:{GMS_PORT}" - @pytest.fixture def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Source]: @@ -39,9 +33,7 @@ def stateful_source(mock_datahub_graph: DataHubGraph) -> Iterable[SnowflakeV2Sou password="TST_PWD", stateful_ingestion=StatefulStaleMetadataRemovalConfig( enabled=True, - state_provider=DynamicTypedStateProviderConfig( - type="datahub", config={"datahub_api": {"server": GMS_SERVER}} - ), + # Uses the graph from the pipeline context. ), ) diff --git a/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py b/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py index 0e6d60e3440b20..ba40962866f8cc 100644 --- a/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py +++ b/metadata-ingestion/tests/unit/stateful_ingestion/test_configs.py @@ -14,16 +14,12 @@ ) # 0. Common client configs. -datahub_client_configs: Dict[str, Any] = { - "full": { - "server": "http://localhost:8080", - "token": "dummy_test_tok", - "timeout_sec": 10, - "extra_headers": {}, - "max_threads": 10, - }, - "simple": {}, - "default": {}, +datahub_client_full_config = { + "server": "http://localhost:8080", + "token": "dummy_test_tok", + "timeout_sec": 10, + "extra_headers": {}, + "max_threads": 10, } @@ -41,7 +37,7 @@ "checkpointing_valid_full_config": ( DatahubIngestionStateProviderConfig, { - "datahub_api": datahub_client_configs["full"], + "datahub_api": datahub_client_full_config, }, DatahubIngestionStateProviderConfig( # This test verifies that the max_threads arg is ignored. @@ -57,27 +53,14 @@ ), False, ), - # Simple config - "checkpointing_valid_simple_config": ( - DatahubIngestionStateProviderConfig, - { - "datahub_api": datahub_client_configs["simple"], - }, - DatahubIngestionStateProviderConfig( - datahub_api=DatahubClientConfig( - server="http://localhost:8080", - ), - ), - False, - ), # Default "checkpointing_default": ( DatahubIngestionStateProviderConfig, { - "datahub_api": datahub_client_configs["default"], + "datahub_api": None, }, DatahubIngestionStateProviderConfig( - datahub_api=DatahubClientConfig(), + datahub_api=None, ), False, ), @@ -102,7 +85,7 @@ "max_checkpoint_state_size": 1024, "state_provider": { "type": "datahub", - "config": datahub_client_configs["full"], + "config": datahub_client_full_config, }, "ignore_old_state": True, "ignore_new_state": True, @@ -114,7 +97,7 @@ ignore_new_state=True, state_provider=DynamicTypedStateProviderConfig( type="datahub", - config=datahub_client_configs["full"], + config=datahub_client_full_config, ), ), False, diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index 45b9899eacaa77..eb1e7f3fe41d9a 100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, Callable, Dict, Optional, Tuple, Type, cast +from typing import Any, Dict, Optional, Tuple, Type, cast from unittest.mock import patch import pydantic @@ -11,7 +11,7 @@ import datahub.metadata.schema_classes as models from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields -from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.sink.file import write_metadata_file from datahub.ingestion.source.aws.glue import ( GlueProfilingConfig, @@ -74,7 +74,7 @@ def glue_source( platform_instance: Optional[str] = None, - mock_datahub_graph: Optional[Callable[[DatahubClientConfig], DataHubGraph]] = None, + mock_datahub_graph_instance: Optional[DataHubGraph] = None, use_s3_bucket_tags: bool = True, use_s3_object_tags: bool = True, extract_delta_schema_from_parameters: bool = False, @@ -83,8 +83,8 @@ def glue_source( extract_transforms: bool = True, ) -> GlueSource: pipeline_context = PipelineContext(run_id="glue-source-tes") - if mock_datahub_graph: - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + if mock_datahub_graph_instance: + pipeline_context.graph = mock_datahub_graph_instance return GlueSource( ctx=pipeline_context, config=GlueSourceConfig( @@ -493,14 +493,14 @@ def test_glue_with_malformed_delta_schema_ingest( def test_glue_ingest_include_table_lineage( tmp_path: Path, pytestconfig: PytestConfig, - mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, mce_golden_file: str, ) -> None: glue_source_instance = glue_source( platform_instance=platform_instance, - mock_datahub_graph=mock_datahub_graph, + mock_datahub_graph_instance=mock_datahub_graph_instance, emit_s3_lineage=True, ) @@ -589,14 +589,14 @@ def test_glue_ingest_include_table_lineage( def test_glue_ingest_include_column_lineage( tmp_path: Path, pytestconfig: PytestConfig, - mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + mock_datahub_graph_instance: DataHubGraph, platform_instance: str, mce_file: str, mce_golden_file: str, ) -> None: glue_source_instance = glue_source( platform_instance=platform_instance, - mock_datahub_graph=mock_datahub_graph, + mock_datahub_graph_instance=mock_datahub_graph_instance, emit_s3_lineage=True, include_column_lineage=True, use_s3_bucket_tags=False, diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 46c6390b184d36..2e2e85b5d18113 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -1,17 +1,7 @@ import json import re from datetime import datetime, timezone -from typing import ( - Any, - Callable, - Dict, - List, - MutableSequence, - Optional, - Type, - Union, - cast, -) +from typing import Any, Dict, List, MutableSequence, Optional, Type, Union, cast from unittest import mock from uuid import uuid4 @@ -24,7 +14,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api import workunit from datahub.ingestion.api.common import EndOfStream, PipelineContext, RecordEnvelope -from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.transformer.add_dataset_browse_path import ( AddDatasetBrowsePathTransformer, @@ -1106,7 +1096,7 @@ def test_pattern_dataset_ownership_with_invalid_type_transformation(mock_time): def test_pattern_container_and_dataset_ownership_transformation( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): def fake_get_aspect( entity_urn: str, @@ -1127,7 +1117,7 @@ def fake_get_aspect( pipeline_context = PipelineContext( run_id="test_pattern_container_and_dataset_ownership_transformation" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_aspect = fake_get_aspect # type: ignore # No owner aspect for the first dataset @@ -1240,7 +1230,7 @@ def fake_get_aspect( def test_pattern_container_and_dataset_ownership_with_no_container( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): def fake_get_aspect( entity_urn: str, @@ -1252,7 +1242,7 @@ def fake_get_aspect( pipeline_context = PipelineContext( run_id="test_pattern_container_and_dataset_ownership_with_no_container" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_aspect = fake_get_aspect # type: ignore # No owner aspect for the first dataset @@ -1357,7 +1347,7 @@ def fake_get_aspect( def test_pattern_container_and_dataset_ownership_with_no_match( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): def fake_get_aspect( entity_urn: str, @@ -1375,7 +1365,7 @@ def fake_get_aspect( pipeline_context = PipelineContext( run_id="test_pattern_container_and_dataset_ownership_with_no_match" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_aspect = fake_get_aspect # type: ignore # No owner aspect for the first dataset @@ -1598,10 +1588,10 @@ def run_simple_add_dataset_properties_transformer_semantics( semantics: TransformerSemantics, new_properties: dict, server_properties: dict, - mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + mock_datahub_graph_instance: DataHubGraph, ) -> List[RecordEnvelope]: pipeline_context = PipelineContext(run_id="test_pattern_dataset_schema_terms") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # fake the server response def fake_dataset_properties(entity_urn: str) -> models.DatasetPropertiesClass: @@ -1624,7 +1614,7 @@ def fake_dataset_properties(entity_urn: str) -> models.DatasetPropertiesClass: return output -def test_simple_add_dataset_properties_overwrite(mock_datahub_graph): +def test_simple_add_dataset_properties_overwrite(mock_datahub_graph_instance): new_properties = {"new-simple-property": "new-value"} server_properties = {"p1": "value1"} @@ -1632,7 +1622,7 @@ def test_simple_add_dataset_properties_overwrite(mock_datahub_graph): semantics=TransformerSemantics.OVERWRITE, new_properties=new_properties, server_properties=server_properties, - mock_datahub_graph=mock_datahub_graph, + mock_datahub_graph_instance=mock_datahub_graph_instance, ) assert len(output) == 2 @@ -1648,7 +1638,7 @@ def test_simple_add_dataset_properties_overwrite(mock_datahub_graph): } -def test_simple_add_dataset_properties_patch(mock_datahub_graph): +def test_simple_add_dataset_properties_patch(mock_datahub_graph_instance): new_properties = {"new-simple-property": "new-value"} server_properties = {"p1": "value1"} @@ -1656,7 +1646,7 @@ def test_simple_add_dataset_properties_patch(mock_datahub_graph): semantics=TransformerSemantics.PATCH, new_properties=new_properties, server_properties=server_properties, - mock_datahub_graph=mock_datahub_graph, + mock_datahub_graph_instance=mock_datahub_graph_instance, ) assert len(output) == 2 @@ -2334,24 +2324,24 @@ def run_container_transformer_pipeline( return outputs -def test_simple_add_dataset_domain_aspect_name(mock_datahub_graph): +def test_simple_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance transformer = SimpleAddDatasetDomain.create({"domains": []}, pipeline_context) assert transformer.aspect_name() == models.DomainsClass.ASPECT_NAME -def test_simple_add_dataset_domain(mock_datahub_graph): +def test_simple_add_dataset_domain(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=SimpleAddDatasetDomain, @@ -2372,14 +2362,14 @@ def test_simple_add_dataset_domain(mock_datahub_graph): assert acryl_domain in transformed_aspect.domains -def test_simple_add_dataset_domain_mce_support(mock_datahub_graph): +def test_simple_add_dataset_domain_mce_support(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=SimpleAddDatasetDomain, @@ -2403,14 +2393,14 @@ def test_simple_add_dataset_domain_mce_support(mock_datahub_graph): assert acryl_domain in transformed_aspect.domains -def test_simple_add_dataset_domain_replace_existing(mock_datahub_graph): +def test_simple_add_dataset_domain_replace_existing(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=SimpleAddDatasetDomain, @@ -2431,13 +2421,13 @@ def test_simple_add_dataset_domain_replace_existing(mock_datahub_graph): assert acryl_domain in transformed_aspect.domains -def test_simple_add_dataset_domain_semantics_overwrite(mock_datahub_graph): +def test_simple_add_dataset_domain_semantics_overwrite(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") server_domain = builder.make_domain_urn("test.io") pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_domain(entity_urn: str) -> models.DomainsClass: @@ -2469,14 +2459,14 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: def test_simple_add_dataset_domain_semantics_patch( - pytestconfig, tmp_path, mock_time, mock_datahub_graph + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance ): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") server_domain = builder.make_domain_urn("test.io") pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_domain(entity_urn: str) -> models.DomainsClass: @@ -2508,11 +2498,11 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains -def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph): +def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance transformer = PatternAddDatasetDomain.create( {"domain_pattern": {"rules": {}}}, pipeline_context @@ -2520,7 +2510,7 @@ def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph): assert transformer.aspect_name() == models.DomainsClass.ASPECT_NAME -def test_pattern_add_dataset_domain_match(mock_datahub_graph): +def test_pattern_add_dataset_domain_match(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pattern = "urn:li:dataset:\\(urn:li:dataPlatform:bigquery,.*" @@ -2528,7 +2518,7 @@ def test_pattern_add_dataset_domain_match(mock_datahub_graph): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=PatternAddDatasetDomain, @@ -2551,7 +2541,7 @@ def test_pattern_add_dataset_domain_match(mock_datahub_graph): assert acryl_domain in transformed_aspect.domains -def test_pattern_add_dataset_domain_no_match(mock_datahub_graph): +def test_pattern_add_dataset_domain_no_match(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pattern = "urn:li:dataset:\\(urn:li:dataPlatform:invalid,.*" @@ -2559,7 +2549,7 @@ def test_pattern_add_dataset_domain_no_match(mock_datahub_graph): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=PatternAddDatasetDomain, @@ -2582,7 +2572,7 @@ def test_pattern_add_dataset_domain_no_match(mock_datahub_graph): assert acryl_domain not in transformed_aspect.domains -def test_pattern_add_dataset_domain_replace_existing_match(mock_datahub_graph): +def test_pattern_add_dataset_domain_replace_existing_match(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pattern = "urn:li:dataset:\\(urn:li:dataPlatform:bigquery,.*" @@ -2590,7 +2580,7 @@ def test_pattern_add_dataset_domain_replace_existing_match(mock_datahub_graph): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=PatternAddDatasetDomain, @@ -2614,7 +2604,9 @@ def test_pattern_add_dataset_domain_replace_existing_match(mock_datahub_graph): assert acryl_domain in transformed_aspect.domains -def test_pattern_add_dataset_domain_replace_existing_no_match(mock_datahub_graph): +def test_pattern_add_dataset_domain_replace_existing_no_match( + mock_datahub_graph_instance, +): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pattern = "urn:li:dataset:\\(urn:li:dataPlatform:invalid,.*" @@ -2622,7 +2614,7 @@ def test_pattern_add_dataset_domain_replace_existing_no_match(mock_datahub_graph pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=PatternAddDatasetDomain, @@ -2644,14 +2636,14 @@ def test_pattern_add_dataset_domain_replace_existing_no_match(mock_datahub_graph assert len(transformed_aspect.domains) == 0 -def test_pattern_add_dataset_domain_semantics_overwrite(mock_datahub_graph): +def test_pattern_add_dataset_domain_semantics_overwrite(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") server_domain = builder.make_domain_urn("test.io") pattern = "urn:li:dataset:\\(urn:li:dataPlatform:bigquery,.*" pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_domain(entity_urn: str) -> models.DomainsClass: @@ -2683,7 +2675,7 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: def test_pattern_add_dataset_domain_semantics_patch( - pytestconfig, tmp_path, mock_time, mock_datahub_graph + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance ): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") @@ -2691,7 +2683,7 @@ def test_pattern_add_dataset_domain_semantics_patch( pattern = "urn:li:dataset:\\(urn:li:dataPlatform:bigquery,.*" pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_domain(entity_urn: str) -> models.DomainsClass: @@ -2723,9 +2715,11 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains -def test_simple_dataset_ownership_transformer_semantics_patch(mock_datahub_graph): +def test_simple_dataset_ownership_transformer_semantics_patch( + mock_datahub_graph_instance, +): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance server_owner: str = builder.make_owner_urn( "mohd@acryl.io", owner_type=builder.OwnerType.USER @@ -2783,7 +2777,9 @@ def fake_ownership_class(entity_urn: str) -> models.OwnershipClass: assert server_owner in owner_urns -def test_pattern_container_and_dataset_domain_transformation(mock_datahub_graph): +def test_pattern_container_and_dataset_domain_transformation( + mock_datahub_graph_instance, +): datahub_domain = builder.make_domain_urn("datahubproject.io") acryl_domain = builder.make_domain_urn("acryl_domain") server_domain = builder.make_domain_urn("server_domain") @@ -2807,7 +2803,7 @@ def fake_get_aspect( pipeline_context = PipelineContext( run_id="test_pattern_container_and_dataset_domain_transformation" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_aspect = fake_get_aspect # type: ignore with_domain_aspect = make_generic_dataset_mcp( @@ -2889,7 +2885,7 @@ def fake_get_aspect( def test_pattern_container_and_dataset_domain_transformation_with_no_container( - mock_datahub_graph, + mock_datahub_graph_instance, ): datahub_domain = builder.make_domain_urn("datahubproject.io") acryl_domain = builder.make_domain_urn("acryl_domain") @@ -2905,7 +2901,7 @@ def fake_get_aspect( pipeline_context = PipelineContext( run_id="test_pattern_container_and_dataset_domain_transformation_with_no_container" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_aspect = fake_get_aspect # type: ignore with_domain_aspect = make_generic_dataset_mcp( @@ -2955,7 +2951,7 @@ def fake_get_aspect( assert server_domain in second_domain_aspect.domains -def test_pattern_add_container_dataset_domain_no_match(mock_datahub_graph): +def test_pattern_add_container_dataset_domain_no_match(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") datahub_domain = builder.make_domain_urn("datahubproject.io") pattern = "urn:li:dataset:\\(urn:li:dataPlatform:invalid,.*" @@ -2963,7 +2959,7 @@ def test_pattern_add_container_dataset_domain_no_match(mock_datahub_graph): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance def fake_get_aspect( entity_urn: str, @@ -3003,10 +2999,10 @@ def fake_get_aspect( def run_pattern_dataset_schema_terms_transformation_semantics( semantics: TransformerSemantics, - mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + mock_datahub_graph_instance: DataHubGraph, ) -> List[RecordEnvelope]: pipeline_context = PipelineContext(run_id="test_pattern_dataset_schema_terms") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # fake the server response def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: @@ -3113,10 +3109,10 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: def test_pattern_dataset_schema_terms_transformation_patch( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): output = run_pattern_dataset_schema_terms_transformation_semantics( - TransformerSemantics.PATCH, mock_datahub_graph + TransformerSemantics.PATCH, mock_datahub_graph_instance ) assert len(output) == 2 # Check that glossary terms were added. @@ -3146,10 +3142,10 @@ def test_pattern_dataset_schema_terms_transformation_patch( def test_pattern_dataset_schema_terms_transformation_overwrite( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): output = run_pattern_dataset_schema_terms_transformation_semantics( - TransformerSemantics.OVERWRITE, mock_datahub_graph + TransformerSemantics.OVERWRITE, mock_datahub_graph_instance ) assert len(output) == 2 @@ -3181,10 +3177,10 @@ def test_pattern_dataset_schema_terms_transformation_overwrite( def run_pattern_dataset_schema_tags_transformation_semantics( semantics: TransformerSemantics, - mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], + mock_datahub_graph_instance: DataHubGraph, ) -> List[RecordEnvelope]: pipeline_context = PipelineContext(run_id="test_pattern_dataset_schema_terms") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # fake the server response def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: @@ -3284,10 +3280,10 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: def test_pattern_dataset_schema_tags_transformation_overwrite( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): output = run_pattern_dataset_schema_tags_transformation_semantics( - TransformerSemantics.OVERWRITE, mock_datahub_graph + TransformerSemantics.OVERWRITE, mock_datahub_graph_instance ) assert len(output) == 2 @@ -3318,10 +3314,10 @@ def test_pattern_dataset_schema_tags_transformation_overwrite( def test_pattern_dataset_schema_tags_transformation_patch( - mock_time, mock_datahub_graph + mock_time, mock_datahub_graph_instance ): output = run_pattern_dataset_schema_tags_transformation_semantics( - TransformerSemantics.PATCH, mock_datahub_graph + TransformerSemantics.PATCH, mock_datahub_graph_instance ) assert len(output) == 2 @@ -3542,9 +3538,11 @@ def fake_ownership_class(entity_urn: str) -> models.OwnershipClass: assert set(out_owners) == set(cleaned_owner_urn) -def test_clean_owner_urn_transformation_remove_fixed_string(mock_datahub_graph): +def test_clean_owner_urn_transformation_remove_fixed_string( + mock_datahub_graph_instance, +): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3581,9 +3579,11 @@ def test_clean_owner_urn_transformation_remove_fixed_string(mock_datahub_graph): _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) -def test_clean_owner_urn_transformation_remove_multiple_values(mock_datahub_graph): +def test_clean_owner_urn_transformation_remove_multiple_values( + mock_datahub_graph_instance, +): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3620,9 +3620,11 @@ def test_clean_owner_urn_transformation_remove_multiple_values(mock_datahub_grap _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) -def test_clean_owner_urn_transformation_remove_values_using_regex(mock_datahub_graph): +def test_clean_owner_urn_transformation_remove_values_using_regex( + mock_datahub_graph_instance, +): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3659,9 +3661,9 @@ def test_clean_owner_urn_transformation_remove_values_using_regex(mock_datahub_g _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) -def test_clean_owner_urn_transformation_remove_digits(mock_datahub_graph): +def test_clean_owner_urn_transformation_remove_digits(mock_datahub_graph_instance): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3698,9 +3700,9 @@ def test_clean_owner_urn_transformation_remove_digits(mock_datahub_graph): _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) -def test_clean_owner_urn_transformation_remove_pattern(mock_datahub_graph): +def test_clean_owner_urn_transformation_remove_pattern(mock_datahub_graph_instance): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3738,10 +3740,10 @@ def test_clean_owner_urn_transformation_remove_pattern(mock_datahub_graph): def test_clean_owner_urn_transformation_remove_word_in_capital_letters( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3781,10 +3783,10 @@ def test_clean_owner_urn_transformation_remove_word_in_capital_letters( def test_clean_owner_urn_transformation_remove_pattern_with_alphanumeric_value( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3822,10 +3824,10 @@ def test_clean_owner_urn_transformation_remove_pattern_with_alphanumeric_value( def test_clean_owner_urn_transformation_should_not_remove_system_identifier( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance user_emails = [ "ABCDEF:email_id@example.com", @@ -3850,12 +3852,12 @@ def test_clean_owner_urn_transformation_should_not_remove_system_identifier( def test_replace_external_url_word_replace( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_replace_external_url" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=ReplaceExternalUrlDataset, @@ -3877,12 +3879,12 @@ def test_replace_external_url_word_replace( def test_replace_external_regex_replace_1( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_replace_external_url" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=ReplaceExternalUrlDataset, @@ -3904,12 +3906,12 @@ def test_replace_external_regex_replace_1( def test_replace_external_regex_replace_2( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_replace_external_url" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_dataset_transformer_pipeline( transformer_type=ReplaceExternalUrlDataset, @@ -3931,12 +3933,12 @@ def test_replace_external_regex_replace_2( def test_pattern_cleanup_usage_statistics_user_1( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_pattern_cleanup_usage_statistics_user" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance TS_1 = datetime(year=2023, month=1, day=1, tzinfo=timezone.utc) @@ -3985,12 +3987,12 @@ def test_pattern_cleanup_usage_statistics_user_1( def test_pattern_cleanup_usage_statistics_user_2( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_pattern_cleanup_usage_statistics_user" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance TS_1 = datetime(year=2023, month=1, day=1, tzinfo=timezone.utc) @@ -4039,12 +4041,12 @@ def test_pattern_cleanup_usage_statistics_user_2( def test_pattern_cleanup_usage_statistics_user_3( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_pattern_cleanup_usage_statistics_user" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance TS_1 = datetime(year=2023, month=1, day=1, tzinfo=timezone.utc) @@ -4092,7 +4094,7 @@ def test_pattern_cleanup_usage_statistics_user_3( assert output[0].record.aspect.userCounts == expectedUsageStatistics.userCounts -def test_domain_mapping_based_on_tags_with_valid_tags(mock_datahub_graph): +def test_domain_mapping_based_on_tags_with_valid_tags(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") server_domain = builder.make_domain_urn("test.io") @@ -4103,7 +4105,7 @@ def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: return models.GlobalTagsClass(tags=[TagAssociationClass(tag=tag_one)]) pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_tags = fake_get_tags # type: ignore @@ -4126,13 +4128,15 @@ def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: assert server_domain not in transformed_aspect.domains -def test_domain_mapping_based_on_tags_with_no_matching_tags(mock_datahub_graph): +def test_domain_mapping_based_on_tags_with_no_matching_tags( + mock_datahub_graph_instance, +): acryl_domain = builder.make_domain_urn("acryl.io") server_domain = builder.make_domain_urn("test.io") non_matching_tag = builder.make_tag_urn("nonMatching") pipeline_context = PipelineContext(run_id="no_match_pipeline") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: @@ -4157,11 +4161,11 @@ def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: assert server_domain in transformed_aspect.domains -def test_domain_mapping_based_on_tags_with_empty_config(mock_datahub_graph): +def test_domain_mapping_based_on_tags_with_empty_config(mock_datahub_graph_instance): some_tag = builder.make_tag_urn("someTag") pipeline_context = PipelineContext(run_id="empty_config_pipeline") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: @@ -4180,7 +4184,9 @@ def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: assert len(output[0].record.aspect.domains) == 0 -def test_domain_mapping_based__r_on_tags_with_multiple_tags(mock_datahub_graph): +def test_domain_mapping_based__r_on_tags_with_multiple_tags( + mock_datahub_graph_instance, +): # Two tags that match different rules in the domain mapping configuration tag_one = builder.make_tag_urn("test:tag_1") tag_two = builder.make_tag_urn("test:tag_2") @@ -4189,7 +4195,7 @@ def test_domain_mapping_based__r_on_tags_with_multiple_tags(mock_datahub_graph): hr = builder.make_domain_urn("hr") pipeline_context = PipelineContext(run_id="multiple_matches_pipeline") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: @@ -4226,11 +4232,11 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert len(transformed_aspect.domains) == 3 -def test_domain_mapping_based_on_tags_with_empty_tags(mock_datahub_graph): +def test_domain_mapping_based_on_tags_with_empty_tags(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") server_domain = builder.make_domain_urn("test.io") pipeline_context = PipelineContext(run_id="empty_config_pipeline") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: @@ -4254,11 +4260,11 @@ def fake_get_tags(entity_urn: str) -> models.GlobalTagsClass: assert server_domain not in transformed_aspect.domains -def test_domain_mapping_based_on_tags_with_no_tags(mock_datahub_graph): +def test_domain_mapping_based_on_tags_with_no_tags(mock_datahub_graph_instance): acryl_domain = builder.make_domain_urn("acryl.io") server_domain = builder.make_domain_urn("test.io") pipeline_context = PipelineContext(run_id="empty_config_pipeline") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance # Return fake aspect to simulate server behaviour def fake_get_tags(entity_urn: str) -> Optional[models.GlobalTagsClass]: @@ -4282,7 +4288,7 @@ def fake_get_tags(entity_urn: str) -> Optional[models.GlobalTagsClass]: assert server_domain not in transformed_aspect.domains -def test_tags_to_terms_transformation(mock_datahub_graph): +def test_tags_to_terms_transformation(mock_datahub_graph_instance): # Create domain URNs for the test term_urn_example1 = builder.make_term_urn("example1") term_urn_example2 = builder.make_term_urn("example2") @@ -4349,7 +4355,7 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: ) pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_tags = fake_get_tags # type: ignore pipeline_context.graph.get_schema_metadata = fake_schema_metadata # type: ignore @@ -4383,7 +4389,7 @@ def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: } -def test_tags_to_terms_with_no_matching_terms(mock_datahub_graph): +def test_tags_to_terms_with_no_matching_terms(mock_datahub_graph_instance): # Setup for test where no tags match the provided term mappings def fake_get_tags_no_match(entity_urn: str) -> models.GlobalTagsClass: return models.GlobalTagsClass( @@ -4394,7 +4400,7 @@ def fake_get_tags_no_match(entity_urn: str) -> models.GlobalTagsClass: ) pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_tags = fake_get_tags_no_match # type: ignore # No matching terms in config @@ -4420,13 +4426,13 @@ def fake_get_tags_no_match(entity_urn: str) -> models.GlobalTagsClass: assert len(terms_aspect.terms) == 1 -def test_tags_to_terms_with_missing_tags(mock_datahub_graph): +def test_tags_to_terms_with_missing_tags(mock_datahub_graph_instance): # Setup for test where no tags are present def fake_get_no_tags(entity_urn: str) -> models.GlobalTagsClass: return models.GlobalTagsClass(tags=[]) pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_tags = fake_get_no_tags # type: ignore config = {"tags": ["example1", "example2"]} @@ -4451,7 +4457,7 @@ def fake_get_no_tags(entity_urn: str) -> models.GlobalTagsClass: assert len(terms_aspect.terms) == 1 -def test_tags_to_terms_with_partial_match(mock_datahub_graph): +def test_tags_to_terms_with_partial_match(mock_datahub_graph_instance): # Setup for partial match scenario def fake_get_partial_match_tags(entity_urn: str) -> models.GlobalTagsClass: return models.GlobalTagsClass( @@ -4466,7 +4472,7 @@ def fake_get_partial_match_tags(entity_urn: str) -> models.GlobalTagsClass: ) pipeline_context = PipelineContext(run_id="transformer_pipe_line") - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + pipeline_context.graph = mock_datahub_graph_instance pipeline_context.graph.get_tags = fake_get_partial_match_tags # type: ignore config = {"tags": ["example1"]} # Only 'example1' has a term mapped @@ -4493,12 +4499,12 @@ def fake_get_partial_match_tags(entity_urn: str) -> models.GlobalTagsClass: def test_replace_external_url_container_word_replace( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_replace_external_url_container" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_container_transformer_pipeline( transformer_type=ReplaceExternalUrlContainer, @@ -4521,12 +4527,12 @@ def test_replace_external_url_container_word_replace( def test_replace_external_regex_container_replace_1( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_replace_external_url_container" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_container_transformer_pipeline( transformer_type=ReplaceExternalUrlContainer, @@ -4549,12 +4555,12 @@ def test_replace_external_regex_container_replace_1( def test_replace_external_regex_container_replace_2( - mock_datahub_graph, + mock_datahub_graph_instance, ): pipeline_context: PipelineContext = PipelineContext( run_id="test_replace_external_url_container" ) - pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + pipeline_context.graph = mock_datahub_graph_instance output = run_container_transformer_pipeline( transformer_type=ReplaceExternalUrlContainer, diff --git a/smoke-test/pytest.ini b/smoke-test/pytest.ini index 61ce840fd713c6..3762344cec4bea 100644 --- a/smoke-test/pytest.ini +++ b/smoke-test/pytest.ini @@ -1,3 +1,6 @@ [pytest] markers = read_only: marks tests as read only (deselect with '-m "not read_only"') + ; no_cypress_suite0: main smoke tests; expressed as the negative of the others + no_cypress_suite1: main smoke tests, suite 1 + test_run_cypress: run cypress tests diff --git a/smoke-test/tests/test_stateful_ingestion.py b/smoke-test/tests/test_stateful_ingestion.py index c0df51dd9d98e1..4436cf26c2fd78 100644 --- a/smoke-test/tests/test_stateful_ingestion.py +++ b/smoke-test/tests/test_stateful_ingestion.py @@ -65,10 +65,6 @@ def get_current_checkpoint_from_pipeline( "enabled": True, "remove_stale_metadata": True, "fail_safe_threshold": 100.0, - "state_provider": { - "type": "datahub", - "config": {"datahub_api": {"server": auth_session.gms_url()}}, - }, }, }