Skip to content

Commit

Permalink
fix(ingest/azure-ad): limit the size of the ingestion report (#12498)
Browse files: browse the repository at this point in the history
  • Loading branch information
hsheth2 authored Jan 31, 2025
1 parent 317b740 commit 1e0f993
Showing 1 changed file with 6 additions and 14 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,6 +13,7 @@

from datahub.configuration.common import AllowDenyPattern
from datahub.configuration.source_common import DatasetSourceConfigMixin
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.emitter.mce_builder import make_group_urn, make_user_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
Expand Down Expand Up @@ -51,6 +52,7 @@
OriginTypeClass,
StatusClass,
)
from datahub.utilities.lossy_collections import LossyList

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -132,11 +134,7 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
description="regex patterns for groups to include in ingestion.",
)

# If enabled, report will contain names of filtered users and groups.
filtered_tracking: bool = Field(
default=True,
description="If enabled, report will contain names of filtered users and groups.",
)
_remove_filtered_tracking = pydantic_removed_field("filtered_tracking")

# Optional: Whether to mask sensitive information from workunit IDs. On by default.
mask_group_id: bool = Field(
Expand All @@ -156,14 +154,10 @@ class AzureADConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):

@dataclass
class AzureADSourceReport(StaleEntityRemovalSourceReport):
filtered: List[str] = field(default_factory=list)
filtered_tracking: bool = field(default=True, repr=False)
filtered_count: int = field(default=0)
filtered: LossyList[str] = field(default_factory=LossyList)

def report_filtered(self, name: str) -> None:
self.filtered_count += 1
if self.filtered_tracking:
self.filtered.append(name)
self.filtered.append(name)


# Source that extracts Azure AD users, groups and group memberships using Microsoft Graph REST API
Expand Down Expand Up @@ -266,9 +260,7 @@ def create(cls, config_dict, ctx):
def __init__(self, config: AzureADConfig, ctx: PipelineContext):
super().__init__(config, ctx)
self.config = config
self.report = AzureADSourceReport(
filtered_tracking=self.config.filtered_tracking
)
self.report = AzureADSourceReport()
session = requests.Session()
retries = Retry(
total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]
Expand Down

0 comments on commit 1e0f993

Please sign in to comment.