Skip to content

Commit

Permalink
fix: validate and process provided anonym inputs (#122)
Browse files Browse the repository at this point in the history
* fix: validate and process provided anonym inputs

- Validate anonymizer inputs and process in a format that is supported by integration and backend

* fix(linting): code formatting

* fix: typo for aux function

- typo
- reserved python word

* fix: linter issues

* fix(linting): code formatting

---------

Co-authored-by: Azory YData Bot <[email protected]>
  • Loading branch information
fabclmnt and azory-ydata committed Sep 6, 2024
1 parent d775d68 commit 70bcc82
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 2 deletions.
11 changes: 9 additions & 2 deletions examples/synthesizers/anonymize_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main():

# We initialize a regular synthesizer
# As long as the synthesizer does not call `fit`, it exists only locally
synth = RegularSynthesizer()
synth = RegularSynthesizer(name="Titanic")

# We define anonymization rules, which is a dictionary with format:
# {column_name: anonymization_rule, ...}
Expand All @@ -28,10 +28,17 @@ def main():
"Ticket": "[A-Z]{2}-[A-Z]{4}"
}

# Alternatively, the rules can be written in the explicit nested format
# (this assignment replaces the `rules` defined above):

rules = {
'Name': {'type': 'name'},
'Ticket': {'type': 'regex',
'regex': '[A-Z]{2}-[A-Z]{4}'}
}

# We train the synthesizer on our dataset
synth.fit(
X,
name="titanic_synthesizer",
anonymize=rules
)

Expand Down
47 changes: 47 additions & 0 deletions src/ydata/sdk/synthesizers/anonymizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Validate and process the payload for the synthesizer's anonymizer.
"""

from ydata.datascience.common import AnonymizerType


def build_and_validate_anonimization(anonimyze: dict, cols: list) -> dict:
    """Validate an anonymization configuration and normalise it to nested form.

    Each column rule may use either of the following formats, and both may be
    mixed within the same configuration:

    * ``{'col_name': {'type': 'anonymization_method', **kwargs}}``
    * ``{'col_name': 'anonymization_method'}`` -- when
      ``AnonymizerType.get_anonymizer_type`` returns ``None`` for the string,
      the string is used as a regular expression instead.

    Args:
        anonimyze: Mapping of column name to anonymization rule. The mapping
            and its nested dictionaries are left untouched.
        cols: Names of the columns available in the dataset.

    Returns:
        A new dictionary mapping every configured column to a dictionary whose
        ``'type'`` entry holds the ``.value`` of the resolved
        ``AnonymizerType`` member; any extra keys of a nested rule are kept.

    Raises:
        ValueError: If a configured key is not one of ``cols``, if a nested
            rule has no ``'type'`` entry, or if a nested rule's ``'type'`` is
            not resolved by ``AnonymizerType.get_anonymizer_type``.
    """
    unknown_cols = [name for name in anonimyze if name not in cols]
    if unknown_cols:
        # AnonymizationConfigurationError
        raise ValueError(
            'The keys in your configuration must exactly match the column '
            'names in the provided dataset. Please check and update your '
            'inputs to ensure they align. '
            f'Unknown columns: {unknown_cols}')

    format_help = (
        "The provided configuration is not correct. Make sure that your "
        "anonymization config follow one of the following formats:\n"
        "{\n"
        "    'col_name': {'type': 'anonymization_method', kwargs**}\n"
        "} or\n"
        "{\n"
        "    'col_name': 'anonymization_method'\n"
        "}\n"
    )

    config = {}
    for name, rule in anonimyze.items():
        if isinstance(rule, dict):
            if 'type' not in rule:
                raise ValueError(format_help)

            anon_type = AnonymizerType.get_anonymizer_type(rule['type'])
            if anon_type is None:
                # Fail with a clear message rather than an AttributeError
                # from dereferencing ``.value`` on None.
                raise ValueError(
                    f"Unknown anonymization type {rule['type']!r} "
                    f"for column {name!r}.")

            # Copy so the dictionary supplied by the caller is not mutated.
            col_config = dict(rule)
            col_config['type'] = anon_type.value
        else:
            anon_type = AnonymizerType.get_anonymizer_type(rule)
            if anon_type is None:
                # Not a known anonymizer name: fall back to a regex rule.
                col_config = {'type': AnonymizerType.REGEX.value,
                              'regex': rule}
            else:
                col_config = {'type': anon_type.value}

        config[name] = col_config

    return config
4 changes: 4 additions & 0 deletions src/ydata/sdk/synthesizers/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ydata.sdk.synthesizers._models.status import PrepareState, Status, TrainingState
from ydata.sdk.synthesizers._models.synthesizer import Synthesizer as mSynthesizer
from ydata.sdk.synthesizers._models.synthesizers_list import SynthesizersList
from ydata.sdk.synthesizers.anonymizer import build_and_validate_anonimization
from ydata.sdk.utils.model_mixin import ModelFactoryMixin


Expand Down Expand Up @@ -246,6 +247,9 @@ def _fit_from_datasource(
payload['type'] = str(datatype.value)

if anonymize is not None:
# process and validate the anonymization config provided by the end user
anonymize = build_and_validate_anonimization(
anonimyze=anonymize, cols=[col.name for col in X.metadata.columns])
payload["extraData"]["anonymize"] = anonymize
if condition_on is not None:
payload["extraData"]["condition_on"] = condition_on
Expand Down

0 comments on commit 70bcc82

Please sign in to comment.