From 70bcc820bba5931621ff30463f32318a0f03af23 Mon Sep 17 00:00:00 2001 From: Fabiana <30911746+fabclmnt@users.noreply.github.com> Date: Fri, 6 Sep 2024 15:59:47 +0100 Subject: [PATCH] fix: validate and process provided anonym inputs (#122) * fix: validate and process provided anonym inputs - Validate anonymizer inputs and process in a format that is supported by integration and backend * fix(linting): code formatting * fix: typo for aux function - typo - reserved python word * fix: linter issues * fix(linting): code formatting --------- Co-authored-by: Azory YData Bot --- examples/synthesizers/anonymize_example.py | 11 ++++- src/ydata/sdk/synthesizers/anonymizer.py | 47 ++++++++++++++++++++++ src/ydata/sdk/synthesizers/synthesizer.py | 4 ++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 src/ydata/sdk/synthesizers/anonymizer.py diff --git a/examples/synthesizers/anonymize_example.py b/examples/synthesizers/anonymize_example.py index 8ae63ec8..17d3d74e 100644 --- a/examples/synthesizers/anonymize_example.py +++ b/examples/synthesizers/anonymize_example.py @@ -17,7 +17,7 @@ def main(): # We initialize a regular synthesizer # As long as the synthesizer does not call `fit`, it exists only locally - synth = RegularSynthesizer() + synth = RegularSynthesizer(name="Titanic") # We define anonymization rules, which is a dictionary with format: # {column_name: anonymization_rule, ...} @@ -28,10 +28,17 @@ def main(): "Ticket": "[A-Z]{2}-[A-Z]{4}" } + # or a different option for anonymization configuration + + rules = { + 'Name': {'type': 'name'}, + 'Ticket': {'type': 'regex', + 'regex': '[A-Z]{2}-[A-Z]{4}'} + } + # We train the synthesizer on our dataset synth.fit( X, - name="titanic_synthesizer", anonymize=rules ) diff --git a/src/ydata/sdk/synthesizers/anonymizer.py b/src/ydata/sdk/synthesizers/anonymizer.py new file mode 100644 index 00000000..93ef3dec --- /dev/null +++ b/src/ydata/sdk/synthesizers/anonymizer.py @@ -0,0 +1,47 @@ +""" + Validate and 
process the payload for the synthesizers anonymizer +""" + +from ydata.datascience.common import AnonymizerType + + +def build_and_validate_anonimization(anonimyze: dict, cols: list) -> dict: + isnested = any(isinstance(i, dict) for i in anonimyze.values()) + + if not all([True if k in cols else False for k in list(anonimyze.keys())]): + # AnonymizationConfigurationError + raise Exception( + 'The keys in your configuration must exactly match the column names in the provided dataset. Please check and update your inputs to ensure they align.') + + if isnested: + # Validate the format here. + for k, v in anonimyze.items(): + if 'type' not in list(v.keys()): + raise Exception("""The provided configuration is not correct. Make sure that your anonymization config follow one of the following formats: + + { + 'col_name': {'type': 'anonymization_method', kwargs**} + + } or + + { + 'col_name: 'anonymization_method' + } + """) + else: + anon_type = anonimyze[k]['type'] + anonimyze[k]['type'] = AnonymizerType.get_anonymizer_type( + anon_type).value + config = anonimyze + else: + config = {} + for k, v in anonimyze.items(): + print(k, v) + if AnonymizerType.get_anonymizer_type(v) is None: + col_config = {'type': AnonymizerType.REGEX.value, 'regex': v} + else: + col_config = {'type': AnonymizerType.get_anonymizer_type(v).value} + + config[k] = col_config + + return config diff --git a/src/ydata/sdk/synthesizers/synthesizer.py b/src/ydata/sdk/synthesizers/synthesizer.py index 5801eb0e..c2929ec8 100644 --- a/src/ydata/sdk/synthesizers/synthesizer.py +++ b/src/ydata/sdk/synthesizers/synthesizer.py @@ -26,6 +26,7 @@ from ydata.sdk.synthesizers._models.status import PrepareState, Status, TrainingState from ydata.sdk.synthesizers._models.synthesizer import Synthesizer as mSynthesizer from ydata.sdk.synthesizers._models.synthesizers_list import SynthesizersList +from ydata.sdk.synthesizers.anonymizer import build_and_validate_anonimization from ydata.sdk.utils.model_mixin import 
ModelFactoryMixin @@ -246,6 +247,9 @@ def _fit_from_datasource( payload['type'] = str(datatype.value) if anonymize is not None: + # process and validate the anonymization config shared by the end user + anonymize = build_and_validate_anonimization( + anonimyze=anonymize, cols=[col.name for col in X.metadata.columns]) payload["extraData"]["anonymize"] = anonymize if condition_on is not None: payload["extraData"]["condition_on"] = condition_on