From 70bcc820bba5931621ff30463f32318a0f03af23 Mon Sep 17 00:00:00 2001
From: Fabiana <30911746+fabclmnt@users.noreply.github.com>
Date: Fri, 6 Sep 2024 15:59:47 +0100
Subject: [PATCH] fix: validate and process provided anonym inputs (#122)

* fix: validate and process provided anonym inputs

- Validate anonymizer inputs and process in a format that is supported by integration and backend

* fix(linting): code formatting

* fix: typo for aux function

- typo
- reserved python word

* fix: linter issues

* fix(linting): code formatting

---------

Co-authored-by: Azory YData Bot <azory@ydata.ai>
---
 examples/synthesizers/anonymize_example.py | 11 ++++-
 src/ydata/sdk/synthesizers/anonymizer.py   | 47 ++++++++++++++++++++++
 src/ydata/sdk/synthesizers/synthesizer.py  |  4 ++
 3 files changed, 60 insertions(+), 2 deletions(-)
 create mode 100644 src/ydata/sdk/synthesizers/anonymizer.py

diff --git a/examples/synthesizers/anonymize_example.py b/examples/synthesizers/anonymize_example.py
index 8ae63ec8..17d3d74e 100644
--- a/examples/synthesizers/anonymize_example.py
+++ b/examples/synthesizers/anonymize_example.py
@@ -17,7 +17,7 @@ def main():
 
     # We initialize a regular synthesizer
     # As long as the synthesizer does not call `fit`, it exists only locally
-    synth = RegularSynthesizer()
+    synth = RegularSynthesizer(name="Titanic")
 
     # We define anonymization rules, which is a dictionary with format:
     # {column_name: anonymization_rule, ...}
@@ -28,10 +28,17 @@ def main():
         "Ticket": "[A-Z]{2}-[A-Z]{4}"
     }
 
+    # or a different option for anonymization configuration
+
+    rules = {
+        'Name': {'type': 'name'},
+        'Ticket': {'type': 'regex',
+                   'regex': '[A-Z]{2}-[A-Z]{4}'}
+    }
+
     # We train the synthesizer on our dataset
     synth.fit(
         X,
-        name="titanic_synthesizer",
         anonymize=rules
     )
 
diff --git a/src/ydata/sdk/synthesizers/anonymizer.py b/src/ydata/sdk/synthesizers/anonymizer.py
new file mode 100644
index 00000000..93ef3dec
--- /dev/null
+++ b/src/ydata/sdk/synthesizers/anonymizer.py
@@ -0,0 +1,47 @@
+"""
+    Validate and process the payload for the synthesizers anonymizer
+"""
+
+from ydata.datascience.common import AnonymizerType
+
+
+def build_and_validate_anonimization(anonimyze: dict, cols: list) -> dict:
+    """Validate an anonymization config and normalise it per column.
+
+    Every key must be in ``cols``. A dict rule needs a 'type' entry, which is
+    resolved through ``AnonymizerType``; any other rule is a known anonymizer
+    name or, failing that, a regex. Returns a new dict (input is not mutated);
+    raises ``Exception`` for unknown columns or a dict rule without 'type'.
+    """
+    if any(col not in cols for col in anonimyze):
+        # AnonymizationConfigurationError
+        raise Exception(
+            'The keys in your configuration must exactly match the column names in the provided dataset. Please check and update your inputs to ensure they align.')
+
+    config = {}
+    for col, rule in anonimyze.items():
+        if isinstance(rule, dict):
+            if 'type' not in rule:
+                raise Exception("""The provided configuration is not correct. Make sure that your anonymization config follow one of the following formats:
+
+                                {
+                                    'col_name': {'type': 'anonymization_method', kwargs**}
+
+                                } or
+
+                                {
+                                    'col_name': 'anonymization_method'
+                                }
+                                """)
+            # Copy the rule so the caller's dict is left untouched.
+            resolved = AnonymizerType.get_anonymizer_type(rule['type'])
+            config[col] = {**rule, 'type': resolved.value}
+            continue
+        # Plain rule: a recognised anonymizer name, otherwise a regex pattern.
+        resolved = AnonymizerType.get_anonymizer_type(rule)
+        if resolved is None:
+            config[col] = {'type': AnonymizerType.REGEX.value, 'regex': rule}
+        else:
+            config[col] = {'type': resolved.value}
+
+    return config
diff --git a/src/ydata/sdk/synthesizers/synthesizer.py b/src/ydata/sdk/synthesizers/synthesizer.py
index 5801eb0e..c2929ec8 100644
--- a/src/ydata/sdk/synthesizers/synthesizer.py
+++ b/src/ydata/sdk/synthesizers/synthesizer.py
@@ -26,6 +26,7 @@
 from ydata.sdk.synthesizers._models.status import PrepareState, Status, TrainingState
 from ydata.sdk.synthesizers._models.synthesizer import Synthesizer as mSynthesizer
 from ydata.sdk.synthesizers._models.synthesizers_list import SynthesizersList
+from ydata.sdk.synthesizers.anonymizer import build_and_validate_anonimization
 from ydata.sdk.utils.model_mixin import ModelFactoryMixin
 
 
@@ -246,6 +247,9 @@ def _fit_from_datasource(
         payload['type'] = str(datatype.value)
 
         if anonymize is not None:
+            # Validate and normalise the anonymization config provided by the end user
+            anonymize = build_and_validate_anonimization(
+                anonimyze=anonymize, cols=[col.name for col in X.metadata.columns])
             payload["extraData"]["anonymize"] = anonymize
         if condition_on is not None:
             payload["extraData"]["condition_on"] = condition_on