Merge branch 'main' into docs-contribution-deanonymization-best-practices-with-openai
microsoft · Oct 9, 2024 · 9ca1f77 · 9ca1f77
2 parents e12d6a3 + 13ae328
commit 9ca1f77
Show file tree

Hide file tree

Showing 8 changed files with 74 additions and 16 deletions.
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/azure_ai_language.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/azure_ai_language.py
@@ -25,6 +25,7 @@ def __init__(
         ta_client: Optional["TextAnalyticsClient"] = None,
         azure_ai_key: Optional[str] = None,
         azure_ai_endpoint: Optional[str] = None,
+        **kwargs
     ):
         """
         Wrap the PII detection in Azure AI Language.
@@ -36,6 +37,7 @@ def __init__(
         the client will be created using the key and endpoint.
         :param azure_ai_key: Azure AI for language key
         :param azure_ai_endpoint: Azure AI for language endpoint
+        :param kwargs: Additional arguments required by the parent class
 
         For more info, see https://learn.microsoft.com/en-us/azure/ai-services/language-service/personally-identifiable-information/overview
         """  # noqa E501
@@ -45,6 +47,7 @@ def __init__(
             supported_language=supported_language,
             name="Azure AI Language PII",
             version="5.2.0",
+            **kwargs
         )
 
         is_available = bool(TextAnalyticsClient)

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py
@@ -27,8 +27,8 @@ class UsSsnRecognizer(PatternRecognizer):
         # "sec", # Task #603: Support keyphrases ("social sec")
         "ssn",
         "ssns",
-        "ssn#",
-        "ss#",
+        # "ssn#",  # iss:1452 - a # does not work with LemmaContextAwareEnhancer
+        # "ss#",  # iss:1452 - a # does not work with LemmaContextAwareEnhancer
         "ssid",
     ]
 

diff --git a/presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml b/presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml
@@ -0,0 +1,18 @@
+recognizer_registry:
+  global_regex_flags: 26
+  recognizers:
+    - name: MockAzureAiLanguageRecognizer
+      type: predefined
+      ta_client: "test" # This is a placeholder for testing purposes
+
+
+supported_languages:
+  - en
+default_score_threshold: 0.7
+
+nlp_configuration:
+  nlp_engine_name: spacy
+  models:
+    -
+      lang_code: en
+      model_name: en_core_web_lg
diff --git a/presidio-analyzer/tests/data/context_sentences_tests.txt b/presidio-analyzer/tests/data/context_sentences_tests.txt
@@ -8,16 +8,22 @@ IP_ADDRESS
 my ip: 192.168.0.1
 
 US_SSN
-my ssn is 078-051120 07805-1120
+my ssn is 078-051121
 
 US_SSN
 my social security number is 078051120
 
 US_SSN
-my social security number is 078-05-1120
+my social security number is 078-05-1121
 
 US_SSN
-my social security number is 078051120
+my social security number is 078051121
+
+US_SSN
+my ssns is 078-05-1121
+
+US_SSN
+my ssid is 078-05-1121
 
 PHONE_NUMBER
 my phone number is (425) 882-9090

diff --git a/presidio-analyzer/tests/test_analyzer_engine_provider.py b/presidio-analyzer/tests/test_analyzer_engine_provider.py
@@ -1,10 +1,13 @@
 import re
 from pathlib import Path
+from typing import List
 
-from presidio_analyzer import AnalyzerEngineProvider
-from presidio_analyzer.nlp_engine import SpacyNlpEngine
+from presidio_analyzer import AnalyzerEngineProvider, RecognizerResult
+from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 
 from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine
+from presidio_analyzer.predefined_recognizers import AzureAILanguageRecognizer
+
 
 def get_full_paths(analyzer_yaml, nlp_engine_yaml=None, recognizer_registry_yaml=None):
     this_path = Path(__file__).parent.absolute()
@@ -83,7 +86,9 @@ def test_analyzer_engine_provider_configuration_file():
     assert engine.nlp_engine.engine_name == "spacy"
 
 
-def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults(mandatory_recognizers):
+def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults(
+    mandatory_recognizers,
+):
     test_yaml, _, _ = get_full_paths("conf/test_analyzer_engine_missing_values.yaml")
     provider = AnalyzerEngineProvider(test_yaml)
     engine = provider.create_engine()
@@ -133,10 +138,6 @@ def test_analyzer_engine_provider_with_files_per_provider():
         recognizer_registry_conf_file=recognizer_registry_yaml,
     )
 
-    provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml,
-                                      nlp_engine_conf_file=nlp_engine_yaml,
-                                      recognizer_registry_conf_file=recognizer_registry_yaml)
-
     analyzer_engine = provider.create_engine()
 
     # assert analyzer instance is correct
@@ -153,3 +154,32 @@ def test_analyzer_engine_provider_with_files_per_provider():
     recognizer_registry = analyzer_engine.registry
     assert len(recognizer_registry.recognizers) == 6
     assert recognizer_registry.supported_languages == ["en", "es"]
+
+
+def test_analyzer_engine_provider_with_azure_ai_language():
+    analyzer_yaml, _, _ = get_full_paths(
+        "conf/test_azure_ai_language_reco.yaml",
+    )
+
+    class MockAzureAiLanguageRecognizer(AzureAILanguageRecognizer):
+        def analyze(
+            self,
+            text: str,
+            entities: List[str] = None,
+            nlp_artifacts: NlpArtifacts = None,
+        ) -> List[RecognizerResult]:
+            return [RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.9)]
+
+    provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml)
+
+    analyzer_engine = provider.create_engine()
+
+    azure_ai_recognizers = [
+        rec
+        for rec in analyzer_engine.registry.recognizers
+        if rec.name == "Azure AI Language PII"
+    ]
+
+    assert len(azure_ai_recognizers) == 1
+
+    assert len(analyzer_engine.analyze("This is a test", language="en")) > 0
diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py
@@ -70,9 +70,9 @@ def dataset(recognizers_map):
             raise ValueError(f"bad entity type {entity_type}")
 
         test_items.append((item, recognizer, [entity_type]))
-    # Currently we have 31 sentences, this is a sanity check
-    if not len(test_items) == 32:
-        raise ValueError(f"expected 31 context sentences but found {len(test_items)}")
+    # Currently we have 34 sentences, this is a sanity check
+    if not len(test_items) == 34:
+        raise ValueError(f"expected 34 context sentences but found {len(test_items)}")
 
     yield test_items
 

diff --git a/presidio-cli/README.md b/presidio-cli/README.md
@@ -126,7 +126,7 @@ presidio -d "ignore: |
   *.cfg" tests/
 
 # limit list of entities to CREDIT_CARD
-presidio-d "entities:
+presidio -d "entities:
   - CREDIT_CARD" tests/
 
 # equivalent to use -c parameter

diff --git a/presidio-structured/pyproject.toml b/presidio-structured/pyproject.toml
@@ -25,6 +25,7 @@ python = ">=3.8,<4.0"
 presidio-analyzer = ">=2.2"
 presidio-anonymizer = ">=2.2"
 pandas = ">=1.5.2"
+numpy = "<2.0.0"
 
 [tool.poetry.group.dev.dependencies]
 pip = "*"