From 89ccadb8fce13a1ea73488c55ea801ced91019c2 Mon Sep 17 00:00:00 2001 From: claesmk <44329220+claesmk@users.noreply.github.com> Date: Sun, 29 Sep 2024 07:27:51 -0400 Subject: [PATCH 1/4] Update US_SSN CONTEXT and unit test (#1455) --- .../predefined_recognizers/us_ssn_recognizer.py | 4 ++-- .../tests/data/context_sentences_tests.txt | 12 +++++++++--- presidio-analyzer/tests/test_context_support.py | 6 +++--- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py index 7580d6526..8ba04fdd4 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/us_ssn_recognizer.py @@ -27,8 +27,8 @@ class UsSsnRecognizer(PatternRecognizer): # "sec", # Task #603: Support keyphrases ("social sec") "ssn", "ssns", - "ssn#", - "ss#", + # "ssn#", # iss:1452 - a # does not work with LemmaContextAwareEnhancer + # "ss#", # iss:1452 - a # does not work with LemmaContextAwareEnhancer "ssid", ] diff --git a/presidio-analyzer/tests/data/context_sentences_tests.txt b/presidio-analyzer/tests/data/context_sentences_tests.txt index 0968593ec..2a961933e 100644 --- a/presidio-analyzer/tests/data/context_sentences_tests.txt +++ b/presidio-analyzer/tests/data/context_sentences_tests.txt @@ -8,16 +8,22 @@ IP_ADDRESS my ip: 192.168.0.1 US_SSN -my ssn is 078-051120 07805-1120 +my ssn is 078-051121 US_SSN my social security number is 078051120 US_SSN -my social security number is 078-05-1120 +my social security number is 078-05-1121 US_SSN -my social security number is 078051120 +my social security number is 078051121 + +US_SSN +my ssns is 078-05-1121 + +US_SSN +my ssid is 078-05-1121 PHONE_NUMBER my phone number is (425) 882-9090 diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index 328f54ee4..3458ae6d6 100644 --- a/presidio-analyzer/tests/test_context_support.py +++ b/presidio-analyzer/tests/test_context_support.py @@ -70,9 +70,9 @@ def dataset(recognizers_map): raise ValueError(f"bad entity type {entity_type}") test_items.append((item, recognizer, [entity_type])) - # Currently we have 31 sentences, this is a sanity check - if not len(test_items) == 32: - raise ValueError(f"expected 31 context sentences but found {len(test_items)}") + # Currently we have 34 sentences, this is a sanity check + if not len(test_items) == 34: + raise ValueError(f"expected 34 context sentences but found {len(test_items)}") yield test_items From b9f6cbaf05583e65cd35bdb9a22171a7f8d21b71 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 2 Oct 2024 10:45:05 +0300 Subject: [PATCH 2/4] Bug/azure ai language context (#1458) --- .../azure_ai_language.py | 3 ++ .../conf/test_azure_ai_language_reco.yaml | 18 ++++++++ .../tests/test_analyzer_engine_provider.py | 44 ++++++++++++++++--- 3 files changed, 58 insertions(+), 7 deletions(-) create mode 100644 presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/azure_ai_language.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/azure_ai_language.py index fd306cf9d..17c60f613 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/azure_ai_language.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/azure_ai_language.py @@ -25,6 +25,7 @@ def __init__( ta_client: Optional["TextAnalyticsClient"] = None, azure_ai_key: Optional[str] = None, azure_ai_endpoint: Optional[str] = None, + **kwargs ): """ Wrap the PII detection in Azure AI Language. @@ -36,6 +37,7 @@ def __init__( the client will be created using the key and endpoint. :param azure_ai_key: Azure AI for language key :param azure_ai_endpoint: Azure AI for language endpoint + :param kwargs: Additional arguments required by the parent class For more info, see https://learn.microsoft.com/en-us/azure/ai-services/language-service/personally-identifiable-information/overview """ # noqa E501 @@ -45,6 +47,7 @@ def __init__( supported_language=supported_language, name="Azure AI Language PII", version="5.2.0", + **kwargs ) is_available = bool(TextAnalyticsClient) diff --git a/presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml b/presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml new file mode 100644 index 000000000..3f7fdd7c6 --- /dev/null +++ b/presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml @@ -0,0 +1,18 @@ +recognizer_registry: + global_regex_flags: 26 + recognizers: + - name: MockAzureAiLanguageRecognizer + type: predefined + ta_client: "test" # This is a placeholder for testing purposes + + +supported_languages: + - en +default_score_threshold: 0.7 + +nlp_configuration: + nlp_engine_name: spacy + models: + - + lang_code: en + model_name: en_core_web_lg \ No newline at end of file diff --git a/presidio-analyzer/tests/test_analyzer_engine_provider.py b/presidio-analyzer/tests/test_analyzer_engine_provider.py index 293ba2ab3..042fe5f67 100644 --- a/presidio-analyzer/tests/test_analyzer_engine_provider.py +++ b/presidio-analyzer/tests/test_analyzer_engine_provider.py @@ -1,10 +1,13 @@ import re from pathlib import Path +from typing import List -from presidio_analyzer import AnalyzerEngineProvider -from presidio_analyzer.nlp_engine import SpacyNlpEngine +from presidio_analyzer import AnalyzerEngineProvider, RecognizerResult +from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine +from presidio_analyzer.predefined_recognizers import AzureAILanguageRecognizer + def get_full_paths(analyzer_yaml, nlp_engine_yaml=None, recognizer_registry_yaml=None): this_path = Path(__file__).parent.absolute() @@ -83,7 +86,9 @@ def test_analyzer_engine_provider_configuration_file(): assert engine.nlp_engine.engine_name == "spacy" -def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults(mandatory_recognizers): +def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults( + mandatory_recognizers, +): test_yaml, _, _ = get_full_paths("conf/test_analyzer_engine_missing_values.yaml") provider = AnalyzerEngineProvider(test_yaml) engine = provider.create_engine() @@ -133,10 +138,6 @@ def test_analyzer_engine_provider_with_files_per_provider(): recognizer_registry_conf_file=recognizer_registry_yaml, ) - provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml, - nlp_engine_conf_file=nlp_engine_yaml, - recognizer_registry_conf_file=recognizer_registry_yaml) - analyzer_engine = provider.create_engine() # assert analyzer instance is correct @@ -153,3 +154,32 @@ def test_analyzer_engine_provider_with_files_per_provider(): recognizer_registry = analyzer_engine.registry assert len(recognizer_registry.recognizers) == 6 assert recognizer_registry.supported_languages == ["en", "es"] + + +def test_analyzer_engine_provider_with_azure_ai_language(): + analyzer_yaml, _, _ = get_full_paths( + "conf/test_azure_ai_language_reco.yaml", + ) + + class MockAzureAiLanguageRecognizer(AzureAILanguageRecognizer): + def analyze( + self, + text: str, + entities: List[str] = None, + nlp_artifacts: NlpArtifacts = None, + ) -> List[RecognizerResult]: + return [RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.9)] + + provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml) + + analyzer_engine = provider.create_engine() + + azure_ai_recognizers = [ + rec + for rec in analyzer_engine.registry.recognizers + if rec.name == "Azure AI Language PII" + ] + + assert len(azure_ai_recognizers) == 1 + + assert len(analyzer_engine.analyze("This is a test", language="en")) > 0 From 49f2b6a3bd701e40f7306bdb13bb5dd441e847e8 Mon Sep 17 00:00:00 2001 From: Sudarsan Balaji Date: Thu, 3 Oct 2024 07:20:50 +0100 Subject: [PATCH 3/4] Fix space (#1459) --- presidio-cli/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-cli/README.md b/presidio-cli/README.md index 10259d881..f44444299 100644 --- a/presidio-cli/README.md +++ b/presidio-cli/README.md @@ -126,7 +126,7 @@ presidio -d "ignore: | *.cfg" tests/ # limit list of entities to CREDIT_CARD -presidio-d "entities: +presidio -d "entities: - CREDIT_CARD" tests/ # equivalent to use -c parameter From 13ae328a07e41f6aad610e52814f5d8241e52897 Mon Sep 17 00:00:00 2001 From: Sharon Hart Date: Tue, 8 Oct 2024 22:07:15 +0300 Subject: [PATCH 4/4] Fix presidio-structured build - lock numpy version (#1465) * Update pyproject.toml * Update pyproject.toml --- presidio-structured/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-structured/pyproject.toml b/presidio-structured/pyproject.toml index b1e992316..8d9ff1bde 100644 --- a/presidio-structured/pyproject.toml +++ b/presidio-structured/pyproject.toml @@ -25,6 +25,7 @@ python = ">=3.8,<4.0" presidio-analyzer = ">=2.2" presidio-anonymizer = ">=2.2" pandas = ">=1.5.2" +numpy = "<2.0.0" [tool.poetry.group.dev.dependencies] pip = "*"