Skip to content

Commit

Permalink
Merge branch 'main' into docs-contribution-deanonymization-best-practices-with-openai
Browse files Browse the repository at this point in the history
  • Loading branch information
SharonHart authored Oct 9, 2024
2 parents e12d6a3 + 13ae328 commit 9ca1f77
Show file tree
Hide file tree
Showing 8 changed files with 74 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def __init__(
ta_client: Optional["TextAnalyticsClient"] = None,
azure_ai_key: Optional[str] = None,
azure_ai_endpoint: Optional[str] = None,
**kwargs
):
"""
Wrap the PII detection in Azure AI Language.
Expand All @@ -36,6 +37,7 @@ def __init__(
the client will be created using the key and endpoint.
:param azure_ai_key: Azure AI for language key
:param azure_ai_endpoint: Azure AI for language endpoint
:param kwargs: Additional arguments required by the parent class
For more info, see https://learn.microsoft.com/en-us/azure/ai-services/language-service/personally-identifiable-information/overview
""" # noqa E501
Expand All @@ -45,6 +47,7 @@ def __init__(
supported_language=supported_language,
name="Azure AI Language PII",
version="5.2.0",
**kwargs
)

is_available = bool(TextAnalyticsClient)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ class UsSsnRecognizer(PatternRecognizer):
# "sec", # Task #603: Support keyphrases ("social sec")
"ssn",
"ssns",
"ssn#",
"ss#",
# "ssn#", # iss:1452 - a # does not work with LemmaContextAwareEnhancer
# "ss#", # iss:1452 - a # does not work with LemmaContextAwareEnhancer
"ssid",
]

Expand Down
18 changes: 18 additions & 0 deletions presidio-analyzer/tests/conf/test_azure_ai_language_reco.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
recognizer_registry:
global_regex_flags: 26
recognizers:
- name: MockAzureAiLanguageRecognizer
type: predefined
ta_client: "test" # This is a placeholder for testing purposes


supported_languages:
- en
default_score_threshold: 0.7

nlp_configuration:
nlp_engine_name: spacy
models:
-
lang_code: en
model_name: en_core_web_lg
12 changes: 9 additions & 3 deletions presidio-analyzer/tests/data/context_sentences_tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,22 @@ IP_ADDRESS
my ip: 192.168.0.1

US_SSN
my ssn is 078-051120 07805-1120
my ssn is 078-051121

US_SSN
my social security number is 078051120

US_SSN
my social security number is 078-05-1120
my social security number is 078-05-1121

US_SSN
my social security number is 078051120
my social security number is 078051121

US_SSN
my ssns is 078-05-1121

US_SSN
my ssid is 078-05-1121

PHONE_NUMBER
my phone number is (425) 882-9090
Expand Down
44 changes: 37 additions & 7 deletions presidio-analyzer/tests/test_analyzer_engine_provider.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import re
from pathlib import Path
from typing import List

from presidio_analyzer import AnalyzerEngineProvider
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer import AnalyzerEngineProvider, RecognizerResult
from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts

from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine
from presidio_analyzer.predefined_recognizers import AzureAILanguageRecognizer


def get_full_paths(analyzer_yaml, nlp_engine_yaml=None, recognizer_registry_yaml=None):
this_path = Path(__file__).parent.absolute()
Expand Down Expand Up @@ -83,7 +86,9 @@ def test_analyzer_engine_provider_configuration_file():
assert engine.nlp_engine.engine_name == "spacy"


def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults(mandatory_recognizers):
def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults(
mandatory_recognizers,
):
test_yaml, _, _ = get_full_paths("conf/test_analyzer_engine_missing_values.yaml")
provider = AnalyzerEngineProvider(test_yaml)
engine = provider.create_engine()
Expand Down Expand Up @@ -133,10 +138,6 @@ def test_analyzer_engine_provider_with_files_per_provider():
recognizer_registry_conf_file=recognizer_registry_yaml,
)

provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml,
nlp_engine_conf_file=nlp_engine_yaml,
recognizer_registry_conf_file=recognizer_registry_yaml)

analyzer_engine = provider.create_engine()

# assert analyzer instance is correct
Expand All @@ -153,3 +154,32 @@ def test_analyzer_engine_provider_with_files_per_provider():
recognizer_registry = analyzer_engine.registry
assert len(recognizer_registry.recognizers) == 6
assert recognizer_registry.supported_languages == ["en", "es"]


def test_analyzer_engine_provider_with_azure_ai_language():
    """Build an engine from the Azure AI Language test config and exercise it.

    A stub subclass of ``AzureAILanguageRecognizer`` is declared first so no
    real Azure call is made when the engine analyzes text. The test then
    checks that exactly one recognizer named "Azure AI Language PII" ended
    up in the registry and that analyzing a short sentence yields at least
    one result.
    """

    # NOTE(review): the YAML config lists a recognizer with this exact class
    # name; presumably the registry loader resolves it by name - confirm.
    class MockAzureAiLanguageRecognizer(AzureAILanguageRecognizer):
        """Stub recognizer that always reports one fixed PERSON span."""

        def analyze(
            self,
            text: str,
            entities: List[str] = None,
            nlp_artifacts: NlpArtifacts = None,
        ) -> List[RecognizerResult]:
            # Canned result: the input text is ignored entirely.
            fixed_hit = RecognizerResult(
                entity_type="PERSON", start=0, end=4, score=0.9
            )
            return [fixed_hit]

    # Only the analyzer config path is needed; the other two are discarded.
    conf_path, _nlp_path, _registry_path = get_full_paths(
        "conf/test_azure_ai_language_reco.yaml",
    )

    engine = AnalyzerEngineProvider(
        analyzer_engine_conf_file=conf_path
    ).create_engine()

    # Collect every registered recognizer carrying the Azure recognizer name.
    matching = list(
        filter(
            lambda rec: rec.name == "Azure AI Language PII",
            engine.registry.recognizers,
        )
    )
    assert len(matching) == 1

    findings = engine.analyze("This is a test", language="en")
    assert len(findings) > 0
6 changes: 3 additions & 3 deletions presidio-analyzer/tests/test_context_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ def dataset(recognizers_map):
raise ValueError(f"bad entity type {entity_type}")

test_items.append((item, recognizer, [entity_type]))
# Currently we have 31 sentences, this is a sanity check
if not len(test_items) == 32:
raise ValueError(f"expected 31 context sentences but found {len(test_items)}")
# Currently we have 34 sentences, this is a sanity check
if not len(test_items) == 34:
raise ValueError(f"expected 34 context sentences but found {len(test_items)}")

yield test_items

Expand Down
2 changes: 1 addition & 1 deletion presidio-cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ presidio -d "ignore: |
*.cfg" tests/

# limit list of entities to CREDIT_CARD
presidio-d "entities:
presidio -d "entities:
- CREDIT_CARD" tests/

# equivalent to use -c parameter
Expand Down
1 change: 1 addition & 0 deletions presidio-structured/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ python = ">=3.8,<4.0"
presidio-analyzer = ">=2.2"
presidio-anonymizer = ">=2.2"
pandas = ">=1.5.2"
numpy = "<2.0.0"

[tool.poetry.group.dev.dependencies]
pip = "*"
Expand Down

0 comments on commit 9ca1f77

Please sign in to comment.