Skip to content

Commit

Permalink
Merge branch 'main' into niwilso/dicom/return-bboxes
Browse files Browse the repository at this point in the history
  • Loading branch information
omri374 authored Jul 13, 2023
2 parents 0a33452 + 0a4c76d commit 5569923
Show file tree
Hide file tree
Showing 15 changed files with 865 additions and 509 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from .it_vat_code import ItVatCodeRecognizer
from .it_identity_card_recognizer import ItIdentityCardRecognizer
from .it_passport_recognizer import ItPassportRecognizer
from .in_pan_recognizer import InPanRecognizer

NLP_RECOGNIZERS = {
"spacy": SpacyRecognizer,
Expand Down Expand Up @@ -71,4 +72,5 @@
"ItVatCodeRecognizer",
"ItIdentityCardRecognizer",
"ItPassportRecognizer",
"InPanRecognizer"
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from typing import Optional, List, Tuple

from presidio_analyzer import Pattern, PatternRecognizer


class InPanRecognizer(PatternRecognizer):
    """
    Recognize Indian Permanent Account Numbers ("PAN") using regex and context.

    A PAN is matched here as a ten-character alphanumeric token. Three
    patterns of decreasing strictness (and decreasing score) are tried, see
    ``PATTERNS``. This class itself defines no check-character validation.

    Reference: https://en.wikipedia.org/wiki/Permanent_account_number,
    https://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf

    :param patterns: List of patterns to be used by this recognizer;
        defaults to ``PATTERNS`` when omitted or empty
    :param context: List of context words to increase confidence in detection;
        defaults to ``CONTEXT`` when omitted or empty
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of (old, new) string tuples. Stored on the
        instance as ``replacement_pairs``; defaults to stripping dashes and
        spaces. Nothing inside this class reads the attribute.
    """

    # Three letters, one letter from a restricted set, one letter,
    # four digits and a trailing letter.
    _HIGH_REGEX = (
        r"\b([A-Za-z]{3}[AaBbCcFfGgHhJjLlPpTt]{1}[A-Za-z]{1}[0-9]{4}[A-Za-z]{1})\b"
    )
    # Same layout without the restriction on the fourth letter.
    _MEDIUM_REGEX = r"\b([A-Za-z]{5}[0-9]{4}[A-Za-z]{1})\b"
    # Any ten-character token of word characters or @#$%^?~- for which the
    # two lookaheads find a letter and a run of four digits. The lookaheads
    # use `.*?`, so they are not confined to the ten matched characters.
    _LOW_REGEX = r"\b((?=.*?[a-zA-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b"

    PATTERNS = [
        Pattern("PAN (High)", _HIGH_REGEX, 0.85),
        Pattern("PAN (Medium)", _MEDIUM_REGEX, 0.6),
        Pattern("PAN (Low)", _LOW_REGEX, 0.05),
    ]

    CONTEXT = ["permanent account number", "pan"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "en",
        supported_entity: str = "IN_PAN",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
    ):
        # Any falsy argument (None or an empty list) selects the default.
        self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns or self.PATTERNS,
            context=context or self.CONTEXT,
            supported_language=supported_language,
        )
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
TransformersRecognizer,
ItPassportRecognizer,
ItIdentityCardRecognizer,
InPanRecognizer,
)

logger = logging.getLogger("presidio-analyzer")
Expand Down Expand Up @@ -87,6 +88,7 @@ def load_predefined_recognizers(
AuAcnRecognizer,
AuTfnRecognizer,
AuMedicareRecognizer,
InPanRecognizer,
],
"es": [EsNifRecognizer],
"it": [
Expand Down
13 changes: 13 additions & 0 deletions presidio-analyzer/tests/data/context_sentences_tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,16 @@ Special NRIC numbers e.g. S0000001I that are numerically significant have been i
# Verify SG NRIC/FIN mixed case (e.g. lower case )
FIN
my fin is g3300299L

#Verify IN PAN in adjacent context words
IN_PAN
my pan is DJPMS1234Z amongst so many other things

#Verify IN PAN context words
IN_PAN
Typical tax filing identifier is known as PAN in India also known as permanent account number


#Verify IN PAN mixed case
IN_PAN
my PAN number is DJPMS1234Z
8 changes: 5 additions & 3 deletions presidio-analyzer/tests/test_context_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
IpRecognizer,
UsSsnRecognizer,
SgFinRecognizer,
InPanRecognizer,
)
from presidio_analyzer.nlp_engine import NlpArtifacts
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
Expand All @@ -32,6 +33,7 @@ def recognizers_map():
"US_BANK_NUMBER": UsBankRecognizer(),
"US_PASSPORT": UsPassportRecognizer(),
"FIN": SgFinRecognizer(),
"IN_PAN": InPanRecognizer(),
}
return rec_map

Expand Down Expand Up @@ -70,9 +72,9 @@ def dataset(recognizers_map):
raise ValueError(f"bad entity type {entity_type}")

test_items.append((item, recognizer, [entity_type]))
# Currently we have 28 sentences, this is a sanity check
if not len(test_items) == 28:
raise ValueError(f"expected 28 context sentences but found {len(test_items)}")
# Currently we have 31 sentences, this is a sanity check
if not len(test_items) == 31:
raise ValueError(f"expected 31 context sentences but found {len(test_items)}")

yield test_items

Expand Down
49 changes: 49 additions & 0 deletions presidio-analyzer/tests/test_in_pan_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import InPanRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Provide a default-configured InPanRecognizer, module-scoped."""
    pan_recognizer = InPanRecognizer()
    return pan_recognizer


@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names handed to ``recognizer.analyze``."""
    requested_entities = ["IN_PAN"]
    return requested_entities


@pytest.mark.parametrize(
    "text, expected_len, expected_position, expected_score",
    [
        # fmt: off
        # Fourth letter "S" is outside the high-confidence set -> medium score.
        ("AAASA1111R", 1, (0, 10), 0.6),
        # Fourth letter "P" is inside the high-confidence set -> high score.
        ("ABCPD1234Z", 1, (0, 10), 0.85),
        # Fourth letter "N" is outside the high-confidence set -> medium score.
        ("ABCND1234Z", 1, (0, 10), 0.6),
        # Ten characters with a letter and four digits, wrong layout -> low score.
        ("A1111DFSFS", 1, (0, 10), 0.05),
        # Only eight characters: nothing is expected, position/score are unused.
        ("ABCD1234", 0, (), ()),
        # PAN embedded in a longer sentence.
        ("My PAN number is ABBPM4567S with a lot of text beyond it", 1, (17, 27), 0.85),
        # fmt: on
    ],
)
def test_when_pan_in_text_then_all_pans_found(
    text,
    expected_len,
    expected_position,
    expected_score,
    recognizer,
    entities,
):
    """Check the number of results and the span/score of the first result."""
    results = recognizer.analyze(text, entities)

    assert len(results) == expected_len
    # The no-match case carries empty placeholders, so only inspect a
    # result when one was actually returned.
    if results:
        assert_result(
            results[0],
            entities[0],
            expected_position[0],
            expected_position[1],
            expected_score,
        )
4 changes: 2 additions & 2 deletions presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi
registry = mock_recognizer_registry
registry.load_predefined_recognizers()
recognizers = registry.get_recognizers(language="en", all_fields=True)
# 1 custom recognizer in english + 21 predefined
assert len(recognizers) == 1 + 21
# 1 custom recognizer in english + 22 predefined
assert len(recognizers) == 1 + 22


def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry):
Expand Down
25 changes: 24 additions & 1 deletion presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Handles the entire logic of the Presidio-anonymizer and text anonymizing."""
import logging
import re
from typing import List, Dict, Optional

from presidio_anonymizer.core import EngineBase
Expand Down Expand Up @@ -78,9 +79,13 @@ def anonymize(
analyzer_results
)

merged_results = self._merge_entities_with_whitespace_between(
text, analyzer_results
)

operators = self.__check_or_add_default_operator(operators)

return self._operate(text, analyzer_results, operators, OperatorType.Anonymize)
return self._operate(text, merged_results, operators, OperatorType.Anonymize)

def _remove_conflicts_and_get_text_manipulation_data(
self, analyzer_results: List[RecognizerResult]
Expand Down Expand Up @@ -139,6 +144,24 @@ def _remove_conflicts_and_get_text_manipulation_data(
)
return unique_text_metadata_elements

def _merge_entities_with_whitespace_between(
    self,
    text: str,
    analyzer_results: List[RecognizerResult]
) -> List[RecognizerResult]:
    """Merge adjacent entities of the same type separated only by spaces.

    Two consecutive results are merged when they share ``entity_type`` and
    the text between the end of the first and the start of the second is a
    non-empty run of space characters. The merged entry keeps every
    attribute of the later result except ``start``, which is taken from the
    earlier one. Chains of more than two results collapse into one entry.

    The input results are never modified: a merged entry is a shallow copy.
    Results that are not merged are returned as the same objects.

    NOTE(review): the gap is sliced as ``text[prev.end:result.start]``, so
    this presumably expects results ordered by position - confirm with the
    caller.

    :param text: The text the results refer to
    :param analyzer_results: Results to scan, in order
    :return: A new list with mergeable neighbours collapsed
    """
    import copy  # local import: keeps this block self-contained

    merged_results = []
    for result in analyzer_results:
        # The last emitted entry is always the candidate to merge with.
        prev_result = merged_results[-1] if merged_results else None
        if (
            prev_result is not None
            and prev_result.entity_type == result.entity_type
            # One or more spaces; `$` also tolerates one trailing newline.
            and re.search(r'^( )+$', text[prev_result.end:result.start])
        ):
            # Copy before widening the span so the caller's object is intact.
            merged = copy.copy(result)
            merged.start = prev_result.start
            # Replace the tail in place instead of an equality-based remove().
            merged_results[-1] = merged
        else:
            merged_results.append(result)
    return merged_results

def get_anonymizers(self) -> List[str]:
"""Return a list of supported anonymizers."""
names = [p for p in self.operators_factory.get_anonymizers().keys()]
Expand Down
79 changes: 79 additions & 0 deletions presidio-anonymizer/tests/test_anonymizer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,85 @@ def test_given_several_results_then_we_filter_them_and_get_correct_mocked_result
assert result.items[0].text == "text"


@pytest.mark.parametrize(
    # fmt: off
    "text, analyzer_results, expected",
    [
        # Two PERSON results separated by a single space: one replacement.
        (
            "My name is David Jones",
            [
                RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"),
                RecognizerResult(start=17, end=22, score=0.8, entity_type="PERSON"),
            ],
            EngineResult(
                text="My name is BIP",
                items=[
                    OperatorResult(11, 14, "PERSON", "BIP", "replace"),
                ]
            )
        ),
        # Two PERSON results separated by several spaces: one replacement.
        # NOTE(review): offsets 19-24 run past the 22-character literal shown
        # here; the text presumably has three spaces between the names and
        # lost them in rendering - confirm against the repository file.
        (
            "My name is David Jones",
            [
                RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"),
                RecognizerResult(start=19, end=24, score=0.8, entity_type="PERSON"),
            ],
            EngineResult(
                text="My name is BIP",
                items=[
                    OperatorResult(11, 14, "PERSON", "BIP", "replace"),
                ]
            )
        ),
        # The gap is ", " rather than spaces only: each result is replaced
        # separately.
        (
            "My name is Jones, David",
            [
                RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"),
                RecognizerResult(start=18, end=23, score=0.8, entity_type="PERSON"),
            ],
            EngineResult(
                text="My name is BIP, BIP",
                items=[
                    OperatorResult(11, 14, "PERSON", "BIP", "replace"),
                    OperatorResult(16, 19, "PERSON", "BIP", "replace"),
                ]
            )
        ),
        # Space-separated results of different entity types: each result is
        # replaced separately.
        (
            "The phone book said: Jones 212-555-5555",
            [
                RecognizerResult(start=21, end=26, score=0.8, entity_type="PERSON"),
                RecognizerResult(
                    start=27, end=39, score=0.8, entity_type="PHONE NUMBER"
                ),
            ],
            EngineResult(
                text="The phone book said: BIP BEEP",
                items=[
                    OperatorResult(21, 24, "PERSON", "BIP", "replace"),
                    OperatorResult(25, 29, "PHONE NUMBER", "BEEP", "replace"),
                ]
            )
        ),
    ]
    # fmt: on
)
def test_given_sorted_analyzer_results_merge_entities_separated_by_white_space(
    text, analyzer_results, expected
):
    """Anonymize each case and compare the output text and operator items."""
    engine = AnonymizerEngine()
    result = engine.anonymize(
        text,
        analyzer_results,
        operators={
            "PERSON": OperatorConfig("replace", {"new_value": "BIP"}),
            "PHONE NUMBER": OperatorConfig("replace", {"new_value": "BEEP"}),
        },
    )
    assert result.text == expected.text
    # Both item lists are sorted so the comparison ignores their order.
    assert sorted(result.items) == sorted(expected.items)


def _operate(
text: str,
text_metadata: List[PIIEntity],
Expand Down
1 change: 1 addition & 0 deletions presidio-image-redactor/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ presidio-analyzer = ">=2.2.0"
pillow = ">=9.0"
pydicom = ">=2.3.0"
pypng = ">=0.20220715.0"
python-gdcm = ">=3.0.22"
matplotlib = "==3.6.2"
typing-extensions = "*"

Expand Down
Loading

0 comments on commit 5569923

Please sign in to comment.