Skip to content

Commit

Permalink
Initial logic check for merging 2 entities (#1092)
Browse files Browse the repository at this point in the history
  • Loading branch information
gokullan authored Jul 9, 2023
1 parent a661037 commit d51ff56
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 1 deletion.
25 changes: 24 additions & 1 deletion presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Handles the entire logic of the Presidio-anonymizer and text anonymizing."""
import copy
import logging
import re
from typing import List, Dict, Optional

from presidio_anonymizer.core import EngineBase
Expand Down Expand Up @@ -78,9 +79,13 @@ def anonymize(
analyzer_results
)

merged_results = self._merge_entities_with_whitespace_between(
text, analyzer_results
)

operators = self.__check_or_add_default_operator(operators)

return self._operate(text, analyzer_results, operators, OperatorType.Anonymize)
return self._operate(text, merged_results, operators, OperatorType.Anonymize)

def _remove_conflicts_and_get_text_manipulation_data(
self, analyzer_results: List[RecognizerResult]
Expand Down Expand Up @@ -139,6 +144,24 @@ def _remove_conflicts_and_get_text_manipulation_data(
)
return unique_text_metadata_elements

def _merge_entities_with_whitespace_between(
    self,
    text: str,
    analyzer_results: List[RecognizerResult]
) -> List[RecognizerResult]:
    """Merge adjacent entities of the same type separated only by spaces.

    Each result is compared with the entry kept immediately before it. When
    both share the same ``entity_type`` and the text between them
    (``text[previous.end:result.start]``) consists of one or more space
    characters and nothing else, the two are collapsed into a single entity
    that starts where the previous one starts and otherwise carries the
    later result's attributes (end, score, ...).

    Only neighbouring results are compared, so results that are not given
    in text order will not be merged.

    :param text: the text the results' ``start``/``end`` offsets refer to.
    :param analyzer_results: the results to scan. Neither the list nor the
        objects in it are modified.
    :return: a new list. Results that were not merged are the original
        objects; merged results are shallow copies with an adjusted
        ``start``.
    """
    merged_results = []
    for result in analyzer_results:
        previous = merged_results[-1] if merged_results else None
        if (
            previous is not None
            and previous.entity_type == result.entity_type
            # fullmatch instead of search with '^...$': '$' would also
            # accept a trailing newline, so a " \n" gap would be merged
            # while a "\n " gap would not.
            and re.fullmatch(r" +", text[previous.end:result.start])
        ):
            # Work on a copy so the caller's analyzer results keep their
            # original offsets.
            merged = copy.copy(result)
            merged.start = previous.start
            # The previous entry is always the last one kept; replace it in
            # place rather than searching the list by equality.
            merged_results[-1] = merged
        else:
            merged_results.append(result)
    return merged_results

def get_anonymizers(self) -> List[str]:
"""Return a list of supported anonymizers."""
names = [p for p in self.operators_factory.get_anonymizers().keys()]
Expand Down
79 changes: 79 additions & 0 deletions presidio-anonymizer/tests/test_anonymizer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,85 @@ def test_given_several_results_then_we_filter_them_and_get_correct_mocked_result
assert result.items[0].text == "text"


@pytest.mark.parametrize(
    # fmt: off
    "text, analyzer_results, expected",
    [
        # Two PERSON entities separated by a single space: merged into one.
        (
            "My name is David Jones",
            [
                RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"),
                RecognizerResult(start=17, end=22, score=0.8, entity_type="PERSON"),
            ],
            EngineResult(
                text="My name is BIP",
                items=[
                    OperatorResult(11, 14, "PERSON", "BIP", "replace"),
                ]
            )
        ),
        # Two PERSON entities separated by several spaces: still merged.
        # The text needs three spaces so that offsets 19-24 cover "Jones".
        (
            "My name is David   Jones",
            [
                RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"),
                RecognizerResult(start=19, end=24, score=0.8, entity_type="PERSON"),
            ],
            EngineResult(
                text="My name is BIP",
                items=[
                    OperatorResult(11, 14, "PERSON", "BIP", "replace"),
                ]
            )
        ),
        # Same entity type but a comma in the gap: kept as two entities.
        (
            "My name is Jones, David",
            [
                RecognizerResult(start=11, end=16, score=0.8, entity_type="PERSON"),
                RecognizerResult(start=18, end=23, score=0.8, entity_type="PERSON"),
            ],
            EngineResult(
                text="My name is BIP, BIP",
                items=[
                    OperatorResult(11, 14, "PERSON", "BIP", "replace"),
                    OperatorResult(16, 19, "PERSON", "BIP", "replace"),
                ]
            )
        ),
        # Different entity types separated by a space: kept as two entities.
        (
            "The phone book said: Jones 212-555-5555",
            [
                RecognizerResult(start=21, end=26, score=0.8, entity_type="PERSON"),
                RecognizerResult(
                    start=27, end=39, score=0.8, entity_type="PHONE NUMBER"
                ),
            ],
            EngineResult(
                text="The phone book said: BIP BEEP",
                items=[
                    OperatorResult(21, 24, "PERSON", "BIP", "replace"),
                    OperatorResult(25, 29, "PHONE NUMBER", "BEEP", "replace"),
                ]
            )
        ),
    ]
    # fmt: on
)
def test_given_sorted_analyzer_results_merge_entities_separated_by_white_space(
    text, analyzer_results, expected
):
    """Check that anonymize merges same-type entities separated only by spaces.

    Each case runs ``AnonymizerEngine.anonymize`` with "replace" operators
    for PERSON and PHONE NUMBER and compares the resulting text and
    operator items with the expected ``EngineResult``.
    """
    engine = AnonymizerEngine()
    result = engine.anonymize(
        text,
        analyzer_results,
        operators={
            "PERSON": OperatorConfig("replace", {"new_value": "BIP"}),
            "PHONE NUMBER": OperatorConfig("replace", {"new_value": "BEEP"}),
        },
    )
    assert result.text == expected.text
    # Both sides are sorted so the comparison does not depend on item order.
    assert sorted(result.items) == sorted(expected.items)


def _operate(
text: str,
text_metadata: List[PIIEntity],
Expand Down

0 comments on commit d51ff56

Please sign in to comment.