V2 anonymizer improvements (microsoft#547)

* Add readme file for image redactor; fix anonymizer readme
* Fix PR comments
* Improve run time of the analyzer results loop
* Add note and some more tests

Co-authored-by: Nava Vaisman Levy <[email protected]>
gfog-floqast · Mar 1, 2021 · 6baee6e · 6baee6e
1 parent 0f853ca
commit 6baee6e
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 7 deletions.
diff --git a/presidio-anonymizer/presidio_anonymizer/entities/analyzer_results.py b/presidio-anonymizer/presidio_anonymizer/entities/analyzer_results.py
@@ -54,11 +54,15 @@ def _remove_conflicts(self):
         :return: List
         """
         unique_elements = []
+        # This list contains all elements which we need to check a single result
+        # against. If a result is dropped, it can also be dropped from this list
+        # since it is intersecting with another result and we selected the other one.
+        other_elements = AnalyzerResults(self)
         for result_a in self:
-            other_elements = AnalyzerResults(self)
             other_elements.remove(result_a)
             if not any([result_a.has_conflict(other_element) for other_element in
                         other_elements]):
+                other_elements.append(result_a)
                 unique_elements.append(result_a)
             else:
                 self.logger.debug(

diff --git a/presidio-anonymizer/tests/test_analyzer_results.py b/presidio-anonymizer/tests/test_analyzer_results.py
@@ -11,19 +11,21 @@ def test_given_conflicting_analyzer_results_then_none_conflicting_results_return
     analyze_results = AnonymizerRequest.handle_analyzer_results_json(payload)
     assert len(analyze_results) == len(payload.get("analyzer_results"))
     sorted_results = analyze_results.to_sorted_unique_results()
-    assert len(sorted_results) == 2
-    assert list(sorted_results)[0].start < list(sorted_results)[1].start
-    assert list(sorted_results)[0].end < list(sorted_results)[1].end
+    assert len(sorted_results) == 4
+    for index in range(len(sorted_results) - 1):
+        assert list(sorted_results)[index].start < list(sorted_results)[index + 1].start
+        assert list(sorted_results)[index].end < list(sorted_results)[index + 1].end
 
 
 def test_given_conflict_analyzer_results_then_reversed_none_conflict_list_returned():
     payload = get_dup_payload()
     analyze_results = AnonymizerRequest.handle_analyzer_results_json(payload)
     assert len(analyze_results) == len(payload.get("analyzer_results"))
     sorted_results = analyze_results.to_sorted_unique_results(True)
-    assert len(sorted_results) == 2
-    assert list(sorted_results)[1].start < list(sorted_results)[0].start
-    assert list(sorted_results)[1].end < list(sorted_results)[0].end
+    assert len(sorted_results) == 4
+    for index in range(len(sorted_results) - 1):
+        assert list(sorted_results)[index].start > list(sorted_results)[index + 1].start
+        assert list(sorted_results)[index].end > list(sorted_results)[index + 1].end
 
 
 def get_dup_payload():
@@ -35,5 +37,8 @@ def get_dup_payload():
             {"start": 24, "end": 28, "score": 0.9, "entity_type": "FIRST_NAME"},
             {"start": 29, "end": 32, "score": 0.6, "entity_type": "LAST_NAME"},
             {"start": 24, "end": 30, "score": 0.8, "entity_type": "NAME"},
+            {"start": 18, "end": 32, "score": 0.8, "entity_type": "BLA"},
+            {"start": 23, "end": 35, "score": 0.8, "entity_type": "BLA"},
+            {"start": 28, "end": 36, "score": 0.8, "entity_type": "BLA"},
         ],
     }