microsoft · niwilso · Aug 21, 2023 · Jul 5, 2023 · Jul 5, 2023 · Jul 6, 2023
diff --git a/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py b/presidio-image-redactor/presidio_image_redactor/dicom_image_redactor_engine.py
@@ -16,6 +16,7 @@
 from presidio_image_redactor import ImageRedactorEngine
 from presidio_image_redactor import ImageAnalyzerEngine  # noqa: F401
 from presidio_analyzer import PatternRecognizer
+from presidio_image_redactor.entities import ImageRecognizerResult
 
 
 class DicomImageRedactorEngine(ImageRedactorEngine):
@@ -30,6 +31,7 @@ def redact(
         fill: str = "contrast",
         padding_width: int = 25,
         crop_ratio: float = 0.75,
+        redact_approach: Union[str, PatternRecognizer] = "metadata",
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ):
@@ -43,6 +45,8 @@ def redact(
         :param padding_width: Padding width to use when running OCR.
         :param crop_ratio: Portion of image to consider when selecting
         most common pixel value as the background color value.
+        :param redact_approach: What approach to use when redacting
+        ("default", "metadata", or a PatternRecognizer object).
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -71,17 +75,9 @@ def redact(
             loaded_image = Image.open(png_filepath)
             image = self._add_padding(loaded_image, is_greyscale, padding_width)
 
-        # Create custom recognizer using DICOM metadata
-        original_metadata, is_name, is_patient = self._get_text_metadata(instance)
-        phi_list = self._make_phi_list(original_metadata, is_name, is_patient)
-        deny_list_recognizer = PatternRecognizer(
-            supported_entity="PERSON", deny_list=phi_list
-        )
-        analyzer_results = self.image_analyzer_engine.analyze(
-            image,
-            ocr_kwargs=ocr_kwargs,
-            ad_hoc_recognizers=[deny_list_recognizer],
-            **text_analyzer_kwargs,
+        # Detect PII
+        analyzer_results = self._get_analyzer_results(
+            image, instance, redact_approach, ocr_kwargs, **text_analyzer_kwargs
         )
 
         # Redact all bounding boxes from DICOM file
@@ -102,6 +98,7 @@ def redact_from_file(
         padding_width: int = 25,
         crop_ratio: float = 0.75,
         fill: str = "contrast",
+        redact_approach: Union[str, PatternRecognizer] = "metadata",
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ) -> None:
@@ -115,6 +112,8 @@ def redact_from_file(
         :param padding_width : Padding width to use when running OCR.
         :param fill: Color setting to use for redaction box
         ("contrast" or "background").
+        :param redact_approach: What approach to use when redacting
+        ("default", "metadata", or a PatternRecognizer object).
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -138,6 +137,7 @@ def redact_from_file(
             crop_ratio=crop_ratio,
             fill=fill,
             padding_width=padding_width,
+            redact_approach=redact_approach,
             overwrite=True,
             dst_parent_dir=".",
             ocr_kwargs=ocr_kwargs,
@@ -155,6 +155,7 @@ def redact_from_directory(
         padding_width: int = 25,
         crop_ratio: float = 0.75,
         fill: str = "contrast",
+        redact_approach: Union[str, PatternRecognizer] = "metadata",
         ocr_kwargs: Optional[dict] = None,
         **text_analyzer_kwargs,
     ) -> None:
@@ -170,6 +171,8 @@ def redact_from_directory(
         most common pixel value as the background color value.
         :param fill: Color setting to use for redaction box
         ("contrast" or "background").
+        :param redact_approach: What approach to use when redacting
+        ("default", "metadata", or a PatternRecognizer object).
         :param ocr_kwargs: Additional params for OCR methods.
         :param text_analyzer_kwargs: Additional values for the analyze method
         in AnalyzerEngine.
@@ -193,6 +196,7 @@ def redact_from_directory(
             crop_ratio=crop_ratio,
             fill=fill,
             padding_width=padding_width,
+            redact_approach=redact_approach,
             overwrite=True,
             dst_parent_dir=".",
             ocr_kwargs=ocr_kwargs,
@@ -733,12 +737,74 @@ def _add_redact_box(
 
         return redacted_instance
 
+    def _get_analyzer_results(
+        self,
+        image: PIL.PngImagePlugin.PngImageFile,
+        instance: pydicom.dataset.FileDataset,
+        redact_approach: Union[str, PatternRecognizer],
+        ocr_kwargs: Optional[dict],
+        **text_analyzer_kwargs
+    ) -> List[ImageRecognizerResult]:
+        """Analyze image with selected redaction approach.
+
+        :param image: DICOM pixel data as PIL image.
+        :param instance: DICOM instance (with metadata).
+        :param redact_approach: What approach to use when redacting
+        ("default", "metadata", or a PatternRecognizer object).
+        Strings are matched case-insensitively.
+        :param ocr_kwargs: Additional params for OCR methods.
+        :param text_analyzer_kwargs: Additional values for the analyze method
+        in AnalyzerEngine (e.g., allow_list).
+
+        :return: Analyzer results.
+        :raises ValueError: If redact_approach is neither a supported string
+        nor a PatternRecognizer object.
+        """
+        invalid_msg = (
+            "Please enter valid string or PatternRecognizer object "
+            "for redact_approach"
+        )
+
+        # Pick the ad-hoc recognizers for the chosen approach. isinstance is
+        # used (rather than type() ==) so subclasses are accepted too.
+        if isinstance(redact_approach, PatternRecognizer):
+            # Use passed in recognizer
+            analyzer_kwargs = {"ad_hoc_recognizers": [redact_approach]}
+        elif not isinstance(redact_approach, str):
+            raise ValueError(invalid_msg)
+        elif redact_approach.lower() == "default":
+            # Use default redactor (no ad-hoc recognizer is passed)
+            analyzer_kwargs = {}
+        elif redact_approach.lower() == "metadata":
+            # Create custom recognizer using DICOM metadata
+            original_metadata, is_name, is_patient = self._get_text_metadata(instance)
+            phi_list = self._make_phi_list(original_metadata, is_name, is_patient)
+            deny_list_recognizer = PatternRecognizer(
+                supported_entity="PERSON", deny_list=phi_list
+            )
+            analyzer_kwargs = {"ad_hoc_recognizers": [deny_list_recognizer]}
+        else:
+            raise ValueError(invalid_msg)
+
+        # Caller-supplied kwargs are expanded separately so a duplicate
+        # ad_hoc_recognizers key still raises TypeError instead of being
+        # silently overridden.
+        analyzer_results = self.image_analyzer_engine.analyze(
+            image,
+            ocr_kwargs=ocr_kwargs,
+            **analyzer_kwargs,
+            **text_analyzer_kwargs,
+        )
+
+        return analyzer_results
+
     def _redact_single_dicom_image(
         self,
         dcm_path: str,
         crop_ratio: float,
         fill: str,
         padding_width: int,
+        redact_approach: Union[str, PatternRecognizer],
         overwrite: bool,
         dst_parent_dir: str,
         ocr_kwargs: Optional[dict] = None,
@@ -752,6 +818,8 @@ def _redact_single_dicom_image(
         :param fill: Color setting to use for bounding boxes
         ("contrast" or "background").
         :param padding_width: Pixel width of padding (uniform).
+        :param redact_approach: What approach to use when redacting
+        ("default", "metadata", or a PatternRecognizer object).
         :param overwrite: Only set to True if you are providing the
         duplicated DICOM path in dcm_path.
         :param dst_parent_dir: String path to parent directory of where to store copies.
@@ -789,17 +857,9 @@ def _redact_single_dicom_image(
             loaded_image = Image.open(png_filepath)
             image = self._add_padding(loaded_image, is_greyscale, padding_width)
 
-        # Create custom recognizer using DICOM metadata
-        original_metadata, is_name, is_patient = self._get_text_metadata(instance)
-        phi_list = self._make_phi_list(original_metadata, is_name, is_patient)
-        deny_list_recognizer = PatternRecognizer(
-            supported_entity="PERSON", deny_list=phi_list
-        )
-        analyzer_results = self.image_analyzer_engine.analyze(
-            image,
-            ocr_kwargs=ocr_kwargs,
-            ad_hoc_recognizers=[deny_list_recognizer],
-            **text_analyzer_kwargs,
+        # Detect PII
+        analyzer_results = self._get_analyzer_results(
+            image, instance, redact_approach, ocr_kwargs, **text_analyzer_kwargs
         )
 
         # Redact all bounding boxes from DICOM file
@@ -822,6 +882,7 @@ def _redact_multiple_dicom_images(
         crop_ratio: float,
         fill: str,
         padding_width: int,
+        redact_approach: Union[str, PatternRecognizer],
         overwrite: bool,
         dst_parent_dir: str,
         ocr_kwargs: Optional[dict] = None,
@@ -835,6 +896,8 @@ def _redact_multiple_dicom_images(
         :param fill: Color setting to use for bounding boxes
         ("contrast" or "background").
         :param padding_width: Pixel width of padding (uniform).
+        :param redact_approach: What approach to use when redacting
+        ("default", "metadata", or a PatternRecognizer object).
         :param overwrite: Only set to True if you are providing
         the duplicated DICOM dir in dcm_dir.
         :param dst_parent_dir: String path to parent directory of where to store copies.
@@ -865,6 +928,7 @@ def _redact_multiple_dicom_images(
                 crop_ratio,
                 fill,
                 padding_width,
+                redact_approach,
                 overwrite,
                 dst_parent_dir,
                 ocr_kwargs=ocr_kwargs,