Added code to replace text after removing PHI

FNNDSC · Feb 7, 2024 · a93c3b5 · a93c3b5
1 parent 0aa6872
commit a93c3b5
Showing 1 changed file with 54 additions and 7 deletions.
diff --git a/image_textRemove.py b/image_textRemove.py
@@ -14,8 +14,9 @@
 import os
 import sys
 from difflib import SequenceMatcher
+import hashlib
 
-__version__ = '1.1.6'
+__version__ = '1.1.7'
 
 DISPLAY_TITLE = r"""
        _        _                             _            _  ______                              
@@ -38,6 +39,8 @@
                     help='output file type(only the extension)')
 parser.add_argument('-j', '--filterTextFromJSON', default='anonymizedTags.json', type=str,
                     help='A dictionary of dicom tags and their values')
+parser.add_argument('-r', '--replaceTextFromJSON', default='replaceTags.json', type=str,
+                    help='A dictionary of dicom tags and their replacement values')
 parser.add_argument('-t', '--threshold', default=0.8, type=float,
                     help='threshold of similarity ration between two words')
 parser.add_argument(  '--pftelDB',
@@ -93,13 +96,24 @@ def main(options: Namespace, inputdir: Path, outputdir: Path):
         data = json.load(f)
     except Exception as ex:
         print("Error: ",ex)
+    # To be removed later
+    replace_json_data_path = ''
+    l_replace_json_path = list(inputdir.glob('**/*.json'))
+    for json_path in l_replace_json_path:
+        if json_path.name == options.replaceTextFromJSON:
+            replace_json_data_path = json_path
+    try:
+        f = open(replace_json_data_path, 'r')
+        replace_data = json.load(f)
+    except Exception as ex:
+        print("Error: ", ex)
     box_list = []
     mapper = PathMapper.file_mapper(inputdir, outputdir, glob=f"**/*.{options.fileFilter}", fail_if_empty=False)
     for input_file, output_file in mapper:
         # The code block below is a small and easy example of how to use a ``PathMapper``.
         # It is recommended that you put your functionality in a helper function, so that
         # it is more legible and can be unit tested.
-        box_list, final_image = inpaint_text(str(input_file), data, box_list, options.threshold)
+        box_list, final_image = inpaint_text(str(input_file), data, replace_data, box_list, options.threshold)
         img_rgb = cv2.cvtColor(final_image, cv2.COLOR_BGR2RGB)
         output_file = str(output_file).replace(options.fileFilter, options.outputType)
         print(f"Saving output file as ----->{output_file}<-----\n\n")
@@ -112,18 +126,29 @@ def midpoint(x1, y1, x2, y2):
     return x_mid, y_mid
 
 
-def inpaint_text(img_path, data, box_list, similarity_threshold):
+def inpaint_text(img_path, data, replace_data, box_list, similarity_threshold):
     word_list = []
+    d_replace_text = {}
     for item in data.keys():
         if item == 'PatientName':
-            word_list.extend(data.get(item).split('^'))
+            anon_name = replace_data.get(item).split('^')
+            real_name = data.get(item).split('^')
+            for i in range(len(real_name)):
+                word_list.append(real_name[i])
+                d_replace_text[real_name[i]] = anon_name[i]
         elif item == 'PatientBirthDate':
             yyyy = data.get(item)[0:4]
             mm = data.get(item)[4:6]
             dd = data.get(item)[6:8]
+            yyyy1 = replace_data.get(item)[0:4]
+            mm1 = replace_data.get(item)[4:6]
+            dd1 = replace_data.get(item)[6:8]
             word_list.append(f'{mm}1{dd}1{yyyy}')
+            d_replace_text[f'{mm}1{dd}1{yyyy}'] = f'{mm1}/{dd1}/{yyyy1}'
         else:
             word_list.append(data.get(item))
+            d_replace_text[data.get(item)] = replace_data.get(item)
+    print(d_replace_text)
     # read image
     print(f"Reading input file from ---->{img_path}<----")
     img = cv2.imread(img_path)
@@ -134,9 +159,10 @@ def inpaint_text(img_path, data, box_list, similarity_threshold):
 
 
     mask = np.zeros(img.shape[:2], dtype="uint8")
+    l_coordinates = []
+    l_text = []
     for box in box_list:
         if (box[0].upper() in word_list) or close_to_similar(box[0].upper(), word_list, similarity_threshold):
-            # Remove PatientName only
             print(f"Removing {box[0].upper()} from image")
             x0, y0 = box[1][0]
             x1, y1 = box[1][1]
@@ -151,9 +177,30 @@ def inpaint_text(img_path, data, box_list, similarity_threshold):
             cv2.line(mask, (x_mid0, y_mid0), (x_mid1, y_mi1), 255,
                  thickness)
             img = cv2.inpaint(img, mask, 7, cv2.INPAINT_NS)
+            org = (round(x3),round(y3))
+            l_coordinates.append(org)
+            if box[0].upper() in d_replace_text:
+                l_text.append(d_replace_text[box[0].upper()])
+            else:
+                word = close_to_similar(box[0].upper(), word_list, similarity_threshold)
+                l_text.append(d_replace_text[word])
+
+    color = (235, 235, 235)
+    print(l_text)
+    for org,text in zip(l_coordinates,l_text):
+        #text = generate_hash_text(text)
+        img = cv2.putText(img, text, org, cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
 
     return box_list, img
 
+def generate_hash_text(input_text: str) -> str:
+    """
+    Given an input text, generate a md5 hash and
+    return a hashed text of the original text's length
+    """
+    result = hashlib.md5(input_text.encode())
+
+    return result.hexdigest()[0:len(input_text)].upper()
 
 def read_input_dicom(input_file_path):
     """
@@ -195,9 +242,9 @@ def similar(a: str, b: str):
 def close_to_similar(target: str, wordlist: str, similarity_threshold: float):
     for word in wordlist:
         if similar(target, word) >= similarity_threshold:
-            return True
+            return word
 
-    return False
+    return None
 
 
 if __name__ == '__main__':