From 77e3aad9e2b8bbf8b60de3f4c720b89e8aed70a2 Mon Sep 17 00:00:00 2001
From: Dylan Hall
Date: Fri, 13 Sep 2024 13:38:55 -0400
Subject: [PATCH] complete run through of run_coherent_eyes

---
 run_coherent_eyes.sh                         | 30 ++++----
 .../python/coherent-data/associate_images.py | 72 ++++++++++---------
 .../python/coherent-data/requirements.txt    |  2 +-
 3 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/run_coherent_eyes.sh b/run_coherent_eyes.sh
index 144c7fdf9..9510fb84d 100755
--- a/run_coherent_eyes.sh
+++ b/run_coherent_eyes.sh
@@ -3,6 +3,13 @@ set -e
 
 basedir=`pwd`
 
+rm -rf selected1000/ selected100/ selected10/
+
+python3 -m venv ./venv/
+source ./venv/bin/activate
+python3 -m pip install -r src/main/python/coherent-data/requirements.txt
+
+
 base_run_synthea () {
   ./run_synthea -a 55-70 \
     -fm src/test/resources/flexporter/eyes_on_fhir.yaml \
@@ -28,13 +35,6 @@ run_population () {
   base_run_synthea -p $((popcount / 4)) -k keep_pdr.json
 }
 
-
-python3 -m venv ./venv/
-source ./venv/bin/activate
-python3 -m pip install -r src/main/python/coherent-data/requirements.txt
-
-rm -rf selected1000/ selected100/ selected10/
-
 # all populations have:
 # 25% diabetes but no DR
 # 50% NPDR, no PDR
@@ -100,15 +100,13 @@ mkdir selected10
 ./copy.sh selected_files10.txt selected10/
 cp output_population10/fhir/*Information*.json selected10
 
+## IMPORTANT: this last one was manually curated, I put the results in the subfolder keep/
 
-# cd src/main/python/coherent-data/
-# source ./venv/bin/activate
-
-# ./venv/bin/python associate_images.py ${basedir}/images/fundus_index.csv ${basedir}/images/oct_index.csv ${basedir}/output/fhir --clean --output ${basedir}/coherent_eyes
-
-# # ./venv/bin/python associate_images.py ${basedir}/images/fundus_index.csv ${basedir}/images/oct_index.csv ${basedir}/samples --clean --output ${basedir}/coherent_eyes
+cd src/main/python/coherent-data/
 
-# rm ${basedir}/dicom_errors.txt
+./venv/bin/python associate_images.py ${basedir}/images/Model1_250step/index.csv ${basedir}/images/oct_index.csv ${basedir}/selected10/keep --clean --add_dup_images --output ${basedir}/coherent_eyes10
+./venv/bin/python associate_images.py ${basedir}/images/Model1_250step/index.csv ${basedir}/images/oct_index.csv ${basedir}/selected100/ --clean --image_limit 2 --output ${basedir}/coherent_eyes100
+./venv/bin/python associate_images.py ${basedir}/images/Model1_250step/index.csv ${basedir}/images/oct_index.csv ${basedir}/selected1000/ --clean --image_limit 1 --reuse_images --output ${basedir}/coherent_eyes1000
 
-# validate_iods --verbose /Users/dehall/synthea/nei/coherent_eyes/dicom/Annabel185_Lettie611_Fisher429_af88404e-aad1-c9cb-3e7f-07daf0e44eac_fundus_1.2.840.99999999.10633938.1562002233954_1.2.840.99999999.1.1.99330560.1562002233954.dcm > ${basedir}/dicom_errors.txt
-# validate_iods --verbose /Users/dehall/synthea/nei/coherent_eyes/dicom/Annabel185_Lettie611_Fisher429_af88404e-aad1-c9cb-3e7f-07daf0e44eac_OCT_1.2.840.99999999.11240513.1609790227954_1.2.840.99999999.1.1.66970829.1609790227954.dcm >> ${basedir}/dicom_errors.txt
\ No newline at end of file
+# Note tool to validate dicoms:
+# validate_iods --verbose coherent_eyes/dicom/Annabel185_Lettie611_Fisher429_af88404e-aad1-c9cb-3e7f-07daf0e44eac_fundus_1.2.840.99999999.10633938.1562002233954_1.2.840.99999999.1.1.99330560.1562002233954.dcm > ${basedir}/dicom_errors.txt
diff --git a/src/main/python/coherent-data/associate_images.py b/src/main/python/coherent-data/associate_images.py
index 51be59c8a..284465703 100644
--- a/src/main/python/coherent-data/associate_images.py
+++ b/src/main/python/coherent-data/associate_images.py
@@ -57,6 +57,25 @@ def parse_args():
         default="./output",
         help="Output directory",
     )
+    parser.add_argument(
+        "--reuse_images",
+        dest="reuse_images",
+        action="store_true",
+        help="Reuse images between patients",
+    )
+    parser.add_argument(
+        "--add_dup_images",
+        dest="add_dup_images",
+        action="store_true",
+        help="Add DICOM and FHIR Media for duplicate images (e.g., when there's more than one ImagingStudy with no change in disease state)",
+    )
+    parser.add_argument(
+        "--image_limit",
+        dest="image_limit",
+        type=float,
+        default=float('inf'),
+        help="Maximum number of images to associate, default: no limit",
+    )
 
     args = parser.parse_args()
     return args
@@ -81,7 +100,12 @@ def main():
     for file in fhir_jsons:
         if 'hospitalInformation' in file or 'practitionerInformation' in file:
             continue
-        process_file(file, fundus_index, oct_index, args.output)
+        process_file(file, fundus_index, oct_index, args.output, args)
+
+        if args.reuse_images:
+            fundus_index['selected'] = False
+            oct_index['selected'] = False
+
 
 def clean(output):
     outputpath = Path(output)
@@ -94,7 +118,7 @@ def clean(output):
     (outputpath / '.keep').touch()
 
 
-def process_file(file, fundus_index, oct_index, output):
+def process_file(file, fundus_index, oct_index, output, args):
     print(f"Processing {file}")
     with open(file) as f:
         bundle = json.load(f)
@@ -104,6 +128,7 @@ def process_file(file, fundus_index, oct_index, output):
     diag_reports = []
     diagnoses = { 'npdr': None, 'pdr': None, 'dme': None }
     observations = []
+    added_img_count = 0
 
     for entry in bundle['entry']:
         resource = entry['resource']
@@ -136,14 +161,14 @@
     if not imaging_studies:
         return
 
-    # import pdb; pdb.set_trace()
-
-    # print(f"Found {len(imaging_studies)} imaging studies")
 
     previous_context = None
     previous_image = { 'OCT': [None, None], 'fundus': [None, None] }
 
     for i in range(len(imaging_studies)):
+        if added_img_count >= args.image_limit:
+            break
+
         imaging_study = imaging_studies[i]
         diag_report = diag_reports[i] # these should always be 1:1
 
@@ -164,6 +189,9 @@
             previous_image[img_type][index] = None
             continue
 
+        if not args.add_dup_images and image == previous_image[img_type][index]:
+            continue
+
         dicom = create_dicom(image, imaging_study, context)
         dicom_uid = imaging_study['identifier'][0]['value'][8:] # cut off urn:uuid:
         instance_uid = context['instance']['uid']
@@ -176,6 +204,7 @@
             media = create_fhir_media(context, imaging_study, image, dicom)
             bundle['entry'].append(wrap_in_entry(media))
             previous_image[img_type][index] = image
+            added_img_count = added_img_count + 1
 
         previous_context = context
 
@@ -249,6 +278,7 @@ def pick_image(fundus_index, oct_index, context):
     index.at[selected.index[0], 'selected'] = True
 
     path = selected['File Path'].iat[0]
+    print(f"Loading image from {path}")
     image = Image.open(path)
 
     return image
@@ -259,43 +289,17 @@ def filter_oct_index(oct_index, context):
     # CNV = Choroidal neovascularization
     # DME = diabetic macular edema
 
-    if context['dme']:
+    if context['dme'] or context['pdr']:
         oct_index = oct_index[oct_index['Class'] == 'DME']
-    elif context['pdr']:
-        oct_index = oct_index[oct_index['Class'] == 'CNV']
     else:
         oct_index = oct_index[oct_index['Class'] == 'Normal']
 
     return oct_index
 
 
-# def filter_fundus_index(fundus_index, context):
-#     # fundus_index items are 0/1
-#     # DR = diabetic retinopathy
-#     # MH = macular hole
-#     # DN = ??
-#     # BRVO = Branch Retinal Vein Occlusion
-#     # ODC = Optic Disc Coloboma?
-#     # ODE = Optic disc edema?
-#     # (there were more but i deleted all columns with all 0s)
-
-#     if context['npdr']:
-#         fundus_index = fundus_index[fundus_index['DR'] == '1']
-#     else:
-#         fundus_index = fundus_index[fundus_index['DR'] == '0']
-
-#     return fundus_index
-
 def filter_fundus_index(fundus_index, context):
-    # Retinopathy grade = lines up to our stages
-    # Risk of macular edema = unclear. seems like 0 = no DME, 1/2 = DME
-
-    fundus_index = fundus_index[fundus_index['Retinopathy grade'] == context['dr_stage']]
-
-    if context['dme']:
-        fundus_index = fundus_index[fundus_index['Risk of macular edema'] != '0']
-    else:
-        fundus_index = fundus_index[fundus_index['Risk of macular edema'] == '0']
+    # dr_stage = lines up to our stages
+    fundus_index = fundus_index[fundus_index['dr_stage'] == context['dr_stage']]
 
     return fundus_index
 
diff --git a/src/main/python/coherent-data/requirements.txt b/src/main/python/coherent-data/requirements.txt
index 812f5e577..5f8c55155 100644
--- a/src/main/python/coherent-data/requirements.txt
+++ b/src/main/python/coherent-data/requirements.txt
@@ -1,4 +1,4 @@
 pandas==1.5.3
 numpy==1.26.4
 pillow==10.2.0
-pydicom==2.4.4
\ No newline at end of file
+pydicom==3.0.0
\ No newline at end of file