Skip to content

Commit

Permalink
add all python scripts just for reproducibility
Browse files Browse the repository at this point in the history
  • Loading branch information
dehall committed Sep 12, 2024
1 parent 222e325 commit fb7d08c
Show file tree
Hide file tree
Showing 5 changed files with 205 additions and 3 deletions.
9 changes: 9 additions & 0 deletions copy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Copy every path listed in a manifest file into a destination directory.
#
# Usage: ./copy.sh LIST_FILE DEST_DIR
#   LIST_FILE  text file with one path per line
#   DEST_DIR   directory the files are copied into

if [ "$#" -ne 2 ]; then
    echo "usage: $0 LIST_FILE DEST_DIR" >&2
    exit 1
fi

# IFS= and -r keep each line verbatim (no whitespace trimming, no backslash
# processing); the `|| [ -n "$name" ]` clause still handles a final line
# that is not terminated by a newline.
while IFS= read -r name || [ -n "$name" ]
do
    # A blank line would make `find` run without a start path, which
    # defaults to the current directory and would copy everything in it.
    [ -z "$name" ] && continue

    # Quoting keeps paths containing spaces or glob characters intact.
    find "$name" -exec cp {} "$2" \;

done < "$1"
103 changes: 103 additions & 0 deletions deleted_modules.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
src/main/resources/modules/acute_myeloid_leukemia.json
src/main/resources/modules/allergic_rhinitis.json
src/main/resources/modules/allergies.json
src/main/resources/modules/allergies/allergy_panel.json
src/main/resources/modules/allergies/drug_allergy_incidence.json
src/main/resources/modules/allergies/environmental_allergy_incidence.json
src/main/resources/modules/allergies/immunotherapy.json
src/main/resources/modules/allergies/outgrow_env_allergies.json
src/main/resources/modules/allergies/outgrow_food_allergies.json
src/main/resources/modules/allergies/severe_allergic_reaction.json
src/main/resources/modules/anemia___unknown_etiology.json
src/main/resources/modules/appendicitis.json
src/main/resources/modules/asthma.json
src/main/resources/modules/atopy.json
src/main/resources/modules/atrial_fibrillation.json
src/main/resources/modules/attention_deficit_disorder.json
src/main/resources/modules/breast_cancer.json
src/main/resources/modules/breast_cancer/hormone_diagnosis.json
src/main/resources/modules/breast_cancer/hormonetherapy_breast.json
src/main/resources/modules/bronchitis.json
src/main/resources/modules/colorectal_cancer.json
src/main/resources/modules/contraceptive_maintenance.json
src/main/resources/modules/contraceptives.json
src/main/resources/modules/contraceptives/clear_contraceptive.json
src/main/resources/modules/contraceptives/female_sterilization.json
src/main/resources/modules/contraceptives/implant_contraceptive.json
src/main/resources/modules/contraceptives/injectable_contraceptive.json
src/main/resources/modules/contraceptives/intrauterine_device.json
src/main/resources/modules/contraceptives/male_sterilization.json
src/main/resources/modules/contraceptives/oral_contraceptive.json
src/main/resources/modules/contraceptives/patch_contraceptive.json
src/main/resources/modules/contraceptives/ring_contraceptive.json
src/main/resources/modules/copd.json
src/main/resources/modules/covid19.json
src/main/resources/modules/covid19/determine_risk.json
src/main/resources/modules/covid19/diagnose_bacterial_infection.json
src/main/resources/modules/covid19/diagnose_blood_clot.json
src/main/resources/modules/covid19/end_outcomes.json
src/main/resources/modules/covid19/end_symptoms.json
src/main/resources/modules/covid19/measurements_daily.json
src/main/resources/modules/covid19/measurements_frequent.json
src/main/resources/modules/covid19/measurements_vitals.json
src/main/resources/modules/covid19/medications.json
src/main/resources/modules/covid19/nonsurvivor_lab_values.json
src/main/resources/modules/covid19/supplies_hospitalization.json
src/main/resources/modules/covid19/supplies_icu.json
src/main/resources/modules/covid19/supplies_intubation.json
src/main/resources/modules/covid19/survivor_lab_values.json
src/main/resources/modules/covid19/symptoms.json
src/main/resources/modules/covid19/treat_blood_clot.json
src/main/resources/modules/cystic_fibrosis.json
src/main/resources/modules/dementia.json
src/main/resources/modules/dental_and_oral_examination.json
src/main/resources/modules/dentures.json
src/main/resources/modules/dermatitis.json
src/main/resources/modules/dermatitis/early_moderate_eczema_obs.json
src/main/resources/modules/dermatitis/early_severe_eczema_obs.json
src/main/resources/modules/dermatitis/mid_moderate_eczema_obs.json
src/main/resources/modules/dermatitis/mid_severe_eczema_obs.json
src/main/resources/modules/dermatitis/moderate_cd_obs.json
src/main/resources/modules/dermatitis/severe_cd_obs.json
src/main/resources/modules/ear_infections.json
src/main/resources/modules/epilepsy.json
src/main/resources/modules/female_reproduction.json
src/main/resources/modules/fibromyalgia.json
src/main/resources/modules/food_allergies.json
src/main/resources/modules/gout.json
src/main/resources/modules/hiv/hiv_cd4.json
src/main/resources/modules/hiv/stop_all_art_meds.json
src/main/resources/modules/hiv_care.json
src/main/resources/modules/hiv_diagnosis.json
src/main/resources/modules/homelessness.json
src/main/resources/modules/hypothyroidism.json
src/main/resources/modules/injuries.json
src/main/resources/modules/lung_cancer.json
src/main/resources/modules/lung_cancer/lung_cancer_probabilities.json
src/main/resources/modules/lupus.json
src/main/resources/modules/mTBI.json
src/main/resources/modules/med_rec.json
src/main/resources/modules/mend_program.json
src/main/resources/modules/opioid_addiction.json
src/main/resources/modules/osteoarthritis.json
src/main/resources/modules/osteoporosis.json
src/main/resources/modules/pregnancy.json
src/main/resources/modules/prescribing_opioids_for_chronic_pain_and_treatment_of_oud.json
src/main/resources/modules/rheumatoid_arthritis.json
src/main/resources/modules/sepsis.json
src/main/resources/modules/sexual_activity.json
src/main/resources/modules/sleep_apnea.json
src/main/resources/modules/stroke.json
src/main/resources/modules/total_joint_replacement.json
src/main/resources/modules/total_joint_replacement/functional_status_assessments.json
src/main/resources/modules/trigger_bone_marrow_transplant.json
src/main/resources/modules/urinary_tract_infections.json
src/main/resources/modules/veteran.json
src/main/resources/modules/veteran_hyperlipidemia.json
src/main/resources/modules/veteran_lung_cancer.json
src/main/resources/modules/veteran_mdd.json
src/main/resources/modules/veteran_prostate_cancer.json
src/main/resources/modules/veteran_ptsd.json
src/main/resources/modules/veteran_substance_abuse_conditions.json
src/main/resources/modules/veteran_substance_abuse_treatment.json
src/main/resources/modules/veterans/veteran_suicide_probabilities.json
48 changes: 48 additions & 0 deletions find_good_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import argparse
import glob
import json

NPDR_CONDITION_CODE = '1551000119108'
PDR_CONDITION_CODE = '1501000119109'
DME_CONDITION_CODE = '97331000119101'
DIABETES_CONDITION_CODE = "44054006"

# Placeholder emitted in the CSV when a record has no onset for a condition.
NO_ONSET = "0000"

# Onset columns, in the order they appear in the CSV header.
ONSET_COLUMNS = ('diabetes_onset', 'npdr_onset', 'pdr_onset', 'dme_onset')

# Maps each tracked condition code to the CSV column holding its onset.
ONSET_COLUMN_BY_CODE = {
    DIABETES_CONDITION_CODE: 'diabetes_onset',
    NPDR_CONDITION_CODE: 'npdr_onset',
    PDR_CONDITION_CODE: 'pdr_onset',
    DME_CONDITION_CODE: 'dme_onset',
}

# Files whose path contains one of these markers are not processed.
SKIPPED_FILE_MARKERS = ('hospitalInformation', 'practitionerInformation')


def extract_onsets(bundle):
    """Return the onset of each tracked condition found in a FHIR bundle.

    Args:
        bundle: parsed JSON bundle (a dict with an optional 'entry' list).

    Returns:
        A dict keyed by the names in ONSET_COLUMNS. A condition that is not
        present, or that carries no 'onsetDateTime', keeps the NO_ONSET
        placeholder. When the same code occurs more than once, the last
        occurrence in the bundle wins.
    """
    onsets = {column: NO_ONSET for column in ONSET_COLUMNS}

    for entry in bundle.get('entry', []):
        resource = entry.get('resource', {})
        if resource.get('resourceType') != 'Condition':
            continue

        # Only the first coding is inspected; Conditions without any coding
        # or without an onset are ignored instead of raising.
        codings = resource.get('code', {}).get('coding') or []
        if not codings:
            continue

        column = ONSET_COLUMN_BY_CODE.get(codings[0].get('code'))
        onset = resource.get('onsetDateTime')
        if column is not None and onset is not None:
            onsets[column] = onset

    return onsets


def main(argv=None):
    """Print one CSV row per bundle found under <path>/fhir/.

    Args:
        argv: command-line arguments; defaults to sys.argv[1:] when None.
    """
    parser = argparse.ArgumentParser(description='Select records for Coherent Eyes dataset')
    parser.add_argument('path', help='folder path to process')
    args = parser.parse_args(argv)

    fhir_jsons = glob.glob(f"{args.path}/fhir/*.json")

    print("file,diabetes_onset,npdr_onset,pdr_onset,dme_onset,count")

    for file in fhir_jsons:
        if any(marker in file for marker in SKIPPED_FILE_MARKERS):
            continue

        with open(file, encoding='utf-8') as f:
            bundle = json.load(f)

        onsets = extract_onsets(bundle)
        # The trailing column is the number of entries in the bundle.
        fields = [file]
        fields.extend(str(onsets[column]) for column in ONSET_COLUMNS)
        fields.append(str(len(bundle.get('entry', []))))
        print(",".join(fields))


if __name__ == '__main__':
    main()
6 changes: 3 additions & 3 deletions run_coherent_eyes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ years_of_history=5
run_population 10000

python3 find_good_records.py $outputfolder > ./population1000_details.csv
python3 select_nei_records.py ./population1000_details.csv > selected_files1000.txt
python3 select_good_records.py ./population1000_details.csv > selected_files1000.txt
mkdir selected1000
./copy.sh selected_files1000.txt selected1000/
cp output_population1000/fhir/*Information*.json selected1000
Expand All @@ -75,7 +75,7 @@ years_of_history=5
run_population 1000

python3 find_good_records.py $outputfolder > ./population100_details.csv
python3 select_nei_records.py ./population100_details.csv > selected_files100.txt
python3 select_good_records.py ./population100_details.csv > selected_files100.txt
mkdir selected100
./copy.sh selected_files100.txt selected100/
cp output_population100/fhir/*Information*.json selected100
Expand All @@ -91,7 +91,7 @@ years_of_history=0
run_population 1000

python3 find_good_records.py $outputfolder > ./population10_details.csv
python3 select_nei_records.py ./population10_details.csv > selected_files10.txt
python3 select_good_records.py ./population10_details.csv > selected_files10.txt
mkdir selected10
./copy.sh selected_files10.txt selected10/
cp output_population10/fhir/*Information*.json selected10
Expand Down
42 changes: 42 additions & 0 deletions select_good_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import argparse
from math import ceil
import numpy as np
import pandas as pd
import random


# Placeholder used in the details CSV when a record has no onset for a
# condition.
NO_ONSET = '0000'

# Onset columns of the details CSV. They are always read as strings so that
# a column consisting solely of "0000" is not inferred as the integer 0.
ONSET_COLUMNS = ('diabetes_onset', 'npdr_onset', 'pdr_onset', 'dme_onset')

NPDR_LATEST_ONSET = '2023-09-06'  # must have at least 1 years history
DIABETES_LATEST_ONSET = '2021-09-06'  # must have at least 3 years history

# Each group contributes ceil(total_records / divisor) records.
PDR_DIVISOR = 40
NPDR_DIVISOR = 20
DIABETES_DIVISOR = 40


def load_details(source):
    """Read the details CSV, keeping the onset columns as strings.

    Args:
        source: path or file-like object accepted by pandas.read_csv.

    Returns:
        A DataFrame with one row per record.
    """
    return pd.read_csv(source, dtype={column: str for column in ONSET_COLUMNS})


def sample_preferring_small(candidates, n, random_state=None):
    """Sample up to n rows, favouring rows with a small 'count'.

    Each row is weighted by 1 - count / max(count), so the largest record
    has weight zero. The request is capped at the number of rows that can
    actually be drawn, and when every weight is zero (for example a single
    candidate) the rows are drawn uniformly instead of raising.

    Args:
        candidates: DataFrame with a numeric 'count' column.
        n: number of rows requested.
        random_state: optional seed or numpy RandomState passed to sample().

    Returns:
        A DataFrame with at most n rows of candidates.
    """
    if n <= 0 or candidates.empty:
        return candidates.iloc[0:0]

    weights = (1.0 - candidates['count'] / candidates['count'].max()).fillna(0.0)
    drawable = int((weights > 0).sum())

    if drawable == 0:
        return candidates.sample(n=min(n, len(candidates)), random_state=random_state)

    # Weighted sampling without replacement cannot return more rows than
    # there are non-zero weights.
    return candidates.sample(n=min(n, drawable), weights=weights, random_state=random_state)


def select_records(df, random_state=None):
    """Pick NPDR, PDR and diabetes-only records from the details table.

    Args:
        df: DataFrame with 'file', 'count' and the onset columns as strings.
        random_state: optional seed or numpy RandomState for the sampling.

    Returns:
        List of selected file names: NPDR records first, then PDR, then
        diabetes without NPDR.
    """
    total = len(df)

    pdr = df[df["pdr_onset"] != NO_ONSET].sort_values('pdr_onset', ascending=False)

    npdr = df[(df["npdr_onset"] != NO_ONSET) & (df["pdr_onset"] == NO_ONSET)]
    npdr = npdr.sort_values('npdr_onset', ascending=False)
    npdr = npdr[npdr['npdr_onset'] <= NPDR_LATEST_ONSET]

    diabetes = df[(df["diabetes_onset"] != NO_ONSET) & (df["npdr_onset"] == NO_ONSET)]
    diabetes = diabetes.sort_values('diabetes_onset', ascending=False)
    diabetes = diabetes[diabetes['diabetes_onset'] <= DIABETES_LATEST_ONSET]

    groups = (
        (npdr, NPDR_DIVISOR),
        (pdr, PDR_DIVISOR),
        (diabetes, DIABETES_DIVISOR),
    )

    selected_files = []
    for candidates, divisor in groups:
        wanted = int(ceil(total / divisor))
        chosen = sample_preferring_small(candidates, wanted, random_state)
        selected_files.extend(chosen['file'].tolist())
    return selected_files


def main(argv=None):
    """Print the selected file names, one per line.

    Args:
        argv: command-line arguments; defaults to sys.argv[1:] when None.
    """
    parser = argparse.ArgumentParser(description='Select record for Coherent Eyes dataset')
    parser.add_argument('file', help='file to process')
    parser.add_argument('--seed', type=int, default=None,
                        help='optional random seed for a repeatable selection')
    args = parser.parse_args(argv)

    # One shared generator so the three draws are not identical streams.
    random_state = None if args.seed is None else np.random.RandomState(args.seed)

    for name in select_records(load_details(args.file), random_state):
        print(name)


if __name__ == '__main__':
    main()

0 comments on commit fb7d08c

Please sign in to comment.