diff --git a/copy.sh b/copy.sh
new file mode 100755
index 000000000..91d026d36
--- /dev/null
+++ b/copy.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Usage: copy.sh LIST_FILE DEST_DIR
+#
+# Runs find on every path listed in LIST_FILE (one per line) and copies
+# each result into DEST_DIR.
+
+# -r keeps backslashes in a path literal; the `|| [ -n "$name" ]` clause
+# still processes a last line that has no trailing newline.
+while read -r name || [ -n "$name" ]
+do
+  # Nothing to copy for a blank line.
+  [ -z "$name" ] && continue
+  find "$name" -exec cp {} "$2" \;
+done < "$1"
diff --git a/deleted_modules.txt b/deleted_modules.txt new file mode 100644 index 000000000..967967db4 --- /dev/null +++ b/deleted_modules.txt @@ -0,0 +1,103 @@ +src/main/resources/modules/acute_myeloid_leukemia.json +src/main/resources/modules/allergic_rhinitis.json +src/main/resources/modules/allergies.json +src/main/resources/modules/allergies/allergy_panel.json +src/main/resources/modules/allergies/drug_allergy_incidence.json +src/main/resources/modules/allergies/environmental_allergy_incidence.json +src/main/resources/modules/allergies/immunotherapy.json +src/main/resources/modules/allergies/outgrow_env_allergies.json +src/main/resources/modules/allergies/outgrow_food_allergies.json +src/main/resources/modules/allergies/severe_allergic_reaction.json +src/main/resources/modules/anemia___unknown_etiology.json +src/main/resources/modules/appendicitis.json +src/main/resources/modules/asthma.json +src/main/resources/modules/atopy.json +src/main/resources/modules/atrial_fibrillation.json +src/main/resources/modules/attention_deficit_disorder.json +src/main/resources/modules/breast_cancer.json +src/main/resources/modules/breast_cancer/hormone_diagnosis.json +src/main/resources/modules/breast_cancer/hormonetherapy_breast.json +src/main/resources/modules/bronchitis.json +src/main/resources/modules/colorectal_cancer.json +src/main/resources/modules/contraceptive_maintenance.json +src/main/resources/modules/contraceptives.json +src/main/resources/modules/contraceptives/clear_contraceptive.json +src/main/resources/modules/contraceptives/female_sterilization.json +src/main/resources/modules/contraceptives/implant_contraceptive.json
+src/main/resources/modules/contraceptives/injectable_contraceptive.json +src/main/resources/modules/contraceptives/intrauterine_device.json +src/main/resources/modules/contraceptives/male_sterilization.json +src/main/resources/modules/contraceptives/oral_contraceptive.json +src/main/resources/modules/contraceptives/patch_contraceptive.json +src/main/resources/modules/contraceptives/ring_contraceptive.json +src/main/resources/modules/copd.json +src/main/resources/modules/covid19.json +src/main/resources/modules/covid19/determine_risk.json +src/main/resources/modules/covid19/diagnose_bacterial_infection.json +src/main/resources/modules/covid19/diagnose_blood_clot.json +src/main/resources/modules/covid19/end_outcomes.json +src/main/resources/modules/covid19/end_symptoms.json +src/main/resources/modules/covid19/measurements_daily.json +src/main/resources/modules/covid19/measurements_frequent.json +src/main/resources/modules/covid19/measurements_vitals.json +src/main/resources/modules/covid19/medications.json +src/main/resources/modules/covid19/nonsurvivor_lab_values.json +src/main/resources/modules/covid19/supplies_hospitalization.json +src/main/resources/modules/covid19/supplies_icu.json +src/main/resources/modules/covid19/supplies_intubation.json +src/main/resources/modules/covid19/survivor_lab_values.json +src/main/resources/modules/covid19/symptoms.json +src/main/resources/modules/covid19/treat_blood_clot.json +src/main/resources/modules/cystic_fibrosis.json +src/main/resources/modules/dementia.json +src/main/resources/modules/dental_and_oral_examination.json +src/main/resources/modules/dentures.json +src/main/resources/modules/dermatitis.json +src/main/resources/modules/dermatitis/early_moderate_eczema_obs.json +src/main/resources/modules/dermatitis/early_severe_eczema_obs.json +src/main/resources/modules/dermatitis/mid_moderate_eczema_obs.json +src/main/resources/modules/dermatitis/mid_severe_eczema_obs.json 
+src/main/resources/modules/dermatitis/moderate_cd_obs.json +src/main/resources/modules/dermatitis/severe_cd_obs.json +src/main/resources/modules/ear_infections.json +src/main/resources/modules/epilepsy.json +src/main/resources/modules/female_reproduction.json +src/main/resources/modules/fibromyalgia.json +src/main/resources/modules/food_allergies.json +src/main/resources/modules/gout.json +src/main/resources/modules/hiv/hiv_cd4.json +src/main/resources/modules/hiv/stop_all_art_meds.json +src/main/resources/modules/hiv_care.json +src/main/resources/modules/hiv_diagnosis.json +src/main/resources/modules/homelessness.json +src/main/resources/modules/hypothyroidism.json +src/main/resources/modules/injuries.json +src/main/resources/modules/lung_cancer.json +src/main/resources/modules/lung_cancer/lung_cancer_probabilities.json +src/main/resources/modules/lupus.json +src/main/resources/modules/mTBI.json +src/main/resources/modules/med_rec.json +src/main/resources/modules/mend_program.json +src/main/resources/modules/opioid_addiction.json +src/main/resources/modules/osteoarthritis.json +src/main/resources/modules/osteoporosis.json +src/main/resources/modules/pregnancy.json +src/main/resources/modules/prescribing_opioids_for_chronic_pain_and_treatment_of_oud.json +src/main/resources/modules/rheumatoid_arthritis.json +src/main/resources/modules/sepsis.json +src/main/resources/modules/sexual_activity.json +src/main/resources/modules/sleep_apnea.json +src/main/resources/modules/stroke.json +src/main/resources/modules/total_joint_replacement.json +src/main/resources/modules/total_joint_replacement/functional_status_assessments.json +src/main/resources/modules/trigger_bone_marrow_transplant.json +src/main/resources/modules/urinary_tract_infections.json +src/main/resources/modules/veteran.json +src/main/resources/modules/veteran_hyperlipidemia.json +src/main/resources/modules/veteran_lung_cancer.json +src/main/resources/modules/veteran_mdd.json 
+src/main/resources/modules/veteran_prostate_cancer.json +src/main/resources/modules/veteran_ptsd.json +src/main/resources/modules/veteran_substance_abuse_conditions.json +src/main/resources/modules/veteran_substance_abuse_treatment.json +src/main/resources/modules/veterans/veteran_suicide_probabilities.json
diff --git a/find_good_records.py b/find_good_records.py
new file mode 100644
index 000000000..eec95ab55
--- /dev/null
+++ b/find_good_records.py
@@ -0,0 +1,92 @@
+"""Report diabetic eye-condition onset dates for every patient bundle.
+
+Usage: python3 find_good_records.py OUTPUT_FOLDER > details.csv
+
+Reads each FHIR bundle in OUTPUT_FOLDER/fhir and writes one CSV row per
+bundle to stdout: the file path, the onset date of each tracked condition
+("0000" when the bundle has none) and the number of entries in the bundle.
+"""
+import argparse
+import csv
+import glob
+import json
+import os
+import sys
+
+NPDR_CONDITION_CODE = '1551000119108'
+PDR_CONDITION_CODE = '1501000119109'
+DME_CONDITION_CODE = '97331000119101'
+DIABETES_CONDITION_CODE = "44054006"
+
+# Written in place of an onset date when a bundle lacks the condition.
+NO_ONSET = "0000"
+
+# CSV onset columns, in output order.
+ONSET_COLUMNS = ['diabetes_onset', 'npdr_onset', 'pdr_onset', 'dme_onset']
+
+# Condition code -> onset column it fills.
+ONSET_COLUMN_BY_CODE = {
+    DIABETES_CONDITION_CODE: 'diabetes_onset',
+    NPDR_CONDITION_CODE: 'npdr_onset',
+    PDR_CONDITION_CODE: 'pdr_onset',
+    DME_CONDITION_CODE: 'dme_onset',
+}
+
+# Files whose name contains one of these markers are skipped.
+SKIPPED_FILE_MARKERS = ('hospitalInformation', 'practitionerInformation')
+
+
+def find_onsets(bundle):
+    """Return a dict mapping each onset column to its date in *bundle*.
+
+    Only Condition resources are inspected, and only the first coding of
+    each one. Columns without a matching Condition keep NO_ONSET. When a
+    code occurs more than once, the last occurrence wins.
+    """
+    onsets = dict.fromkeys(ONSET_COLUMNS, NO_ONSET)
+
+    for entry in bundle.get('entry', []):
+        resource = entry.get('resource', {})
+        if resource.get('resourceType') != 'Condition':
+            continue
+
+        codings = resource.get('code', {}).get('coding') or [{}]
+        column = ONSET_COLUMN_BY_CODE.get(codings[0].get('code'))
+        # A Condition may omit onsetDateTime; leave the placeholder then
+        # rather than aborting the whole run with a KeyError.
+        onset = resource.get('onsetDateTime')
+        if column is not None and onset is not None:
+            onsets[column] = onset
+
+    return onsets
+
+
+def main():
+    """Parse the command line and print the CSV report."""
+    parser = argparse.ArgumentParser(description='Select records for Coherent Eyes dataset')
+    parser.add_argument('path', help='folder path to process')
+    args = parser.parse_args()
+
+    # csv.writer quotes paths that contain commas or quotes, so such a
+    # path cannot shift the columns of its row.
+    writer = csv.writer(sys.stdout, lineterminator='\n')
+    writer.writerow(['file'] + ONSET_COLUMNS + ['count'])
+
+    for path in glob.glob(f"{args.path}/fhir/*.json"):
+        # Test the file name only, so a marker appearing in the folder
+        # path does not cause every bundle to be skipped.
+        name = os.path.basename(path)
+        if any(marker in name for marker in SKIPPED_FILE_MARKERS):
+            continue
+
+        with open(path, encoding='utf-8') as f:
+            bundle = json.load(f)
+
+        onsets = find_onsets(bundle)
+        row = [path] + [onsets[column] for column in ONSET_COLUMNS]
+        row.append(len(bundle.get('entry', [])))
+        writer.writerow(row)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/run_coherent_eyes.sh b/run_coherent_eyes.sh index 982bd3b61..4546846fb 100755 --- a/run_coherent_eyes.sh +++ b/run_coherent_eyes.sh @@ -56,7 +56,7 @@ years_of_history=5 run_population 10000 python3 find_good_records.py
$outputfolder > ./population1000_details.csv -python3 select_nei_records.py ./population1000_details.csv > selected_files1000.txt +python3 select_good_records.py ./population1000_details.csv > selected_files1000.txt mkdir selected1000 ./copy.sh selected_files1000.txt selected1000/ cp output_population1000/fhir/*Information*.json selected1000 @@ -75,7 +75,7 @@ years_of_history=5 run_population 1000 python3 find_good_records.py $outputfolder > ./population100_details.csv -python3 select_nei_records.py ./population100_details.csv > selected_files100.txt +python3 select_good_records.py ./population100_details.csv > selected_files100.txt mkdir selected100 ./copy.sh selected_files100.txt selected100/ cp output_population100/fhir/*Information*.json selected100 @@ -91,7 +91,7 @@ years_of_history=0 run_population 1000 python3 find_good_records.py $outputfolder > ./population10_details.csv -python3 select_nei_records.py ./population10_details.csv > selected_files10.txt +python3 select_good_records.py ./population10_details.csv > selected_files10.txt mkdir selected10 ./copy.sh selected_files10.txt selected10/ cp output_population10/fhir/*Information*.json selected10
diff --git a/select_good_records.py b/select_good_records.py
new file mode 100644
index 000000000..4573b53e5
--- /dev/null
+++ b/select_good_records.py
@@ -0,0 +1,92 @@
+"""Pick a weighted random sample of patient records for Coherent Eyes.
+
+Usage: python3 select_good_records.py DETAILS_CSV > selected_files.txt
+
+DETAILS_CSV needs the columns file, diabetes_onset, npdr_onset, pdr_onset
+and count. The paths of the sampled records are printed one per line.
+"""
+import argparse
+from math import ceil
+import numpy as np
+import pandas as pd
+import random
+
+# Value the details CSV holds when a record has no onset for a condition.
+NO_ONSET = '0000'
+
+# Onset columns that are compared as text.
+ONSET_COLUMNS = ('diabetes_onset', 'npdr_onset', 'pdr_onset')
+
+# Latest onset accepted for each group. Onsets are compared as strings.
+NPDR_LATEST_ONSET = '2023-09-06'  # must have at least 1 year of history
+DIABETES_LATEST_ONSET = '2021-09-06'  # must have at least 3 years of history
+
+
+def load_details(path):
+    """Read the details CSV, keeping the onset columns as strings.
+
+    Without an explicit dtype, a column in which every value is "0000" is
+    parsed as the integer 0, and no comparison against NO_ONSET matches.
+    """
+    return pd.read_csv(path, dtype={column: str for column in ONSET_COLUMNS})
+
+
+def weighted_sample(group, n):
+    """Sample up to *n* rows of *group*, favouring rows with a low count.
+
+    The weight is 1 - count / max(count), so rows with the largest count
+    have weight zero and can only be drawn by the uniform fallback below.
+    DataFrame.sample raises ValueError when asked for more rows than have
+    a non-zero weight; the request is capped to avoid that. If no row has
+    a positive weight, the rows are sampled uniformly instead.
+    """
+    if group.empty:
+        return group
+
+    weights = 1.0 - (group['count'] / group['count'].max())
+    eligible = int((weights > 0).sum())
+    if eligible == 0:
+        return group.sample(n=min(n, len(group)))
+    return group.sample(n=min(n, eligible), weights=weights)
+
+
+def select_files(df):
+    """Return the file paths sampled from the three groups in *df*.
+
+    Groups and sample sizes, as fractions of all rows in *df*:
+      * NPDR without PDR, onset no later than NPDR_LATEST_ONSET: 1/20
+      * PDR: 1/40
+      * diabetes without NPDR, onset no later than
+        DIABETES_LATEST_ONSET: 1/40
+    """
+    has_diabetes = df['diabetes_onset'] != NO_ONSET
+    has_npdr = df['npdr_onset'] != NO_ONSET
+    has_pdr = df['pdr_onset'] != NO_ONSET
+
+    npdr = df[has_npdr & ~has_pdr & (df['npdr_onset'] <= NPDR_LATEST_ONSET)]
+    pdr = df[has_pdr]
+    diabetes = df[
+        has_diabetes
+        & ~has_npdr
+        & (df['diabetes_onset'] <= DIABETES_LATEST_ONSET)
+    ]
+
+    selected = []
+    for group, divisor in ((npdr, 20), (pdr, 40), (diabetes, 40)):
+        sample = weighted_sample(group, int(ceil(len(df) / divisor)))
+        selected.extend(sample['file'])
+    return selected
+
+
+def main():
+    """Parse the command line and print the selected file paths."""
+    parser = argparse.ArgumentParser(description='Select record for Coherent Eyes dataset')
+    parser.add_argument('file', help='file to process')
+    args = parser.parse_args()
+
+    for path in select_files(load_details(args.file)):
+        print(path)
+
+
+if __name__ == '__main__':
+    main()