From 59b3759d1e9242325d58c4caf3e0cf95e73894aa Mon Sep 17 00:00:00 2001 From: Chen Date: Wed, 30 Aug 2023 09:48:13 -0400 Subject: [PATCH] add config for loading cmb data files to neo4j --- .gitignore | 3 + cmb-data/cmb-local.yml | 49 + cmb-data/ctdc_model_file.yaml | 355 +++++++ cmb-data/ctdc_model_properties_file.yaml | 1157 ++++++++++++++++++++++ cmb-data/props-ctdc-cmb.yml | 58 ++ sample-cli.txt | 8 + 6 files changed, 1630 insertions(+) create mode 100644 cmb-data/cmb-local.yml create mode 100644 cmb-data/ctdc_model_file.yaml create mode 100644 cmb-data/ctdc_model_properties_file.yaml create mode 100644 cmb-data/props-ctdc-cmb.yml create mode 100644 sample-cli.txt diff --git a/.gitignore b/.gitignore index 04124a63..1a45668d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ config/data-loader.yml config/file-loader.yml config/es_loader_* backup/ +.DS_Store +my_env/ +cmb-data/2023-8-23 \ No newline at end of file diff --git a/cmb-data/cmb-local.yml b/cmb-data/cmb-local.yml new file mode 100644 index 00000000..d4f9392c --- /dev/null +++ b/cmb-data/cmb-local.yml @@ -0,0 +1,49 @@ +Config: + temp_folder: tmp + backup_folder: /tmp/data-loader-backups + + neo4j: + # # Location of Neo4j server, e.g., bolt://127.0.0.1:7687 + # uri: bolt://localhost:7687 + # # Neo4j username + # user: neo4j + # # Neo4j password + # password: "123456" + #Location of Neo4j server, e.g., bolt://127.0.0.1:7687 + uri: bolt://ncidb-d537-c:7687 + # Neo4j username + user: neo4j + # Neo4j password + password: "ctdc_cmbDBneo4j0" + + # Schema files' locations + schema: + - /Users/cheny39/Documents/work/ctdc/crdc-ctdc-dataloader/cmb-data/ctdc_model_file.yaml + - /Users/cheny39/Documents/work/ctdc/crdc-ctdc-dataloader/cmb-data/ctdc_model_properties_file.yaml + + #Property file location + prop_file: /Users/cheny39/Documents/work/ctdc/crdc-ctdc-dataloader/cmb-data/props-ctdc-cmb.yml + + # Skip validations, aka. Cheat Mode + cheat_mode: false + # Validations only, skip loading + dry_run: false + # Wipe out database before loading, you'll lose all data! + wipe_db: true + # Skip backup step + no_backup: true + split_transactions: false + # Automatically confirm deletion and database wiping (without asking user to confirm) + no_confirmation: true + # Max violations to display, default is 10 + max_violations: 10 + no_parents: true + + # S3 bucket name, if you are loading from an S3 bucket + s3_bucket: + # S3 folder for dataset + s3_folder: + # Loading mode, can be upsert, new or delete, default is upsert + loading_mode: upsert + # Location of dataset + dataset: diff --git a/cmb-data/ctdc_model_file.yaml b/cmb-data/ctdc_model_file.yaml new file mode 100644 index 00000000..41ce5b66 --- /dev/null +++ b/cmb-data/ctdc_model_file.yaml @@ -0,0 +1,355 @@ +# Cancer Moonshot Biobank data model nodes, properties and relationships file +# Title case names are "reserved" (meaningful to the parser) +# Lower case names are labels for the entities + +Nodes: + node: + Desc: text + Tags: + Category: value + Assignment: value + Class: value + Template: 'Yes' + Props: + - property_1 + - property_2 + - property_3 + - property_4 + - property_5 + - property_6 + program: + Desc: text + Tags: + Category: administrative + Assignment: core + Class: primary + Template: 'Yes' + Props: + - program_name #11444542 + - program_short_name #11459801 + project: + Desc: text + Tags: + Category: administrative + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - project_name #11459804 + - project_short_name #11459806 + principal_investigator: + Desc: text + Tags: + Category: study + Assignment: core + Class: primary + Template: 'Yes' + Props: + - principal_investigator_first_name #10624731 + - principal_investigator_last_name #10624733 + - principal_investigator_middle_name #10624732 + - principal_investigator_orcid_id #10624734 + study: + Desc: text + Tags: + Category: study + Assignment: core + Class: primary + Template: 'Yes' + Props: + - study_name #11459810 + - study_short_name #11459812 + - study_id + - study_description + - study_type #11160683 + - dates_of_conduct + associated_link: + Desc: text + Tags: + Category: study + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - associated_link_name + - associated_link_url + image_collection: + Desc: text + Tags: + Category: study + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - image_collection_name + - image_type_included + - image_collection_url + - repository_name + - collection_access + subject: + Desc: text + Tags: + Category: case # temporarily, will actually be "subject" + Assignment: core + Class: primary + Template: 'Yes' + Props: + - subject_id + - biomarker_results_available + - radiology_report_available #6944764 + - radiology_images_available + - histology_images_available + demographic: + Desc: the typical demographics + Tags: + Category: case # temporarily, will actually be "subject" + Assignment: core + Class: primary + Template: 'Yes' + Props: + - demographic_id + - sex + - reported_gender #10748236 + - race + - ethnicity + - ncbi_taxonomy_id #10543100 + - ncbi_taxonomy_name #10543082 + exposure: + Desc: environmental, workplace and lifestyle exposure(s) + Tags: + Category: case # temporarily, will actually be "subject" + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - exposure_id + - environmental_exposure_type #11256813 + - carcinogen_exposure + diagnosis: + Desc: diagnosis + Tags: + Category: clinical + Assignment: core + Class: primary + Template: 'Yes' + Props: + - diagnosis_id + - icd_10_disease_code #11479873 + - meddra_disease_code + - ctep_disease_code + - snomed_disease_code # this is disease plus stage I think + - diagnosis_date # quantified in days indexed to date of trial enrollment + - icd_o_primary_site #11341616 + - primary_disease_site + - tumor_grade #11325685 + - subject_age_at_diagnosis #10609539 + treatment: + Desc: text + Tags: + Category: clinical + Assignment: core + Class: primary + Template: 'Yes' + Props: + - treatment_id + - x_targeted_therapy #6400634 + - x_therapy #6400634 + - x_surgical_procedure_name #6411539 + - x_radiological_procedure_name #6411539 + targeted_therapy: + Desc: text + Tags: + Category: treatment + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - targeted_therapy_id + - targeted_therapy #6400634 + therapy: + Desc: text + Tags: + Category: treatment + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - therapy_id + - therapy #6400634 + surgical_procedure: + Desc: text + Tags: + Category: treatment + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - surgical_procedure_id + - surgical_procedure_name #6411539 + radiological_procedure: + Desc: text + Tags: + Category: treatment + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - radiological_procedure_id + - radiological_procedure_name #6411539 + subject_status: + Desc: text + Tags: + Category: clinical + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - subject_status_id + - survival_status #7050072 + - primary_cause_of_death #6421593 + - off_study + - off_study_reason #6355981 + specimen: + Desc: text + Tags: + Category: biospecimen + Assignment: core + Class: primary + Template: 'Yes' + Props: + - specimen_id + - parent_specimen_id + - days_from_diagnosis_to_specimen_collection #11253404 + - diagnosis_date # from the CMB perspective, this is supposedly equivalent to the above, but because of the way this date is indexed to enrollment date, it should not be a specimen property + - days_from_first_subject_visit_to_specimen_collection #11248874 + - days_from_first_treatment_to_specimen_collection #11250807 + - days_from_initial_genomic_sequencing_to_Specimen_collection #11250807 + - days_from_recurrence_to_specimen_collection #11251133 + - days_from_specimen_collection_to_initial_pathologic_diagnosis #11253404 + - collection_date # from the CMB perspective, this is equivalent to the above, with collection date indexed to enrollment date caDSR 6401821 + - icd_o_3_tissue_morphology # 11326261 + - obi_specimen_type #11253427 not a good match to the caDSR term reference by the CDE's ID + - specimen_category # confusingly close to the CMB Catalog Site's "Tissue Category" i.e. the indicator as to normal vs primary vs metastatic, but acceptable terms for caDSR 7069877, quoted as a reference for "specimen category" uses terms that do not relate to "tissue category" + - type_of_tissue #caDSR 7003892 this looks very much like the CMB Catalog Site's "Tissue Category" + - anatomical_collection_site + - parent_biospecimen_type + - biospecimen_type + - tissue_category # this would be the Catalog Site's "Tissue Category" i.e. the indicator as to normal vs primary vs metastatic, which appears within at least two of the DDs as Tissue Type + - assessment_timepoint # this appears to be the Catalog Site's "Collection Timepoint" caDSR 7065963 + data_file: + Desc: file + Tags: + Category: data_file + Assignment: core + Class: primary + Template: 'Yes' + Props: + - data_file_name #11284037 + - data_file_type + - data_file_description #11280338 + - data_file_format #11416926 + - data_file_size #11479876 + - data_file_checksum_value #11480133 + - data_file_checksum_type #11475057 + - data_file_compression_status #11387114 + - data_file_uuid + - data_file_location +Relationships: + belongs_to: + Mul: many_to_one + Ends: + - Src: subject + Dst: study + - Src: study + Dst: project + - Src: study # so can a lowest-level study belong directly to a highest-level program? + Dst: program + #- Src: project # this relationship can be removed because project is above study? + #Dst: study + - Src: project # this seems legitimate + Dst: program + Props: null + associated_with: # group all file relationships in here? + Mul: many_to_one + Ends: + - Src: data_file + Dst: specimen + - Src: data_file + Dst: diagnosis + - Src: data_file + Dst: subject + - Src: data_file + Dst: study + - Src: data_file + Dst: project + - Src: associated_link + Dst: study + - Src: associated_link + Dst: project + - Src: image_collection + Dst: study + - Src: image_collection + Dst: project + Props: null + #of_study: + #Mul: many_to_many + #Ends: + #- Src: principal_ivestigator # alternatively, express principal investigator relationships to both study and project elsewhere as "directs" relationships + #Dst: study + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? + #Dst: study + #Mul: many_to_one + #Props: null + #of_project: + #Mul: many_to_many + #Ends: + #- Src: principal_investigator # alternatively, express principal investigator relationships to both study and project elsewhere as "directs" relationships + #Dst: project + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? + #Dst: project + #Mul: many_to_one + #Props: null + of_subject: + Mul: many_to_one + Ends: + - Src: demographic + Dst: subject + Mul: one_to_one + - Src: exposure + Dst: subject + - Src: diagnosis + Dst: subject + - Src: treatment + Dst: subject + - Src: targeted_therapy + Dst: subject + - Src: therapy + Dst: subject + - Src: surgical_procedure + Dst: subject + - Src: radiological_procedure + Dst: subject + - Src: subject_status + Dst: subject + Mul: one_to_one + # to accommodate a Specimen being directly associated with a Subject, rather than being only indirectly associated with a Subject through a Visit, etc. + - Src: specimen + Dst: subject + # to accommodate a Data File being directly associated with a Subject, rather than being only indirectly associated with a Subject through a Specimen or Diagnosis + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? + #Dst: subject + Props: null + #of_specimen: + #Mul: many_to_one + #Ends: + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? + #Dst: specimen + #Props: null + directs: + Mul: many_to_many + Ends: + - Src: principal_investigator + Dst: project + - Src: principal_investigator + Dst: study + Props: null \ No newline at end of file diff --git a/cmb-data/ctdc_model_properties_file.yaml b/cmb-data/ctdc_model_properties_file.yaml new file mode 100644 index 00000000..7df4bda8 --- /dev/null +++ b/cmb-data/ctdc_model_properties_file.yaml @@ -0,0 +1,1157 @@ +# Cancer Moonshot Biobank data model properties of properties file +# Title case names are "reserved" (meaningful to the parser) +# Lower case names are labels for the entities + +PropDefinitions: + # name_of_node + property_1: + Desc: text + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Type: integer + Req: 'Yes' + property_2: + Desc: text + Src: value + Type: datetime + Req: 'No' + property_3: + Desc: text + Src: value + Enum: + - 'Yes' + - 'No' + Req: Preferred + # program + program_name: + Desc: text CDE ID = 11444542 + Term: + - Origin: caDSR + Code: '11444542' + Value: Program Name Text + Src: DSS + Type: string + Req: 'Yes' + program_short_name: + Desc: text CDE ID = 11459801
This property is used as the key via which child records, e.g. project and/or study/trial records, can be associated with the appropriate program during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: '11459801' + Value: Program Short Name Text + Src: DSS + Type: string + Req: 'Yes' + Key: true + # project + project_name: + Desc: CDE ID = 11459804 + Term: + - Origin: caDSR + Code: '11459804' + Value: Project Name Text + Src: DSS + Type: string + Req: 'Yes' + project_short_name: + Desc: CDE ID = 11459806
This property is used as the key via which child records, e.g. study/trial, image collection and principal investigator records, can be associated with the appropriate project during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: '11459806' + Value: Project Short Name Text + Src: DSS + Type: string + Req: 'Yes' + Key: true + # study + study_name: + Desc: CDE ID = 11459810 + Term: + - Origin: caDSR + Code: '11459810' + Value: Study Name Text + Src: DSS + Type: string + Req: 'Yes' + study_short_name: + Desc: CDE ID = 11459812
This property is used as the key via which child records, e.g. image collection and subject records, can be associated with the appropriate study/trial during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: '11459812' + Value: Study Short Name Text + Src: DSS + Type: string + Req: 'Yes' + Key: true + study_id: + Desc: text + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'No' + study_description: + Desc: text + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + study_type: + Desc: CDE ID = 11160683 + Term: + - Origin: caDSR + Code: '11160683' + Value: Study Primary Purpose Type + Src: DSS + Enum: + - Adverse Effect Mitigation Study + - Ancillary Study + - Basic Science Research + - Correlative Study + - Cure Study + - Device Feasibility Study + - Diagnosis Study + - Disease Modifying Treatment Study + - Early Detection Study + - Education Training Clinical Study + - Epidemiology Research + - Genomics Research + - Health Services Research + - Imaging Research + - Observational Study + - Outcomes Research + - Prevention Study + - Proteomic Research + - Rehabilitation Clinical Study + - Screening Study + - Supportive Care Study + - Transcriptomics Research + - Treatment Study + Req: 'Yes' + dates_of_conduct: + Desc: text + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + # principal_investigator + principal_investigator_first_name: + Desc: CDE ID = 10624731 + Term: + - Origin: caDSR + Code: '10624731' + Value: Principal Investigator First Name Text + Src: DSS + Type: string + Req: 'Yes' + principal_investigator_last_name: + Desc: CDE ID = 10624733 + Term: + - Origin: caDSR + Code: '10624733' + Value: Principal Investigator Last Name Text + Src: DSS + Type: string + Req: 'Yes' + principal_investigator_middle_name: + Desc: CDE ID = 10624732 + Term: + - Origin: caDSR + Code: '10624732' + Value: Principal Investigator Middle Name Text + Src: DSS + Type: string + Req: 'No' + principal_investigator_orcid_id: + Desc: CDE ID = 10624734 + Term: + - Origin: caDSR + Code: '10624734' + Value: Principal Investigator ORCID Text + Src: DSS + Type: string + Req: Preferred + # associated_link + associated_link_name: + Desc: text NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + associated_link_url: + Desc: text NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + # image collection props + image_collection_name: + Desc: The name of the image collection exactly as it appears at the location where the collection can be viewed and/or accessed. + Src: ICDC + Type: string + Req: 'Yes' + Key: true + Tags: + Labeled: Collection + image_type_included: + Desc: A list of the image types included in the image collection, drawn from a list of acceptable values. + Src: ICDC + Type: + value_type: list + Enum: #updated to accurately reflect the CMB image types + - CT # specific term used by both IDC and TCIA + - Digital Radiography + - DX # specific term used by both IDC and TCIA + - MR # specific term used by both IDC and TCIA + - MRI + - NM # specific term used by both IDC and TCIA + - Nuclear Medicine + - PET + - PT # specific term used by both IDC and TCIA + - Ultrasound + - US # specific term used by both IDC and TCIA + Req: 'Yes' + Tags: + Labeled: Image Types + image_collection_url: + Desc: The external url via which the image collection can be viewed and/or accessed. + Src: ICDC + Type: string + Req: 'Yes' + repository_name: + Desc: The name of the image repository within which the image collection can be found, stated in the form of the appropriate acronym. + Src: ICDC + Type: string + Req: 'Yes' + collection_access: + Desc: Indicator as to whether the image collection can be accessed only via the cloud, accessed only via download, or accessed via both mechanisms. + Src: ICDC + Enum: + - Cloud + - Download + - Unrestricted + Req: 'Yes' + # subject + subject_id: #CMB Participant ID + Desc: The globally unique ID by which any given subject can be unambiguously identified and displayed across studies/trials.
NOT CURRENTLY ASSIGNED ANY CDE
This property is used as the key via which child records, e.g. biospecimen records, can be associated with the appropriate subject during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + biomarker_results_available: + Desc: Indicator as to whether any biomarker results relating to the subject in question are available.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + radiology_report_available: # aka CMB's RADIOLOGY REPORT annotation #6944764 + Desc: Indicator as to whether any radiology reports results relating to the subject in question are available. CDE ID = 6944764 + Term: + - Origin: caDSR + Code: '6944764' + Value: Report Upload + Src: value + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + radiology_images_available: + Desc: Indicator as to whether any radiology images relating to the subject in question are available.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + histology_images_available: + Desc: Indicator as to whether any histology images relating to the subject in question are available.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + # demographic + demographic_id: # this will be the same value as subject_id + Desc: A unique identifier of each demographic record, used to identify the correct demographic records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Key: true + age_at_enrollment: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - years + value_type: number + Req: 'Yes' + sex: + Desc: text CDE ID = 6343385 + Term: + - Origin: caDSR + Code: '6343385' + Value: Sex + Src: CMB + Enum: + - F + - Female + - M + - Male + - U + - UNDIFFERENTIATED + Req: 'Yes' + reported_gender: + Desc: text CDE ID = 10748236 + Term: + - Origin: caDSR + Code: '10748236' + Value: Person Reported Gender Type + Src: DSS + Enum: + - Choose Not to Disclose + - Intersex + - Female + - Female-To-Male + - Male + - Male-To-Female + - None Of These Describe Me + - Non-Conforming Gender + - Refused to Answer + - Unknown + Req: 'Yes' + race: # details straight from the CMB Catalog Site + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - American Indian or Alaska Native + - Black or African American + - Native Hawaiian or other Pacific Islander + - Not Reported + - Unknown + Req: 'Yes' + ethnicity: # details straight from the CMB Catalog Site + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Hispanic or Latino + - Not Hispanic or Latino + - Not reported + - Unknown + Req: 'Yes' + ncbi_taxonomy_id: # perhaps this a direct property of subject? + Desc: text CDE ID = 10543100 + Term: + - Origin: caDSR + Code: '10543100' + Value: Subject National Center for Biotechnology Information Taxonomy Identifier Integer + Src: DSS + Type: integer # not enumerated + Req: 'Yes' + ncbi_taxonomy_name: # perhaps this a direct property of subject? + Desc: text CDE ID = 10543082 + Term: + - Origin: caDSR + Code: '10543082' + Value: Subject National Center for Biotechnology Information Taxonomy Name Text + Src: DSS + Type: string # supposedly enumerated but no permissible values listed + Req: 'Yes' + # exposure + exposure_id: + Desc: A unique identifier of each exposure record, used to identify the correct exposure records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Type: string + Req: 'Yes' + Key: true + environmental_exposure_type: + Desc: text CDE ID = 11256813 + Term: + - Origin: caDSR + Code: '11256813' + Value: Environmental Exposure Type + Src: DSS + Enum: + - Asbestos Exposure + - Chemical Exposure + - Marijuana Smoke Exposure + - Radiation Exposure + - Radon Exposure + - Respirable Crystalline Silica Exposure + - Smoke Exposure + - Smokeless Tobacco Exposure + - Tobacco Related Exposure + - Wood Dust Exposure + Req: 'Yes' + carcinogen_exposure: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + # diagnosis + diagnosis_id: # this will be the same value as subject_id + Desc: text NO CDE REQUIRED
This property is used as the key via which child records, e.g. clinical reports, can be associated with the appropriate diagnosis record during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + icd_10_disease_code: + Desc: text CDE ID = 11479873 + Term: + - Origin: caDSR + Code: '11479873' + Value: Diagnosis Disease or Disorder ICD-10-CM Code + Src: DSS + Type: string + Req: Preferred + meddra_disease_code: # the CMB Catalog Site's "Primary Diagnosis (medDRA Disease Code)" is actually a concatenation of medDRA code and CTEP disease code + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: # actual values as displayed within the CMB's Catalog Site + - 10000884-Acute Myeloid Leukemia Not Otherwise Specified + - 10010029 - Colorectal Carcinoma + -10066354 - Gastroesophageal Junction Adenocarcinoma + - 10029514 - Lung Non-Small Cell Carcinoma + - 10041071 - Lung Small Cell Carcinoma + - 10053571 - Melanoma + - 10028566 - Plasma Cell Myeloma + - 10036910 - Prostate Carcinoma + Req: 'Yes' + ctep_disease_code: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Enum: # actual values in the latest CMB data submission + - Acute Myeloid Leukemia Not Otherwise Specified # 10000884 + - Adenocarcinoma of the Gastroesophageal Junction # 10066354 + - Colon Adenocarcinoma # 10010029 + - Colorectal Carcinoma # 10010029 + - Melanoma # 10053571 + - Non-Small Cell Lung Carcinoma # 10029514 + - Plasma Cell Myeloma # 10028566 + - Prostate Carcinoma # 10036910 + - Small Cell Lung Carcinoma # 10041071 + Req: 'Yes' + snomed_disease_code: #CMB Disease Stage + Desc: text CDE ID = 6642369 + Term: + - Origin: caDSR + Code: '6642369' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + diagnosis_date: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - days + value_type: number + Req: 'Yes' + icd_o_primary_site: + Desc: text CDE ID = 11341616 + Term: + - Origin: caDSR + Code: '11341616' + Value: Diagnosis Primary Anatomic Site ICD-O-3 Code + Src: DSS + Type: string + Req: 'Yes' + primary_disease_site: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + histological_subtype: + Desc: text CDE ID = 7344580 + Term: + - Origin: caDSR + Code: '7344580' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + date_of_confirmation_of_histology: + Desc: text CDE ID = 6409589 + Term: + - Origin: caDSR + Code: '6409589' + Value: Data Element Name + Src: CMB + Type: + units: + - days + value_type: number + Req: 'Yes' + tumor_grade: + Desc: text CDE ID = 11325685 + Term: + - Origin: caDSR + Code: '11325685' + Value: Diagnosis Tumor Grade + Src: DSS + Enum: + - GB + - GX + - G1 + - G2 + - G3 + - G4 + - High Grade + - Intermediate Grade + - Low Grade + - Not Applicable + - Not Reported + - Grade Cannot Be Assessed # from CMB Catalog Site + - Moderately Differentiated # from CMB Catalog Site + - Poorly Differentiated # from CMB Catalog Site + - Well Differentiated # from CMB Catalog Site + Req: 'Yes' + subject_age_at_diagnosis: + Desc: text CDE ID = 10609539 + Term: + - Origin: caDSR + Code: '10609539' + Value: Subject Age at Diagnosis Integer + Src: DSS + Type: + units: + - years + value_type: number + Req: 'Yes' + # treatment + treatment_id: + Desc: A unique identifier of each treatment record, used to identify the correct treatment records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Type: string + Req: 'Yes' + Key: true + x_targeted_therapy: # CMTRT_DSL in CMB Data Submission - 5a_TargetedTherapyAdministration??? + Desc: text CDE ID = 6400634 + Term: + - Origin: caDSR + Code: '6400634' + Value: Concomitant Medication Name # check on this, repeated below + Src: CMB + Type: string + Req: 'No' + x_therapy: # Non-Targeted Therapy in CMB Data Submission - 5a_NonTargetedTherapySupplement + Desc: text CDE ID = 6400634 + Term: + - Origin: caDSR + Code: '6400634' + Value: Concomitant Medication Name + Src: CMB + Type: string + Req: 'No' + x_surgical_procedure_name: # SURG_PROC_NAME in CMB Data Submission - 5a_NonTargetedSurgerySupplement + Desc: text CDE ID = 6411539 + Term: + - Origin: caDSR + Code: '6411539' + Value: Procedure Name + Src: CMB + Type: string + Req: 'No' + x_radiological_procedure_name: # RAD_PROC_NAME in CMB Data Submission - 5a_NonTargetedRadiationSupplement + Desc: text CDE ID = 6411539 + Term: + - Origin: caDSR + Code: '6411539' + Value: Procedure Name + Src: CMB + Type: string + Req: 'No' + # targeted_therapy + targeted_therapy_id: + Desc: A unique identifier of each targeted therapy record, used to identify the correct targeted therapy records during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + targeted_therapy: # CMTRT_DSL in CMB Data Submission - 5a_TargetedTherapyAdministration??? + Desc: text CDE ID = 6400634 + Term: + - Origin: caDSR + Code: '6400634' + Value: Concomitant Medication Name # check on this, repeated below + Src: CMB + Type: string + Req: 'Yes' + # therapy + therapy_id: + Desc: A unique identifier of each non-targeted therapy record, used to identify the correct non-targeted therapy records during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + therapy: # Non-Targeted Therapy in CMB Data Submission - 5a_NonTargetedTherapySupplement + Desc: text CDE ID = 6400634 + Term: + - Origin: caDSR + Code: '6400634' + Value: Concomitant Medication Name + Src: CMB + Type: string + Req: 'Yes' + # surgical_procedure + surgical_procedure_id: + Desc: A unique identifier of each surgical procedure record, used to identify the correct surgical procedure therapy records during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + surgical_procedure_name: # SURG_PROC_NAME in CMB Data Submission - 5a_NonTargetedSurgerySupplement + Desc: text CDE ID = 6411539 + Term: + - Origin: caDSR + Code: '6411539' + Value: Procedure Name + Src: CMB + Type: string + Req: 'Yes' + # radiological_procedure + radiological_procedure_id: + Desc: A unique identifier of each radiological procedure record, used to identify the correct radiological procedure record during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + radiological_procedure_name: # RAD_PROC_NAME in CMB Data Submission - 5a_NonTargetedRadiationSupplement + Desc: text CDE ID = 6411539 + Term: + - Origin: caDSR + Code: '6411539' + Value: Procedure Name + Src: CMB + Type: string + Req: 'Yes' + # subject_status + subject_staus_id: # this will be the same value as subject_id + Desc: A unique identifier of each subject_status record, used to identify the correct subject_status records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: integer + Req: 'Yes' + Key: true + survival_status: # in CMB Data Submission - 5a_FollowUp + Desc: text CDE ID = 7050072 + Term: + - Origin: caDSR + Code: '7050072' + Value: Survival Status + Src: CMB + Enum: + - ALIVE + - DEAD + - UNKNOWN + - Alive with Disease # from CMB Data Submission - 5a FollowUp + - Alive with No Evidence of Disease # from CMB Data Submission - 5a FollowUp + - Alive, Disease Status Unknown # from CMB Data Submission - 5a FollowUp + - Dead # from CMB Data Submission - 5a FollowUp + Req: 'Yes' + primary_cause_of_death: # in CMB Data Submission - 5a_DeathSummary + Desc: text CDE ID = 6421593 + Term: + - Origin: caDSR + Code: '6421593' + Value: Primary Cause of Death + Src: CMB + Type: string + Req: 'No' + off_study: # infer from OFF_STUDY_DATE in CMB Data Submission - 5a_OffStudy? + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + off_study_reason: # in CMB Data Submission - 5a_OffStudy + Desc: text CDE ID = 6355981 + Term: + - Origin: caDSR + Code: '6355981' + Value: Disposition Event Dictionary-Derived/Standardized Term + Src: value + Enum: # enumerated via a long list of ~40 acceptable terms, but do these in fact represent the values CMB has/uses? + - ADVERSE EVENT + - APPROVED DRUG AVAILABLE FOR INDICATION + - COMPLETED + - DEATH + - DISEASE RELAPSE + - FAILURE TO MEET CONTINUATION CRITERIA + - FAILURE TO MEET RANDOMIZATION CRITERIA + - LACK OF EFFICACY + - LOGISTICAL PROBLEM + - LOST TO FOLLOW-UP + - MET ELIGIBILITY CRITERIA BUT NOT NEEDED + - NEVER DOSED + - NO LONGER CLINICALLY BENEFITING + - NON-COMPLIANCE WITH NON-STUDY DEVICE + - NON-COMPLIANCE WITH STUDY DEVICE + - NON-COMPLIANCE WITH STUDY DRUG + - NON-COMPLIANCE WITH STUDY SCHEDULE + - OTHER + - PARTNER PREGNANCY + - PHYSICIAN DECISION + - PREGNANCY + - PROGRESSIVE DISEASE + - PROTOCOL DEVIATION + - PROTOCOL VIOLATION + - PROTOCOL-SPECIFIED WITHDRAWAL CRITERION MET + - RANDOMIZED BY MISTAKE + - RANDOMIZED BY MISTAKE WITH STUDY TREATMENT + - RANDOMIZED BY MISTAKE WITHOUT STUDY TREATMENT + - RECOVERY + - REQUIRES PROHIBITED MEDICATION + - SCREEN FAILURE + - SCREENING NOT COMPLETED + - SITE TERMINATED BY SPONSOR + - SPONSOR REQUEST + - STUDY TERMINATED BY SPONSOR + - TECHNICAL PROBLEMS + - WITHDRAWAL BY PARENT/GUARDIAN + - WITHDRAWAL BY SUBJECT + - WITHDRAWAL OF ASSENT + - WITHDRAWAL OF CONSENT + Req: 'No' + # specimen + specimen_id: # this will be the exact same value that the CMB uses for biospecimen_id + Desc: text NO CDE REQUIRED?
This property is used as the key via which child records, e.g. data files generated via Oncomine panel analyses, can be associated with the appropriate specimen during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + parent_specimen_id: # this will be the exact same value that the CMB uses for parent_biospecimen_id + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + days_from_diagnosis_to_specimen_collection: + Desc: text CDE ID = 11253404 + Term: + - Origin: caDSR + Code: '11253404' + Value: Specimen Diagnosis To Specimen Collection Day Count + Src: DSS + Type: + units: + - days + value_type: number + Req: 'No' + days_from_first_subject_visit_to_specimen_collection: + Desc: text CDE ID = 11248874 + Term: + - Origin: caDSR + Code: '11248874' + Value: Specimen First Subject Visit To Specimen Collection Day Count + Src: DSS + Type: + units: + - days + value_type: number + Req: 'No' + days_from_first_treatment_to_specimen_collection: + Desc: text CDE ID = 11250807 + Term: + - Origin: caDSR + Code: '11250807' + Value: Specimen First Treatment To Specimen Collection Day Count + Src: DSS + Type: + units: + - days + value_type: number + Req: 'No' + days_from_initial_genomic_sequencing_to_Specimen_collection: + Desc: text CDE ID = 11251130 + Term: + - Origin: caDSR + Code: '11251130' + Value: Specimen Initial Genomic Sequencing To Specimen Collection Day Count + Src: DSS + Type: + units: + - days + value_type: number + Req: 'No' + days_from_recurrence_to_specimen_collection: + Desc: text cde id = 11251133 + Term: + - Origin: caDSR + Code: '11251133' + Value: Specimen Recurrence To Specimen Collection Day Count + Src: DSS + Type: + units: + - days + value_type: number + Req: 'No' + days_from_specimen_collection_to_initial_pathologic_diagnosis: + Desc: text CDE ID = 11253404 + Term: + - Origin: caDSR + Code: '11253404' + Value: Specimen Collection To Initial Pathologic Diagnosis Day Count + Src: DSS + Type: + units: + - days + value_type: number + Req: 'No' + collection_date: #from the CMB perspective, this is equivalent to the above, with collection date indexed to enrollment date caDSR 6401821 + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - days + value_type: number + Req: 'Yes' + icd_o_3_tissue_morphology: + Desc: text CDE ID = 11326261 + Term: + - Origin: caDSR + Code: '11326261' + Value: Specimen Tumor Tissue ICD-O-3 Morphology Code + Src: DSS + Type: string # actually enumerated to a long list of ICD-O-3 codes, which is too long to specifiy here; we should instead just include the codes that will be seen in CMB data? + Req: 'No' + obi_specimen_type: + Desc: text CDE ID = 11253427 + Term: + - Origin: caDSR + Code: '11253427' + Value: Specimen Type Text # this does not match the caDSR record of "Specimen Material OBIB Source" + Src: DSS + Type: integer + Req: 'No' + specimen_category: #is this the Catalog Site's "Tissue Ccategory" i.e. the indicator as to normal vs primary vs metastatic? Based on the caDSR list of permissible values HELL NO! + Desc: text CDE ID = 7069877 + Term: + - Origin: caDSR + Code: '7069877D' + Value: Data Element Name + Src: CMB + Enum: + - BLOOD + - BONE MARROW + - BUCCAL CELL SAMPLE + - CEREBROSPINAL FLUID + - FORMALIN FIXED PARAFFIN EMBEDDED TISSUE + - FORMALIN FIXED TISSUE + - FRESH TISSUE + - FROZEN TISSUE + - OTHER BODILY FLUIDs + - SALIVA + - STOOL + - URINE + Req: 'Yes' + type_of_tissue: # this looks very much like CMB Catalog "Tissue Category" + Desc: text CDE ID = 7003892 + Term: + - Origin: caDSR + Code: '7003892' + Value: Tissue Type + Src: CMB + Enum: + - ANT # Adjacent Normal Tissue + - NORMAL + - METASTATIC + - OTHER + - PRIMARY + Req: 'Yes' + anatomical_collection_site: # details straight from the CMB Catalog Site + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: integer + Req: 'Yes' + parent_biospecimen_type: # details straight from the CMB Catalog Site + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Bone Marrow Aspirate + - Cell Block + - EDTA Blood + - FFPE Block + - Formalin Bone Marrow Biopsy + - Formalin Fixed Tissue + - Snap Frozen Tissue + - Stained Bone Marrow Aspirate Slide + - Streck Blood + - Unstained Bone Marrow Aspirate Slide + - Unstained Slide + Req: 'Yes' + biospecimen_type: # details straight from the CMB Catalog Site + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - BMMC + - Blood Pellet + - Cells + - Curl + - DNA + - FFPE Block + - Glass H&E Slide + - Glass Slide Smear + - Glass Unstained Slide + - PBMC + - Plasma + - Plasma-cf + - RNA + - Snap Frozen Unfixed Tissue + - Tissue Curl + - cDNA + Req: 'Yes' + tissue_category: # details straight from the CMB Catalog Site. Based upon the values in use, this looks like caDSR "Tissue Type" aka CTDC "type_of_tissue" + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Enum: + - Metastatic + - Normal + - Primary + Req: 'Yes' + assessment_timepoint: # this appears to be the CMB Catalog Site's "Collection Timepoint" + Desc: text CDE ID = 7065963 + Term: + - Origin: caDSR + Code: '7065963' + Value: Data Element Name + Src: CMB + Enum: + - ARCHIVAL + - BASELINE + - ON TREATMENT + - PROGRESSION + Req: 'Yes' + # data_file + data_file_name: + Desc: text CDE ID = 11284037 + Term: + - Origin: caDSR + Code: '11284037' + Value: Electronic Data File Name + Src: DSS + Type: string + Req: 'Yes' + data_file_type: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Enum: + - Clinical Report + - Variant Call File + Req: 'Yes' + data_file_description: + Desc: text CDE ID = 11280338 + Term: + - Origin: caDSR + Code: '11280338' + Value: Electronic Data File Description Text + Src: DSS + Type: string + Req: 'Yes' + data_file_format: + Desc: text CDE ID = 11416926 + Term: + - Origin: caDSR + Code: '11416926' + Value: Electronic Data File Format Type + Src: DSS + Type: string + Req: 'Yes' + data_file_size: + Desc: text CDE ID = 11479876 + Term: + - Origin: caDSR + Code: '11479876' + Value: Electronic Data File Size Integer + Src: DSS + Type: number + Req: 'Yes' + data_file_checksum_value: + Desc: text CDE ID = 11480133 + Term: + - Origin: caDSR + Code: '11480133' + Value: Electronic Data File Checksum Value + Src: DSS + Type: string + Req: 'Yes' + data_file_checksum_type: + Desc: text CDE ID = 11475057 + Term: + - Origin: caDSR + Code: '11475057' + Value: Electronic Data File Checksum Type + Src: DSS + Enum: + - md5sum + - sha1 + - sha256 + Req: 'Yes' + data_file_compression_status: + Desc: text CDE ID = 11387114 + Term: + - Origin: caDSR + Code: '11387114' + Value: Electronic Data File Compression Type + Src: DSS + Enum: + - Compressed + - Uncompressed + - Unknown + Req: Preferred + data_file_uuid: + Desc: text NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + data_file_location: + Desc: text NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' diff --git a/cmb-data/props-ctdc-cmb.yml b/cmb-data/props-ctdc-cmb.yml new file mode 100644 index 00000000..2fe1c92e --- /dev/null +++ b/cmb-data/props-ctdc-cmb.yml @@ -0,0 +1,58 @@ +Properties: + domain: trialcommons.cancer.gov + rel_prop_delimiter: "$" + + type_mapping: + string: String + number: Float + integer: Int + boolean: Boolean + array: Array + object: Object + datetime: DateTime + date: Date + TBD: String + + + plurals: + node: nodes + program: programs + project: projects + principal_investigator: principal_investigators + study: studies + associated_link: associated_links + image_collection: image_collections + subject: subjects + demographic: demographics + exposure: exposures + diagnosis: diagnoses + treatment: treatments + targeted_therapy: targeted_therapies + therapy: therapies + surgical_procedure: surgical_procedures + radiological_procedure: radiological_procedures + subject_status: subject_status + specimen: specimens + data_file: data_files + + id_fields: + program: program_short_name + project: project_short_name + study: study_short_name + image_collection: image_collection_url + subject: subject_id + demographic: demographic_id + exposure: exposure_id + diagnosis: diagnosis_id + treatment: treatment_id + subject_status: subject_staus_id + specimen: specimen_id + associated_link: associated_link_name + + # node: nodes + # principal_investigator: principal_investigators + # targeted_therapy: targeted_therapies + # therapy: therapies + # surgical_procedure: surgical_procedures + # radiological_procedure: radiological_procedures + # data_file: data_files diff --git a/sample-cli.txt b/sample-cli.txt new file mode 100644 index 00000000..dd7f005e --- /dev/null +++ b/sample-cli.txt @@ -0,0 +1,8 @@ +sample cli: + +you need to install APOC plugin in Neo4j DB before loading data + + +./my_env/bin/python3 ./loader.py /Users/cheny39/Documents/work/ctdc/crdc-ctdc-dataloader/cmb-data/cmb-local.yml --data /Users/cheny39/Documents/work/ctdc/crdc-ctdc-dataloader/cmb-data/2023-8-23 + +