"""Helpers that strip ``$$...$$`` inline markup from About-page content."""
import re

try:
    from bento.common.utils import get_logger
except ImportError:
    # Fall back to the standard library so this module can still be imported
    # (e.g. for unit tests) where the bento package is not installed.
    from logging import getLogger as get_logger

logger = get_logger('AboutPageContentCleaner')


class AboutPageContentCleaner:
    """Remove ``$$...$$`` markup from About-page text and content structures."""

    # One compiled pattern per markup form; group 1 is the text that is kept.
    # DOTALL lets a markup span that crosses a line break still be recognised.
    _MARKUP_PATTERNS = (
        re.compile(r'\$\$\[(.*?)\]\(.*?\)\$\$', re.DOTALL),  # $$[label](target)$$
        re.compile(r'\$\$#(.*?)#\$\$', re.DOTALL),           # $$#text#$$
        re.compile(r'\$\$\*(.*?)\*\$\$', re.DOTALL),         # $$*text*$$
    )

    # List-style keys, in the order in which an item is tested for them.
    _LIST_KEYS = ('listWithDots', 'listWithAlphabets', 'listWithNumbers')

    @staticmethod
    def clean_text(text):
        """Return *text* without ``$$...$$`` markup and with whitespace collapsed.

        Inline links keep only their label; ``$$#...#$$`` and ``$$*...*$$``
        keep only their inner text. Runs of whitespace become a single space
        and leading/trailing whitespace is dropped.

        Non-string values (for example numbers or ``None``) are returned
        unchanged instead of raising ``TypeError``.
        """
        if not isinstance(text, str):
            return text
        for pattern in AboutPageContentCleaner._MARKUP_PATTERNS:
            text = pattern.sub(r'\1', text)
        return ' '.join(text.split())

    @staticmethod
    def _clean_list(entries):
        """Clean every entry of a list, including nested ``listWithAlphabets``."""
        clean = AboutPageContentCleaner.clean_text
        cleaned = []
        for entry in entries or []:
            # The isinstance guard matters: ``in`` on a plain string would be a
            # substring test and then fail when indexing the string by key.
            if isinstance(entry, dict) and 'listWithAlphabets' in entry:
                inner = [clean(sub_entry) for sub_entry in entry['listWithAlphabets'] or []]
                cleaned.append({'listWithAlphabets': inner})
            else:
                cleaned.append(clean(entry))
        return cleaned

    @staticmethod
    def remove_formatting_content(page_name, content):
        """Return a cleaned copy of a page's *content* list.

        Args:
            page_name: Page identifier, used only in the log message.
            content: Iterable of content items. Dict items are handled by the
                first of these keys they contain: ``paragraph``,
                ``listWithDots``, ``listWithAlphabets``, ``listWithNumbers``,
                ``table``. ``None`` is treated as an empty list.

        Returns:
            A new list. Paragraph items are shallow copies with the paragraph
            cleaned; list and table items are rebuilt containing only their
            cleaned list/table; any other item is passed through as-is. The
            input items are not modified.

        Raises:
            KeyError: If a ``table`` item lacks ``head`` or ``body``.
        """
        clean = AboutPageContentCleaner.clean_text
        cleaned_content = []
        for item in content or []:
            if not isinstance(item, dict):
                cleaned_content.append(item)
                continue

            if 'paragraph' in item:
                # Copy rather than mutate, so the caller's data stays intact.
                cleaned_content.append({**item, 'paragraph': clean(item['paragraph'])})
                continue

            list_key = next(
                (key for key in AboutPageContentCleaner._LIST_KEYS if key in item),
                None,
            )
            if list_key is not None:
                cleaned_content.append(
                    {list_key: AboutPageContentCleaner._clean_list(item[list_key])}
                )
            elif 'table' in item:
                table = item['table']
                cleaned_content.append({
                    'table': {
                        'head': [clean(cell) for cell in table['head']],
                        'body': [[clean(cell) for cell in row] for row in table['body']],
                    }
                })
            else:
                cleaned_content.append(item)

        logger.info(f'Cleaned content for "{page_name}"')
        return cleaned_content
+ - paragraph: "$$#CTDC GUI#$$" + - paragraph: "The GUI provides users a distilled set of parameters (faceted querying) they can use to explore a subset of the CTDC data model. " + - paragraph: "$$#CTDC API#$$" + - paragraph: "CTDC is based on a Graph database, featuring a GraphQL API (Java) and a React front-end (JavaScript). Each tier in the application stack is designed to be modular and adaptable for a variety of use-cases and scenarios. $$[A GraphQL API](type:internal url:/#/graphql target:_blank)$$ enables querying of the entire data model. The API is provided “as is:” there are no warranties or conditions arising out of usage of these services." + - paragraph: "$$#GITHUB#$$" + - paragraph: "The $$[ CTDC GitHub repo](https://github.com/CBIIT/crdc-ctdc-ui)$$ is available for research, usage, forking, and pull requests. The codebase is intended for sharing and building frameworks for related initiatives and projects. The CTDC GitHub repo has documentation about how to access the system, including endpoints and recommendations for tools and example queries. Both the project and documentation are maintained and updated in accordance with major and minor releases." +- page: '/purpose' + title: "Purpose" + primaryContentImage: https://raw.githubusercontent.com/CBIIT/datacommons-assets/ctdc_Assets/ctdc/images/aboutPages/About_Purpose.png + content: + - paragraph: "The goals of the CTDC are to advance cancer research and accelerate the development of innovative therapies by improving access to data from NCI-sponsored clinical studies, including genomic panel assay and clinical data. The CTDC does this through: " + - paragraph: "$$*Graphical User Interface (GUI)*$$ – The CTDC’s GUI includes an Explore dashboard with search filters to help users visualize, explore, and navigate complex metadata without the need for coding or specialized technical skills. " + - paragraph: "$$*Data consolidation:*$$ The CTDC consolidates data from clinical studies funded by the NCI. 
This allows researchers to analyze data collectively, leading to deeper insights and a better understanding of cancer’s complexities. " + - paragraph: "$$*Data harmonization:*$$ Data harmonization ensures that data across studies within the CTDC are standardized and organized in a consistent manner to improve data compatibility, integration, and meta-analysis. " + - paragraph: "$$*Integration with NCI Cloud Resources:*$$ Users can easily transfer selected CTDC data to the $$[ Velsera Seven Bridges Cancer Genomics Cloud](https://datacommons.cancer.gov/analytical-resource/seven-bridges-cancer-genomics-cloud)$$ (SB-CGC), a cloud-based platform for cancer research funded by the NCI. Here, researchers can integrate multi-omic data across sources and leverage access to a multitude of tools and workflows for computation and analysis. " + - paragraph: "$$*Fueling collaborative research:*$$ By centralizing data and making them available through NCI’s Cloud Resources, the CTDC promotes secure collaboration among distributed research groups, fostering interdisciplinary partnerships. " + - paragraph: "$$*Democratizing Data access:*$$ Data in the CTDC are made available through various access restrictions including open (no registration required) and controlled access (registration required). The CTDC aims to make each dataset as open as possible while protecting participant privacy and adhering to regulations, agreements, and other considerations specific to each study. " + - paragraph: "$$*Alignment to F.A.I.R data principles:*$$ The CTDC adheres to Findable, Accessible, Interoperable, and Reusable ($$[FAIR]( https://www.go-fair.org/fair-principles/)$$) principles for scientific data management and stewardship. CTDC seeks to provide clearly organized data and guidance enabling end users to search for, find, and access data of interest.
The emphasis on harmonization described above promotes the interoperability of data within and across the CRDC ecosystem and beyond and promotes reusability of data beyond the primary publication." +- page: '/support' + title: "Support" + primaryContentImage: https://raw.githubusercontent.com/CBIIT/datacommons-assets/ctdc_Assets/ctdc/images/aboutPages/About_Support.png + content: + - paragraph: "If you have any questions, please contact us at $$[NCICTDCHelpDesk@mail.nih.gov](NCICTDCHelpDesk@mail.nih.gov)$$." +- page: '/cloud-computing' + title: "Cloud computing" + primaryContentImage: https://raw.githubusercontent.com/CBIIT/datacommons-assets/ctdc_Assets/ctdc/images/aboutPages/About_CRDC.png + content: + - paragraph: "$$#CTDC and NCI’s Cloud Resources#$$ " + - paragraph: "The CTDC supports analysis via the $$[Seven Bridges Cancer Genomics Cloud](https://datacommons.cancer.gov/analytical-resource/seven-bridges-cancer-genomics-cloud-developed-velsera#)$$(SB-CGC). SB-CGC supports data access through a web-based user interface, programmatic access to analytic tools and workflows, and collaborative data analysis and sharing pipelines. Users can transfer data of interest from the CTDC directly to SB-CGC, eliminating the need to download and store extremely large datasets. Through the SB-CGC, researchers can bring analysis tools to the data in the cloud, instead of the traditional process of bringing the data to the tools on local hardware. Analyzing data through the cloud offers many benefits including: " + - listWithDots : + - "State of the art analysis using high-performance computing" + - "Remote access and flexibility for nationally or globally distributed teams" + - "On-demand computational capacity to scale resources as needed " + - paragraph: "Data brought to the SB-CGC can be analyzed using more than 200 preinstalled, curated bioinformatics tools and workflows. 
Researchers can also extend the functionality of the platform by adding their own data and tools via an intuitive software development kit. " + - paragraph: "$300 in credits are available to new users who want to test out the platform. " + - paragraph: "For more information on getting started with SB-CGC including onboarding videos and more, visit: $$[https://www.cancergenomicscloud.org/getting-started](https://www.cancergenomicscloud.org/getting-started )$$." +- page: '/data-use' + title: "CTDC Data Terms of Use" + primaryContentImage: https://raw.githubusercontent.com/CBIIT/datacommons-assets/ctdc_Assets/ctdc/images/aboutPages/About_CRDC.png + content: + - paragraph: "CTDC’s data terms of use are consistent with applicable international, national, tribal, and state laws and regulations, as well as institutional policies for data submission, access, and sharing to help enable broad data access to the extent possible." + - paragraph: "$$#DATA ACCESS#$$" + - paragraph: "Data is made available through open-access, registered access, and controlled access tiers. Visit our $$[Request Access](type:internal url:/#/request-access target:_blank )$$ page for more information. Access to controlled data is restricted to authorized users. Users and Users’ institutions are responsible for understanding terms of use and adhering to study-specific Data Use Agreement(s) (DUAs), Institutional Review Board policies, and other relevant guidelines. A signed specific DUA may be required to access controlled-access tier data for individual trials or studies. If a DUA is required, it will be provided as part of the data request process. " + - paragraph: "$$#RE-IDENTIFICATION#$$" + - paragraph: "Data available within the CTDC includes de-identified clinical study data subject to both general and dataset-specific data use policies. 
Users of any data provided by CTDC, whether open, registered or controlled access, agree not to attempt to reidentify any individual participant in any study represented by CTDC data, for any purpose whatever. This includes, but is not limited to, the use of analytical techniques of reidentification on genomic or clinical data. " + - paragraph: "$$#INTELLECTUAL PROPERTY#$$" + - paragraph: "NIH considers CTDC data as pre-competitive and discourages users from making IP claims derived directly from the available dataset(s). NIH-provided data, and conclusions derived thereof, shall remain freely available, without requirement for licensing. However, the NIH also recognizes the importance of the subsequent development of IP on downstream discoveries, especially in therapeutics, which will be necessary to support full investment in products that the public needs. " + - paragraph: "For more information about the CTDC and or questions regarding intellectual property, please contact us at $$[NCICTDCHelpDesk@mail.nih.gov](NCICTDCHelpDesk@mail.nih.gov)$$. " + - paragraph: "$$#CITING CTDC IN PUBLICATIONS:#$$" + - paragraph: "Whenever using CTDC data in a publication, please cite: " + - paragraph: "1. CTDC resource or individual study " + - paragraph: "           ● To cite the resource, cite the CTDC website ( $$[clinical.datacommons.cancer.gov](type:internal url:/#/ target:_blank )$$ ) " + - paragraph: "           ● To cite an individual study, either cite the CTDC study id (e.g., NCT04314401) " + - paragraph: "OR" + - paragraph: "           ● Cite the study URL: (e.g., $$[https://clinical.datacommons.cancer.gov/#/study/NCT04314401](type:internal url:/#/study/NCT04314401 target:_blank )$$). " + - paragraph: "2. Primary publication of the data (when applicable) " + - paragraph: "           ● The primary publication from the original data producers is available on the individual study summary pages. 
" + - paragraph: "$$#QUESTIONS#$$" + - paragraph: "CTDC strongly encourages investigators to contact $$[NCICTDCHelpDesk@mail.nih.gov](NCICTDCHelpDesk@mail.nih.gov)$$ with any questions or concerns related to publication of their analyses." +- page: '/data-harmonization' + title: "Data Harmonization" + primaryContentImage: https://raw.githubusercontent.com/CBIIT/datacommons-assets/ctdc_Assets/ctdc/images/aboutPages/About_CRDC.png + content: + - paragraph: "CTDC data elements have been aligned against NCI’s $$[ Data Standards Services ](https://datascience.cancer.gov/data-commons/data-standards-services)$$(DSS) common data elements (CDEs) curated through the CRDC in $$[ the Cancer Data Standards Registry and Repository](https://cadsr.cancer.gov/onedata/dmdirect/NIH/NCI/CO/CDEDD?filter=Administered%20Item%20%28Data%20Element%20CO%29.CDEDD%20Classification.P_ITEM_ID_VER=10466051v1)$$(caDSR). A list of the current CRDC Standard Data Elements can be found $$[here](https://cadsr.cancer.gov/onedata/clsc_datamanager_container.do?s=JTAycSUwNiUwQVQlMDUwelQrJTI0JTI5RyU3RiUyMkglMDAqc2IlMERxd3d1JTAwJTEzJTBEJTdDaiUyQ0VlJTJDcSUxNSUwNCUwRHIlMEElMDIlMDJ2c3IlMDIlN0MlMDk3JTJDYSUyNSUxRSUxMnclMDVyJTdCJTI3JTBEQnAlMjZRVC5NJTdDJTdGJTdDJTA5bEolMTQ )$$." +- page: "/data-model" + primaryContentImage: https://raw.githubusercontent.com/CBIIT/datacommons-assets/ctdc_Assets/ctdc/images/aboutPages/About_Model.png + title: "Data Model" + content: + - paragraph: "The CTDC data model is a representation of how data in the CTDC are arranged relative to each other. The data model is flexibly designed to accommodate an ever-expanding CTDC database." + - paragraph: "The SVG graphic below represents the current CTDC data model consisting of data nodes, node properties, and relationships (edges). It provides a comprehensive mapping of the system data, part of which may be viewed in the application and user interface. 
Additional nodes and properties beyond those presented on the front-end are available for inspection and querying via API at $$[https://clinical.datacommons.cancer.gov/#/graphql.](type:internal url:/#/graphql target:_blank )$$ " + - paragraph: "Information about the graphic data model, including the model description files, can be found on GitHub at $$[https://github.com/CBIIT/ctdc-model](https://github.com/CBIIT/ctdc-model)$$." + secondaryZoomImageTitle: "The CTDC Data Model" + secondaryZoomImage: 'https://raw.githubusercontent.com/CBIIT/ctdc-model/ctdc-model/model-desc/ctdc-model.svg' + + + + + \ No newline at end of file diff --git a/config/ctdc-local.yml b/config/ctdc-local.yml new file mode 100644 index 00000000..0e501a10 --- /dev/null +++ b/config/ctdc-local.yml @@ -0,0 +1,43 @@ +Config: + temp_folder: tmp + backup_folder: /tmp/data-loader-backups + neo4j: + # Location of Neo4j server, e.g., bolt://127.0.0.1:7687 + + # Schema files' locations + schema: + - /Users/davenportaw/Downloads/cmb-data/ctdc_model_file.yml + - /Users/davenportaw/Downloads/cmb-data/ctdc_model_properties_file.yml + + plugins: + # - module: loader_plugins.visit_creator + # class: VisitCreator + # - module: loader_plugins.individual_creator + # class: IndividualCreator + + #Property file location + prop_file: /Users/davenportaw/Downloads/cmb-data/props-ctdc-cmb.yml + + # Skip validations, aka. Cheat Mode + cheat_mode: false + # Validations only, skip loading + dry_run: false + # Wipe out database before loading, you'll lose all data! 
+ wipe_db: true + # Skip backup step + no_backup: true + # Automatically confirm deletion and database wiping (without asking user to confirm) + no_confirmation: false + # Max violations to display, default is 10 + max_violations: 10 + # Split transactions + split-transactions: true + + # S3 bucket name, if you are loading from an S3 bucket + s3_bucket: + # S3 folder for dataset + s3_folder: + # Loading mode, can be UPSERT_MODE, NEW_MODE or DELETE_MODE, default is UPSERT_MODE + loading_mode: + # Location of dataset + dataset: diff --git a/config/ctdc_model_file.yaml b/config/ctdc_model_file.yaml new file mode 100644 index 00000000..5b563180 --- /dev/null +++ b/config/ctdc_model_file.yaml @@ -0,0 +1,383 @@ +# Cancer Moonshot Biobank data model nodes, properties and relationships file +# Title case names are "reserved" (meaningful to the parser) +# Lower case names are labels for the entities + +Nodes: + program: + Desc: Within the Clinical Trial Data Commons, projects and studies/trials are grouped into discrete programs, based upon the origins and/or scientific nature of each project or study/trial. These programs may or may not directly relate to any formal program, e.g. NCI funding programs. The Program node contains the properties required to appropriately characterize any given CTDC program. + Tags: + Category: administrative + Assignment: core + Class: primary + Template: 'Yes' + Props: + - program_name #11444542 + - program_short_name #11459801 + project: + Desc: text + Tags: + Category: administrative + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - project_name #11459804 + - project_short_name #11459806 + principal_investigator: + Desc: The Principal Investigator node contains properties which identify the principal investigator(s) responsible for any given study/trial. A study/trial may have one or more principal investigators, and any given individual may be listed as a principal investigator on more than one study/trial. 
+ Tags: + Category: study + Assignment: core + Class: primary + Template: 'Yes' + Props: + - principal_investigator_first_name #10624731 + - principal_investigator_last_name #10624733 + - principal_investigator_middle_name #10624732 + - principal_investigator_orcid_id #10624734 + study: + Desc: The Study node contains properties required to characterize each study/trial in terms of a title, describe when, how and why the study/trial was conducted, and provide links to additional information about the study/trial. + Tags: + Category: study + Assignment: core + Class: primary + Template: 'Yes' + Props: + - study_name #11459810 + - study_short_name #11459812 + - study_id + - study_description + - study_type #11160683 + - dates_of_conduct + associated_link: + Desc: The Associated Link node contains the properties required to associate multiple links to additional information about any given study/trial to the appropriate study/trial, and define an intuitive label via which each link will be displayed within the UI. + Tags: + Category: study + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - associated_link_id + - associated_link_name + - associated_link_url + image_collection: + Desc: The Image Collection node is comprised of properties which describe collections of images that are associated with any given study/trial. These properties characterize such image collections in terms of the types of images they contain, where the collections are hosted, and how they can be accessed.
+ Tags: + Category: study + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - image_collection_id + - image_collection_name + - image_type_included + - image_collection_url + - repository_name + - collection_access + subject: + Desc: text + Tags: + Category: case # in lieu of "subject" being a category recognized by the DMN + Assignment: core + Class: primary + Template: 'Yes' + Props: + - subject_id + - biomarker_results_available + - radiology_report_available #6944764 + - radiology_images_available + - histology_images_available + demographic: + Desc: The Demographic node is comprised of properties which describe the key characteristics of each study/trial subject, such as sex, gender, race and ethnicity, etc. + Tags: + Category: case # in lieu of "subject" being a category recognized by the DMN + Assignment: core + Class: primary + Template: 'Yes' + Props: + - demographic_id + #- age_at_diagnosis + - age_at_enrollment + - race + - ethnicity + - sex + - reported_gender #10748236 + - height + - weight + - body_surface_area + - occupation + - income + - highest_level_of_education + #- site_of_enrollment + - ncbi_taxonomy_id #10543100 + - ncbi_taxonomy_name #10543082 + exposure: + Desc: environmental, workplace and lifestyle exposure(s) + Tags: + Category: case # in lieu of "subject" being a category recognized by the DMN + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - exposure_id + - environmental_exposure_type #11256813 + - carcinogen_exposure + diagnosis: + Desc: The Diagnosis node contains numerous properties which fully characterize the type of cancer with which any given study/trial subject was diagnosed, inclusive of disease stage, histology/pathology, and age at diagnosis.
+ Tags: + Category: clinical + Assignment: core + Class: primary + Template: 'Yes' + Props: + - diagnosis_id + - primary_diagnosis_disease_group + #- icd_10_disease_code #11479873 + - ctep_disease_term + - meddra_disease_code + - snomed_disease_term + - snomed_disease_code # this is disease plus stage as presented within the CMB Catalog + #- icd_o_primary_site #11341616 + - primary_disease_site + - histology + #- date_of_confirmation_of_histology #6409589 + - histological_subtype #7344580 + - stage_of_disease + - tumor_grade #11325685 + - date_of_diagnosis # quantified in days indexed to date of trial enrollment + #- subject_age_at_diagnosis #10609539 + # treatment: + # Desc: text + # Tags: + # Category: clinical + # Assignment: core + # Class: primary + # Template: 'Yes' + # Props: + # - treatment_id + # - x_targeted_therapy #6400634 + # - x_therapy #6400634 + # - x_surgical_procedure_name #6411539 + # - x_radiological_procedure_name #6411539 + targeted_therapy: + Desc: text + Tags: + Category: clinical_trial # in lieu of "treatment" being a category recognized by the DMN + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - targeted_therapy_id + - targeted_therapy #6400634 + - targeted_therapy_dose + - targeted_therapy_frequency + - targeted_therapy_start_and_end + - best_response_to_targeted_therapy + non_targeted_therapy: + Desc: text + Tags: + Category: clinical_trial # in lieu of "treatment" being a category recognized by the DMN + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - non_targeted_therapy_id + - non_targeted_therapy #6400634 + - non_targeted_therapy_dose + - non_targeted_therapy_frequency + - non_targeted_therapy_start_and_end + - best_response_to_non_targeted_therapy + surgery: + Desc: text + Tags: + Category: clinical_trial # in lieu of "treatment" being a category recognized by the DMN + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - surgical_procedure_id + - surgical_procedure #6411539 + - 
surgical_procedure_date + - surgical_procedure_anatomical_location + - surgical_procedure_therapeutic + - surgical_procedure_findings + - extent_of_residual_disease + radiotherapy: + Desc: text + Tags: + Category: clinical_trial # in lieu of "treatment" being a category recognized by the DMN + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - radiological_procedure_id + - radiological_procedure #6411539 + - radiological_procedure_anatomical_location + - radiation_dose + - radiation_frequency + - radiation_extent + - radiotherapy_start_and_end + - best_response_to_radiotherapy + subject_status: + Desc: text + Tags: + Category: clinical + Assignment: core + Class: secondary + Template: 'Yes' + Props: + - subject_status_id + - survival_status #7050072 + - primary_cause_of_death #6421593 + - off_study + - off_study_reason #6355981 + specimen: + Desc: The Specimen node contains numerous properties which provide an in-depth characterization of the types of biospecimens which were collected from any given study/trial participant and subsequently analyzed. + Tags: + Category: biospecimen # in lieu of "specimen" being a category recognized by the DMN + Assignment: core + Class: primary + Template: 'Yes' + Props: + - specimen_id + - parent_specimen_id + - parent_specimen_type # this refers to the nature of the specimen originally isolated from the participant, and from which various aliquots and/or derivative biospecimens were subsequently isolated + - specimen_type # this refers to the nature of the sub-specimen that was actually subject to downstream analysis + #- obi_specimen_type #11253427 not a good match to the caDSR term referenced by the CDE's ID + - specimen_category # confusingly close to the CMB Catalog Site's "Tissue Category" i.e.
the indicator as to normal vs primary vs metastatic, but acceptable terms for caDSR 7069877, quoted as a reference for "specimen category" uses terms that do not relate to "tissue category" + - anatomical_collection_site + - type_of_tissue #caDSR 7003892 this looks very much like the CMB Catalog Site's "Tissue Category" + - tissue_category # this would be the Catalog Site's "Tissue Category" i.e. the indicator as to normal vs primary vs metastatic, which appears within at least two of the DDs as Tissue Type + #- icd_o_3_tissue_morphology # 11326261 + - assessment_timepoint # this appears to be the Catalog Site's "Collection Timepoint" caDSR 7065963 + - collection_date # from the CMB perspective, this is equivalent to the above, with collection date indexed to enrollment date caDSR 6401821 + #- collection_method + #- fixative + # - days_from_diagnosis_to_specimen_collection #11253404 + # - diagnosis_date # from the CMB perspective, this is supposedly equivalent to the above, but because of the way this date is indexed to enrollment date, it should not be a specimen property + # - days_from_first_subject_visit_to_specimen_collection #11248874 + # - days_from_first_treatment_to_specimen_collection #11250807 + # - days_from_initial_genomic_sequencing_to_Specimen_collection #11250807 + # - days_from_recurrence_to_specimen_collection #11251133 + # - days_from_specimen_collection_to_initial_pathologic_diagnosis #11253404 + data_file: + Desc: Data files can be associated with CTDC project, study, participant, diagnosis and specimen records, but are not themselves stored within the application. Instead, the application stores records as to the existence and nature of such data files. The Data File node is comprised of properties which characterize these files in terms of their size, format and content, such that they can be appropriately represented within the application’s UI, and in terms of their storage location, such that they can be retrieved for analysis. 
+ Tags: + Category: data_file + Assignment: core + Class: primary + Template: 'Yes' + Props: + - data_file_name #11284037 + - data_file_type + - data_file_description #11280338 + - data_file_format #11416926 + - data_file_size #11479876 + - data_file_checksum_value #11480133 + - data_file_checksum_type #11475057 + - data_file_compression_status #11387114 + - data_file_uuid + - data_file_location +Relationships: + belongs_to: + Mul: many_to_one + Ends: + - Src: subject + Dst: study + - Src: study + Dst: project + - Src: study # so can a lowest-level study belong directly to a highest-level program? + Dst: program + #- Src: project # this relationship can be removed because project is above study? + #Dst: study + - Src: project # this seems legitimate + Dst: program + Props: null + associated_with: # group all file relationships in here? + Mul: many_to_one + Ends: + - Src: data_file + Dst: specimen + - Src: data_file + Dst: diagnosis + - Src: data_file + Dst: subject + - Src: data_file + Dst: study + - Src: data_file + Dst: project + - Src: associated_link + Dst: study + - Src: associated_link + Dst: project + - Src: image_collection + Dst: study + - Src: image_collection + Dst: project + Props: null + #of_study: + #Mul: many_to_many + #Ends: + #- Src: principal_ivestigator # alternatively, express principal investigator relationships to both study and project elsewhere as "directs" relationships + #Dst: study + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? + #Dst: study + #Mul: many_to_one + #Props: null + #of_project: + #Mul: many_to_many + #Ends: + #- Src: principal_investigator # alternatively, express principal investigator relationships to both study and project elsewhere as "directs" relationships + #Dst: project + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? 
+ #Dst: project + #Mul: many_to_one + #Props: null + of_subject: + Mul: many_to_one + Ends: + - Src: demographic + Dst: subject + Mul: one_to_one + - Src: exposure + Dst: subject + - Src: diagnosis + Dst: subject + #- Src: treatment + #Dst: subject + - Src: targeted_therapy + Dst: subject + Mul: many_to_many + - Src: non_targeted_therapy + Dst: subject + Mul: many_to_many + - Src: surgery + Dst: subject + Mul: many_to_many + - Src: radiotherapy + Dst: subject + Mul: many_to_many + - Src: subject_status + Dst: subject + Mul: one_to_one + # to accommodate a Specimen being directly associated with a Subject, rather than being only indirectly associated with a Subject through a Visit, etc. + - Src: specimen + Dst: subject + # to accommodate a Data File being directly associated with a Subject, rather than being only indirectly associated with a Subject through a Specimen or Diagnosis + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? + #Dst: subject + Props: null + #of_specimen: + #Mul: many_to_one + #Ends: + #- Src: data_file # alternatively, group all file relationships elsewhere as "associated_with" relationships? 
+ #Dst: specimen + #Props: null + directs: + Mul: many_to_many + Ends: + - Src: principal_investigator + Dst: project + - Src: principal_investigator + Dst: study + Props: null \ No newline at end of file diff --git a/config/ctdc_model_properties_file.yaml b/config/ctdc_model_properties_file.yaml new file mode 100644 index 00000000..3e01a22f --- /dev/null +++ b/config/ctdc_model_properties_file.yaml @@ -0,0 +1,1945 @@ +# Cancer Moonshot Biobank data model properties of properties file +# Title case names are "reserved" (meaningful to the parser) +# Lower case names are labels for the entities + +PropDefinitions: + # name_of_node + property_1: + Desc: text + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Type: integer + Req: 'Yes' + property_2: + Desc: text + Src: value + Type: datetime + Req: 'No' + property_3: + Desc: text + Src: value + Enum: + - 'Yes' + - 'No' + Req: Preferred + # program + program_name: + Desc: The narrative title used to refer to a broad framework (an administrative umbrella) of goals under which related projects or other research activities are grouped. Example - Clinical Proteomic Tumor Analysis.
CDE ID = 11444542 + Term: + - Origin: caDSR - CRDC + Code: '11444542' + Value: Program Name Text + Src: DSS + Type: string + Req: 'Yes' + program_short_name: + Desc: An acronym or abbreviated form of the title of a broad framework of goals under which related projects or other research activities are grouped. For example, CPTAC.
CDE ID = 11459801
This property is used as the key via which child records, e.g. project and/or study/trial records, can be associated with the appropriate program during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR - CRDC + Code: '11459801' + Value: Program Short Name Text + Src: DSS + Type: string + Req: 'Yes' + Key: true + # project + project_name: + Desc: The narrative title used to refer to an organ-, site-, disease- or phase-specific data collection within a broad framework of goals under which related studies or other research activities are grouped.
CDE ID = 11459804 + Term: + - Origin: caDSR - CRDC + Code: '11459804' + Value: Project Name Text + Src: DSS + Type: string + Req: 'Yes' + project_short_name: + Desc: An acronym or abbreviated form of the title used to refer to an organ-, site- or disease-specific data collection within a broad framework of goals under which related studies or other research activities are grouped.
CDE ID = 11459806
This property is used as the key via which child records, e.g. study/trial, image collection and principal investigator records, can be associated with the appropriate project during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR - CRDC + Code: '11459806' + Value: Project Short Name Text + Src: DSS + Type: string + Req: 'Yes' + Key: true + # study + study_name: + Desc: The narrative title used as a textual label for a research data collection. For example, Comparative Molecular Life History of Spontaneous Canine and Human Gliomas.
CDE ID = 11459810 + Term: + - Origin: caDSR - CRDC + Code: '11459810' + Value: Study Name Text + Src: DSS + Type: string + Req: 'Yes' + study_short_name: + Desc: The acronym or abbreviated form of the title for a research data collection. For example, GLIOMA01.
CDE ID = 11459812
This property is used as the key via which child records, e.g. image collection and subject records, can be associated with the appropriate study/trial during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR - CRDC + Code: '11459812' + Value: Study Short Name Text + Src: DSS + Type: string + Req: 'Yes' + Key: true + study_id: + Desc: Any external identifier by which the study/trial in question is known.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'No' + study_description: + Desc: Narrative description of the study/trial.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + study_type: + Desc: A classification of the study based upon the primary intent of the study's activities.
CDE ID = 11160683 + Term: + - Origin: caDSR - CRDC + Code: '11160683' + Value: Study Primary Purpose Type + Src: DSS + Enum: + - Adverse Effect Mitigation Study + - Ancillary Study + - Basic Science Research + - Correlative Study + - Cure Study + - Device Feasibility Study + - Diagnosis Study + - Disease Modifying Treatment Study + - Early Detection Study + - Education Training Clinical Study + - Epidemiology Research + - Genomics Research + - Health Services Research + - Imaging Research + - Observational Study + - Outcomes Research + - Prevention Study + - Proteomic Research + - Rehabilitation Clinical Study + - Screening Study + - Supportive Care Study + - Transcriptomics Research + - Treatment Study + Req: 'Yes' + dates_of_conduct: + Desc: Timespan over which the study/trial was/is being conducted.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + # principal_investigator + principal_investigator_first_name: + Desc: The first or given name of the person responsible for the conduct of the clinical trial or research project.
CDE ID = 10624731 + Term: + - Origin: caDSR - NCI Standard + Code: '10624731' + Value: Principal Investigator First Name Text + Src: DSS + Type: string + Req: 'Yes' + principal_investigator_last_name: + Desc: The last or family name of the person responsible for the conduct of the clinical trial or research project.
CDE ID = 10624733 + Term: + - Origin: caDSR - NCI Standard + Code: '10624733' + Value: Principal Investigator Last Name Text + Src: DSS + Type: string + Req: 'Yes' + principal_investigator_middle_name: + Desc: The middle name of the person responsible for the conduct of the clinical trial or research project.
CDE ID = 10624732 + Term: + - Origin: caDSR - NCI Standard + Code: '10624732' + Value: Principal Investigator Middle Name Text + Src: DSS + Type: string + Req: 'No' + principal_investigator_orcid_id: + Desc: A persistent unique digital identifier assigned to a principal investigator by the Open Researcher and Contributor ID (ORCID) organization.
CDE ID = 10624734 + Term: + - Origin: caDSR - NCI Standard + Code: '10624734' + Value: Principal Investigator ORCID Text + Src: DSS + Type: string + Req: Preferred + # associated_link + associated_link_id: + Desc: A unique identifier of each associated link record, used to identify the correct associated link records during data updates.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + associated_link_name: + Desc: The exact text to be displayed within the UI and used as an intuitive identifier of any given link associated with the study/trial in question.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + associated_link_url: + Desc: The url to which an end user will be redirected upon selection of the corresponding link name displayed within the UI.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + # image collection props + image_collection_id: + Desc: A unique identifier of each image collection record, used to identify the correct image collection records during data updates.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + image_collection_name: + Desc: The name of the image collection exactly as it appears at the location where the collection can be viewed and/or accessed.
NOT CURRENTLY ASSIGNED ANY CDE + Src: ICDC + Type: string + Req: 'Yes' + Key: true + Tags: + Labeled: Collection + image_type_included: + Desc: A list of the image types included in the image collection, drawn from a list of acceptable values.
NOT CURRENTLY ASSIGNED ANY CDE + Src: ICDC + Type: + value_type: list + Enum: #updated to accurately reflect the CMB image types + - CT # specific term used by both IDC and TCIA + - Digital Radiography + - DX # specific term used by both IDC and TCIA + - MR # specific term used by both IDC and TCIA + - MRI + - NM # specific term used by both IDC and TCIA + - Nuclear Medicine + - PET + - PT # specific term used by both IDC and TCIA + - Ultrasound + - US # specific term used by both IDC and TCIA + Req: 'Yes' + Tags: + Labeled: Image Types + image_collection_url: + Desc: The external url via which the image collection can be viewed and/or accessed.
NOT CURRENTLY ASSIGNED ANY CDE + Src: ICDC + Type: string + Req: 'Yes' + repository_name: + Desc: The name of the image repository within which the image collection can be found, stated in the form of the appropriate acronym.
NOT CURRENTLY ASSIGNED ANY CDE + Src: ICDC + Type: string + Req: 'Yes' + collection_access: + Desc: Indicator as to whether the image collection can be accessed only via the cloud, accessed only via download, or accessed via both mechanisms.
NOT CURRENTLY ASSIGNED ANY CDE + Src: ICDC + Enum: + - Cloud + - Download + - Unrestricted + Req: 'Yes' + # subject + subject_id: + Desc: The globally unique ID by which any given subject can be unambiguously identified and displayed across studies/trials.
CMB Participant ID
NOT CURRENTLY ASSIGNED ANY CDE
This property is used as the key via which child records, e.g. specimen records, can be associated with the appropriate subject during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + biomarker_results_available: + Desc: Indicator as to whether any biomarker results relating to the subject in question are available.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + radiology_report_available: + Desc: Indicator as to whether any radiology reports relating to the subject in question are available.
This is CMB's RADIOLOGY REPORT annotation
CDE ID = 6944764 + Term: + - Origin: caDSR + Code: '6944764' + Value: Report Upload + Src: value + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + radiology_images_available: + Desc: Indicator as to whether any radiology images relating to the subject in question are available.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + histology_images_available: + Desc: Indicator as to whether any histology images relating to the subject in question are available.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + # demographic + demographic_id: + Desc: A unique identifier of each demographic record, used to identify the correct demographic records during data updates.
The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Key: true + # age_at_diagnosis: + # Desc: For subjects enrolled upon the CMB trial, the age of the subject as of the diagnosis which resulted in their enrollment being made.
NOT CURRENTLY ASSIGNED ANY CDE + # Term: + # - Origin: caDSR + # Code: 'code/ID' + # Value: Data Element Name + # Src: CMB + # Type: + # units: + # - years + # value_type: number + age_at_enrollment: + Desc: The age of the subject as of enrollment, measured in years.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - years + value_type: number + Req: 'Yes' + race: + Desc: The text for reporting information about race based on the Office of Management and Budget (OMB) categories.
Acceptable values taken straight from the CMB Catalog Site.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR - NCI Standard + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - American Indian or Alaska Native + - Black or African American + - Native Hawaiian or other Pacific Islander + - Not Reported + - Unknown + Req: 'Yes' + ethnicity: + Desc: The text for reporting information about ethnicity based on the Office of Management and Budget (OMB) categories.
Acceptable values taken straight from the CMB Catalog Site.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR - NCI Standard + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Hispanic or Latino + - Not Hispanic or Latino + - Not reported + - Unknown + Req: 'Yes' + sex: + Desc: text CDE ID = 6343385 + Term: + - Origin: caDSR + Code: '6343385' + Value: Sex + Src: CMB + Enum: + - F + - Female #CMB + - Intersex #CMB + - M + - Male #CMB + - U + - UNDIFFERENTIATED + - Unknown #CMB + Req: 'Yes' + reported_gender: + Desc: Characteristics of people that are socially constructed, including norms, behaviors, and roles based on their sex. As a social construct, these norms, behaviors, and roles vary from society to society and can change over time.
CDE ID = 10748236 + Term: + - Origin: caDSR - CRDC + Code: '10748236' + Value: Person Reported Gender Type + Src: DSS + Enum: + - Choose Not to Disclose # = Response Declined per CMB? + - Female # perfect match to CMB data values + - Female-To-Male + - Female-to-Male Transsexual # per CMB + - Intersex # perfect match to CMB data values + - Male + - Male-To-Female + - Male-to-Female Transsexual # per CMB + - None Of These Describe Me + - Non-Conforming Gender + - Refused to Answer # = Response Declined per CMB? + - Response Declined # per CMB + - Unknown # perfect match to CMB data values + Req: 'Yes' + height: + Desc: The height of the subject as of enrollment, measured in cm.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - cm + value_type: number + Req: Preferred + weight: + Desc: The weight of the subject as of enrollment, measured in kg.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - kg + value_type: number + Req: Preferred + body_surface_area: + Desc: The body surface area of the subject as of enrollment, mathematically derived from the subject's height and weight, and expressed in square meters.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - m2 + value_type: number + Req: Preferred + occupation: + Desc: text
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + income: + Desc: text
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: Preferred + highest_level_of_education: + Desc: An indication as to the highest level of education attained by the subject.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - No formal education + - Grade school + - Not high school graduate + - High school graduate (including equivalency) + - Some college or Associate degree + - Bachelor's degree + - Master's degree + - Doctoral or professional degree + - Declined to answer + Req: Preferred + # site_of_enrollment: + # Desc: The identity of the hospital, medical center or clinic at which the subject was enrolled upon the study/trial.
NOT CURRENTLY ASSIGNED ANY CDE + # Term: + # - Origin: caDSR + # Code: 'code/ID' + # Value: Data Element Name + # Src: CMB + # Type: string + # Req: Preferred + ncbi_taxonomy_id: # perhaps this is a direct property of subject? + Desc: A label provided by NCBI Taxonomy Database (https://www.ncbi.nlm.nih.gov/taxonomy/), which uniquely identifies group or category, at any level, in a system for classifying plants or animals (including humans) providing ranked categories for the classification of organisms according to their suspected evolutionary relationships.
CDE ID = 10543100 + Term: + - Origin: caDSR - CRDC + Code: '10543100' + Value: Subject National Center for Biotechnology Information Taxonomy Identifier Integer + Src: DSS + Type: integer # not enumerated + Req: 'Yes' + ncbi_taxonomy_name: # perhaps this is a direct property of subject? + Desc: The textual label associated with a subject's organismal classification as captured in the National Center for Biotechnology Information (NCBI) Taxonomy standard nomenclature and classification repository.
Supposedly enumerated but no permissible values specified.
CDE ID = 10543082 + Term: + - Origin: caDSR - CRDC + Code: '10543082' + Value: Subject National Center for Biotechnology Information Taxonomy Name Text + Src: DSS + Type: string # supposedly enumerated but no permissible values specified + Req: 'Yes' + # exposure + exposure_id: + Desc: A unique identifier of each exposure record, used to identify the correct exposure records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Type: string + Req: 'Yes' + Key: true + environmental_exposure_type: + Desc: The category related to contact with chemical, biological, or physical substances found in air, water, food, soil, or product that may have a harmful effect on a person's health.
CDE ID = 11256813 + Term: + - Origin: caDSR - CRDC + Code: '11256813' + Value: Environmental Exposure Type + Src: DSS + Enum: + - Asbestos Exposure + - Chemical Exposure + - Marijuana Smoke Exposure + - Radiation Exposure + - Radon Exposure + - Respirable Crystalline Silica Exposure + - Smoke Exposure + - Smokeless Tobacco Exposure + - Tobacco Related Exposure + - Wood Dust Exposure + Req: 'Yes' + carcinogen_exposure: + Desc: Indicator as to whether the subject in question has had significant exposure to any known carcinogen(s)
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + # diagnosis + diagnosis_id: + Desc: A unique identifier of each diagnosis record, used to identify the correct diagnosis records during data updates.
The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED
This property is used as the key via which child records, e.g. clinical reports, can be associated with the appropriate diagnosis record during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + primary_diagnosis_disease_group: + Desc:
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Colorectal cancer, NOS + - Acute myeloid leukemia, NOS + - Adenocarcinoma of the gastroesophageal junction + - Melanoma + - Myeloma, NOS + - Non-small cell lung cancer, NOS + - Prostate cancer, NOS + - Small cell lung cancer + - Invasive breast carcinoma + - Fallopian tube carcinoma + - Gastric cancer, NOS + - Primary peritoneal carcinoma + - Esophageal cancer, NOS + - Ovarian epithelial cancer + Req: 'Yes' + # icd_10_disease_code: + # Desc: An alphanumeric value from the tenth version of the International Classification of Disease (ICD-10-CM, the disease code subset of ICD-10) used to identify the diagnosis in humans.
CDE ID = 11479873 + # Term: + # - Origin: caDSR - CRDC + # Code: '11479873' + # Value: Diagnosis Disease or Disorder ICD-10-CM Code + # Src: DSS + # Type: string + # Req: Preferred + ctep_disease_term: #this really should be referred to as CTEP disease term + Desc: In at least one data submission file, i.e. 5A DS/5a_Enrollment.xlsx (in Version 2), what is identified as CTEP_DISEASE_CODE is provided in the form of a human-readable "narrative" statement of the primary disease condition with which the subject in question has been diagnosed, with the values in use essentially matching those of "CTEP TERM" or CTEP's "SHORT NAME", which are associated with an 8 digit numerical MEDDRA code. See here - https://ctep.cancer.gov/protocoldevelopment/codes_values.htm#disease
The CMB's Catalog Site presents these human readable CTEP disease "terms" prefixed with the corresponding meddra disease code
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Enum: + # actual values in the v2 CMB data submission + # - Acute Myeloid Leukemia Not Otherwise Specified # meddra code 10000884 + # - Adenocarcinoma of the Gastroesophageal Junction # meddra code 10066354 + # - Colon Adenocarcinoma # meddra code 10010029 + # - Colorectal Carcinoma # meddra code 10010029 + # - Melanoma # meddra code 10053571 + # - Non-Small Cell Lung Carcinoma # meddra code 10029514 + # - Plasma Cell Myeloma # meddra code 10028566 + # - Prostate Carcinoma # meddra code 10036910 + # - Small Cell Lung Carcinoma # meddra code 10041071 + # actual values in the v3 CMB data submission + - Acute Myeloid Leukemia Not Otherwise Specified # meddra code 10000884 + - Adenocarcinoma of the Gastroesophageal Junction # meddra code 10066354 + - Colorectal Carcinoma # meddra code 10010029 + - Invasive Breast Carcinoma # meddra code 10006190 NEW + - Melanoma # meddra code 10053571 + - Non-Small Cell Lung Carcinoma # meddra code 10029514 + - Ovarian Carcinoma # meddra code 10033159 NEW + - Plasma Cell Myeloma # meddra code 10028566 + - Prostate Carcinoma # meddra code 10036910 + - Small Cell Lung Carcinoma # meddra code 10041071 + Req: 'Yes' + meddra_disease_code: + Desc: An eight digit numerical code for the primary disease with which the subject in question has been diagnosed.
The CMB Catalog Site's "Primary Diagnosis (medDRA Disease Code)" is actually a concatenation of CTEP disease "term" and medDRA code
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + # actual values as displayed within the CMB's Catalog Site + # - "10000884" #- Acute Myeloid Leukemia Not Otherwise Specified + # - "10010029" #- Colorectal Carcinoma + # - "10066354" #- Gastroesophageal Junction Adenocarcinoma + # - "10029514" #- Lung Non-Small Cell Carcinoma + # - "10041071" #- Lung Small Cell Carcinoma + # - "10053571" #- Melanoma + # - "10028566" #- Plasma Cell Myeloma + # - "10036910" #- Prostate Carcinoma + #actual values in the v3 CMB data submission + - "10000884" # Acute Myeloid Leukemia Not Otherwise Specified + - "10066354" # Adenocarcinoma of the Gastroesophageal Junction + - "10010029" # Colorectal Carcinoma + - "10006190" # Invasive Breast Carcinoma NEW + - "10053571" # Melanoma + - "10029514" # Non-Small Cell Lung Carcinoma + - "10033159" # Ovarian Carcinoma NEW + - "10028566" # Plasma Cell Myeloma + - "10036910" # Prostate Carcinoma + - "10041071" # Small Cell Lung Carcinoma + Req: 'Yes' + snomed_disease_term: + Desc: Acceptable values currently specified, all of which are sourced from actual data, may be of use only during the generation of mock data, with the data type for this property potentially being changed to string.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Acute Myeloid Leukemia 17788007 + - Adenocarcinoma 35917007 + - Adenocarcinoma 443961001 + - Amelanotic Melanoma 70594002 + - Breast Carcinoma 254838004 + - Colon Adenoma 428054006 + - Colon Carcinoma 269533000 + - Colorectal Adenocarcinoma 408645001 + - Cutaneous Melanoma 93655004 + - Epithelioid Cell Melanoma 37138001 + - Esophageal Adenocarcinoma 276803003 + - Gastric Adenocarcinoma 408647009 + - Lung Adenocarcinoma 254626006 + - Lung Carcinoma 448993007 + - Melanoma 2092003 + - Melanoma 372244006 + - Mucinous Adenocarcinoma 72495009 + - Multiple myeloma 55921005 + - Multiple myeloma and immunoproliferative disease 188717001 + - Neuroendocrine Carcinoma 253000007 + - Nodular Melanoma 2142002 + - Nodular Melanoma 254731001 + - Non-Small Cell Lung Carcinoma 254637007 + - Plasma Cell Leukemia 128922003 + - Plasma Cell Myeloma 109989006 + - Plasma Cell Myeloma 55921005 + - Prostate Adenocarcinoma 399490008 + - Prostate Carcinoma 254900004 + - Prostate Carcinoma 399068003 + - Rectal Adenoma 399730005 + - Rectal Carcinoma 254582000 + - Small Cell Carcinoma 74364000 + - Small Cell Lung Carcinoma 254632001 + - Small Cell Lung Carcinoma 254633006 + - Squamous Cell Carcinoma 28899001 + - Squamous Cell Carcinoma 402815007 + - Squamous Cell Lung Carcinoma 254634000 + - Superficial Spreading Melanoma 254730000 + Req: 'Yes' + snomed_disease_code: + Desc: Strictly-speaking, a 9-digit numerical code for the primary disease with which the subject in question has been diagnosed.
Within the CMB Catalog Site, Disease Stage (SNOMED) is presented in the form of a human readable narrative statement of the primary disease which includes some histological detail and disease stage.
Acceptable values currently specified, all of which are sourced from actual data, may be of use only during the generation of mock data, with the data type for this property potentially being changed to string.
CDE ID = 6642369 + Term: + - Origin: caDSR + Code: '6642369' + Value: Data Element Name + Src: CMB + Enum: + - "17788007" #Acute Myeloid Leukemia + - "35917007" #Adenocarcinoma + - "443961001" #Adenocarcinoma + - "70594002" #Amelanotic Melanoma + - "254838004" #Breast Carcinoma + - "428054006" #Colon Adenoma + - "269533000" #Colon Carcinoma + - "408645001" #Colorectal Adenocarcinoma + - "93655004" #Cutaneous Melanoma + - "37138001" #Epithelioid Cell Melanoma + - "276803003" #Esophageal Adenocarcinoma + - "408647009" #Gastric Adenocarcinoma + - "254626006" #Lung Adenocarcinoma + - "448993007" #Lung Carcinoma + - "2092003" #Melanoma + - "372244006" #Melanoma + - "72495009" #Mucinous Adenocarcinoma + - "55921005a" #Multiple myeloma + - "188717001" #Multiple myeloma and immunoproliferative disease + - "253000007" #Neuroendocrine Carcinoma + - "2142002" #Nodular Melanoma + - "254731001" #Nodular Melanoma + - "254637007" #Non-Small Cell Lung Carcinoma + - "128922003" #Plasma Cell Leukemia + - "109989006" #Plasma Cell Myeloma + - "55921005b" #Plasma Cell Myeloma + - "399490008" #Prostate Adenocarcinoma + - "254900004" #Prostate Carcinoma + - "399068003" #Prostate Carcinoma + - "399730005" #Rectal Adenoma + - "254582000" #Rectal Carcinoma + - "74364000" #Small Cell Carcinoma + - "254632001" #Small Cell Lung Carcinoma + - "254633006" #Small Cell Lung Carcinoma + - "28899001" #Squamous Cell Carcinoma + - "402815007" #Squamous Cell Carcinoma + - "254634000" #Squamous Cell Lung Carcinoma + - "254730000" #Superficial Spreading Melanoma + Req: 'Yes' + # icd_o_primary_site: + # Desc: The organ or part of the body where cancer begins as captured by the topography codes of the International Classification of Diseases for Oncology, 3rd Edition (ICD-O-3).
CDE ID = 11341616 + # Term: + # - Origin: caDSR - CRDC + # Code: '11341616' + # Value: Diagnosis Primary Anatomic Site ICD-O-3 Code + # Src: DSS + # Type: string + # Req: 'Yes' + primary_disease_site: + Desc: Acceptable values currently specified, none of which are sourced from actual data, are to be used only for generation of mock data.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + #Type: string + Enum: + - Bone Marrow + - Colon + - Epidermis + - Gastroesophageal Junction + - Lung + - Lymph Node + - Prostate + - Rectum + Req: 'Yes' + histology: + Desc: Acceptable values currently specified, none of which are sourced from actual data, are to be used only for generation of mock data.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Acute Myeloid Leukemia - M3 eosinophilic variant + - Acute Myeloid Leukemia - poorly-differentiated M1 variant + - Acute Myeloid Leukemia - M5 granulocytic variant + - Adenocarcinoma + - Adenosquamous Carcinoma + - High-grade Carcinoma + - Invasive Adenocarcinoma + - Large Cell Carcinoma + - Melanoma + - Multiple Myeloma + - Non-Small Cell Carcinoma + - Plasma Cell Myeloma + - Prostate Carcinoma + - Small Cell Carcinoma + - Squamous Cell Carcinoma + Req: 'Yes' + # date_of_confirmation_of_histology: + # Desc: text CDE ID = 6409589 + # Term: + # - Origin: caDSR + # Code: '6409589' + # Value: Data Element Name + # Src: CMB + # Type: + # units: + # - days + # value_type: number + # Req: 'Yes' + histological_subtype: + Desc: Acceptable values currently specified, none of which are sourced from actual data, are to be used only for generation of mock data.
CDE ID = 7344580 + Term: + - Origin: caDSR + Code: '7344580' + Value: Data Element Name + Src: CMB + #Type: string + Enum: + - Acute Myeloid Leukemia - M3 eosinophilic variant + - Acute Myeloid Leukemia - poorly-differentiated M1 variant + - Acute Myeloid Leukemia - M5 granulocytic variant + - Adenocarcinoma + - Adenosquamous Carcinoma + - High-grade Carcinoma + - Invasive Adenocarcinoma + - Large Cell Carcinoma + - Melanoma + - Multiple Myeloma + - Non-Small Cell Carcinoma + - Plasma Cell Myeloma + - Prostate Carcinoma + - Small Cell Carcinoma + - Squamous Cell Carcinoma + Req: 'Yes' + stage_of_disease: + Desc: The stage of the primary disease with which the subject in question has been diagnosed.
Acceptable values currently specified, none of which are sourced from actual data, are to be used only for generation of mock data.
NOT CURRENTLY ASSIGNED ANY CDE + Src: CMB + Enum: + - Stage III + - Stage IIIA + - Stage IIIB + - Stage IIIC + - Stage IV + - Stage IVA + - Stage IVB + - Stage IVC + Req: 'Yes' + tumor_grade: + Desc: A text term to express the degree of abnormality of cancer cells as a measure of differentiation and aggressiveness.
CDE ID = 11325685 + Term: + - Origin: caDSR - CRDC + Code: '11325685' + Value: Diagnosis Tumor Grade + Src: DSS + Enum: + - GB + - GX + - G1 + - G2 + - G3 + - G4 + - High Grade + - Intermediate Grade + - Low Grade + - Not Applicable + - Not Reported + - Grade cannot be assessed # from CMB Catalog Site and directly from Jeff + - Borderline malignancy # directly from Jeff + - Well Differentiated # from CMB Catalog Site and directly from Jeff + - Moderately Differentiated # from CMB Catalog Site and directly from Jeff + - Poorly Differentiated # from CMB Catalog Site and directly from Jeff + - Undifferentiated grade # directly from Jeff + Req: 'Yes' + date_of_diagnosis: + Desc: text NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: + units: + - days + value_type: number + Req: 'Yes' + # subject_age_at_diagnosis: + # Desc: Age at the time of diagnosis expressed in number of days since birth. CMB data has this expressed in years not days!
CDE ID = 10609539 + # Term: + # - Origin: caDSR - CRDC + # Code: '10609539' + # Value: Subject Age at Diagnosis Integer + # Src: DSS + # Type: + # units: + # - years + # value_type: number + # Req: 'Yes' + # treatment + # treatment_id: + # Desc: A unique identifier of each treatment record, used to identify the correct treatment records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + # Term: + # - Origin: caDSR + # Code: 'code/ID' + # Value: Data Element Name + # Src: value + # Type: string + # Req: 'Yes' + # Key: true + # x_targeted_therapy: # CMTRT_DSL in CMB Data Submission - 5a_TargetedTherapyAdministration??? + # Desc: text CDE ID = 6400634 + # Term: + # - Origin: caDSR + # Code: '6400634' + # Value: Concomitant Medication Name # check on this, repeated below + # Src: CMB + # Type: string + # Req: 'No' + # x_therapy: # Non-Targeted Therapy in CMB Data Submission - 5a_NonTargetedTherapySupplement + # Desc: text CDE ID = 6400634 + # Term: + # - Origin: caDSR + # Code: '6400634' + # Value: Concomitant Medication Name + # Src: CMB + # Type: string + # Req: 'No' + # x_surgical_procedure_name: # SURG_PROC_NAME in CMB Data Submission - 5a_NonTargetedSurgerySupplement + # Desc: text CDE ID = 6411539 + # Term: + # - Origin: caDSR + # Code: '6411539' + # Value: Procedure Name + # Src: CMB + # Type: string + # Req: 'No' + # x_radiological_procedure_name: # RAD_PROC_NAME in CMB Data Submission - 5a_NonTargetedRadiationSupplement + # Desc: text CDE ID = 6411539 + # Term: + # - Origin: caDSR + # Code: '6411539' + # Value: Procedure Name + # Src: CMB + # Type: string + # Req: 'No' + # targeted_therapy + targeted_therapy_id: + Desc: A unique identifier of each targeted therapy record, used to identify the correct targeted therapy records during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + targeted_therapy: # CMTRT_DSL in CMB Data Submission - 5a_TargetedTherapyAdministration??? + Desc: The name of the therapeutic agent administered to a subject as a targeted therapy, i.e. one that is considered to have enhanced therapeutic effects upon the specific type of cancer with which the subject has been diagnosed, and/or upon that type of cancer specifically within the context of a given subject because of factors such as tumor histology, grade, gene expression and genetic make-up.
Acceptable values to be used for generation of mock data taken from TARGETED_THERAPY in CMB Version 2 DS 5a_TargetedTherapyAdministration
CDE ID = 6400634 + Term: + - Origin: caDSR + Code: '6400634' + Value: Concomitant Medication Name # check on this, repeated below + Src: CMB + #Type: string + Enum: + - Anti-CD19 CAR T-Cell Therapy + - Anti-p53 CAR T-Cell Therapy + - Ipilimumab Immunotherapy + - Nivolumab Immunotherapy + - Nivolumab and Ipilimumab Combination Immunotherapy + - Abiraterone + - afatinib + - Alectinib + - Atezolizumab + - belantamab mafodotin + - Bevacizumab + - bevacizumab + - Bicalutamide + - binimetinib + - Binimetinib + - Bortezomib + - cabazitaxel + - Cabazitaxel + - carfilizomib + - Carfilzomib + - carfilzomib + - Cetuximab + - cetuximab + - dabrafenib + - Dabrafenib + - Daratumumab + - docetaxel + - durvalumab + - Durvalumab + - Elotuzumab + - encorafenib + - Encorafenib + - Encorafenib + cetuximab + - enzalutamide + - Enzalutamide + - Gemtuzumab ozogamicin + - ipilimumab + - isatuximab-irfc + - ixazomib + - Lenalidomide + - lenalidomide + - Midostaurin + - nivolumab + - Nivolumab + - Olaparib + - osimertinib + - Osimertinib + - panitumumab + - pembrolizumab + - Pembrolizumab + - Pomalidamide + - pomalidomide + - regorafenib + - Revlamid + - Revlimib + - Revlimid + - revlimide + - Selinexor + - trametinib + Req: 'Yes' + targeted_therapy_dose: + Desc: The dose at which the targeted therapeutic agent in question was administered, inclusive of the appropriate dosage units.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + targeted_therapy_frequency: + Desc: The frequency at which the targeted therapeutic agent in question was administered.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + targeted_therapy_start_and_end: + Desc: The timeframe over which the targeted therapy in question was administered, expressed as the number of days before or after study enrollment the targeted therapy was started, and the number of days before or after study enrollment the targeted therapy ended.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: Preferred + best_response_to_targeted_therapy: + Desc: An indication as to the best overall response to treatment with the targeted therapy in question.
Acceptable values currently in place are taken directly from ICDC, if only to drive a discussion as to what values would be more/most appropriate.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Complete Response + - Partial Response + - Less than partial response + - Stable Disease + - Progressive Disease + - Not Assessed + - Too Early + - Not Evaluable + - Unknown + Req: Preferred + # non_targeted_therapy + non_targeted_therapy_id: + Desc: A unique identifier of each non-targeted therapy record, used to identify the correct non-targeted therapy records during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + non_targeted_therapy: + Desc: The name of the therapeutic agent administered to a subject as a non-targeted therapy, i.e. one that is considered to be appropriate for and effective against the specific type of cancer with which the subject has been diagnosed, but not to have significantly enhanced therapeutic effects upon that cancer in general or upon that cancer specifically within the context of a given subject.
Acceptable values to be used for generation of mock data taken from THERAPY in CMB Version 2 DS 5a_NonTargetedTherapySupplement
CDE ID = 6400634 + Term: + - Origin: caDSR + Code: '6400634' + Value: Concomitant Medication Name + Src: CMB + #Type: string + Enum: + - CHOP + - Cyclophosphamide + - High-dose Ablastane + - Doxorubicin plus Vincristine Combination Therapy + - R-CHOP + - fluorouracil + - leucovorin + - oxaliplatin + - hydroxyurea + - cytarabine + - daunorubicin + - oxaliplatin, fluorouracil, leucovorin + - lenalidomide + - bortezomib + - cyclophosphamide + - zoledronic acid + - denosumab + - capecitabine + - irinotecan + - cisplatin + - melphalan + - autologous stem cell transplant + - etoposide + - doxorubicin + - fludarabine + - leuprolide + - pemetrexed + - paclitaxel + - melphalan flufenamide + - dexamethasone + - carboplatin + - lurbinectedin + - carboplatin, etoposide + - fluorouracil, irinotecan, leucovorin + - cisplatin, docetaxel + - Leucovorin Calcium/Fluorouracil/Oxaliplatin + - fluorouracil, irinotecan, oxaliplatin, leucovorin + - cyclophosphamide, dexamethasone + - Leucovorin Calcium/Irinotecan Hydrochloride + - bicalutamide + - firmagon + Req: 'Yes' + non_targeted_therapy_dose: + Desc: The dose at which the non-targeted therapeutic agent in question was administered, inclusive of the appropriate dosage units.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + non_targeted_therapy_frequency: + Desc: The frequency at which the non-targeted therapeutic agent in question was administered.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + non_targeted_therapy_start_and_end: + Desc: The timeframe over which the non-targeted therapy in question was administered, expressed as the number of days before or after study enrollment the non-targeted therapy was started, and the number of days before or after study enrollment the non-targeted therapy ended.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: Preferred + best_response_to_non_targeted_therapy: + Desc: An indication as to the best overall response to treatment with the non-targeted therapy in question.
Acceptable values currently in place are taken directly from ICDC, if only to drive a discussion as to what values would be more/most appropriate.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Complete Response + - Partial Response + - Less than partial response + - Stable Disease + - Progressive Disease + - Not Assessed + - Too Early + - Not Evaluable + - Unknown + Req: 'Yes' + # surgery + surgical_procedure_id: + Desc: A unique identifier of each surgical procedure record, used to identify the correct surgical procedure therapy records during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + surgical_procedure: + Desc: The name and/or a brief narrative description of the surgical procedure performed upon the subject.
Acceptable values to be used for generation of mock data taken from SURG_PROC_NAME in CMB Version 2 DS 5a_NonTargetedSurgerySupplement
CDE ID = 6411539 + Term: + - Origin: caDSR + Code: '6411539' + Value: Procedure Name + Src: CMB + #Type: string + Enum: + - left neck dissection + - resection + - left parotid LN biopsy + - needle/ core biopsy + - CT guided lung biopsy + - biopsy + - Portion of mass from right foot, biopsy + - robotic lower anterior resection + - Laparoscopic mobilization o fleft colon and splenic flexure, open rectosigmoid resection with anastamosis + - endobronchial biopsy + - needle core biopsy + - fine needle aspiration + - CT guided right adrenal mass biopsy + - sigmoid resection + - robotic assisted left lung upper lobe wedge + - right lung mass CT guided FNA (cytology) + - Lymph node dissection + - liver resection + - Colon resection and ovary removal + - left lung wedge resection + - wide local excision of right thigh melanoma with primary closure sentinel lymph node biopsy of right inguinal lymph node + - Liver, RUQ-biopsy + - abdominoperineal resection + - Lymph node, retroperitoneal lymph node-biopsy + - Liver, mass, needle core biopsy + - Lung, right upper lobe, mass, smears and cell block, FNA + - Stomach, antral ulcer-biopsy + - Core biopsy + - CT guided biopsy + - Lymph node, fine-needle aspirate (aspirate smears and cell block + - sidmoidoscopy + - tissue pathology + - robotic assisted laparoscopic radical prostatectomy + - stereotactic volumetric resection of right occipital intraaxial lesion. + - partial colectomy + - fine needle biopsy + - TURP and bilateral orchiectomy + - Tumor debulking + - cytoreductive surgery with HIPEC + - Resection of periumbilical mass + - CT guided left iliac biopsy + - Thoracic Laminectomy - Removal of Lesion + - Thoracic Decompression - Removal of Tumor + - right hemicolectomy + - ultrasound-guided core biopsy + - Biopsy + - Bronchoscopy, endobronchial ultrasound + - Laparoscopic assisted right colectomy + - Parietal Brain Resection, left. Parietal Craniotomy, left. 
+ - rectal biopsy + - colonoscopy + - hepatectomy + - hemicolectomy + - Lymph node, left axillary, needle core biopsy + - TURP + - rectosigmoid colectomy + - Open partial hepatectomy + - Laparoscopic Diverting Colostomy + - IR Kyphoplasty + - Left temporal craniotomy for tumor resection + - subtotal colectomy, end ileostomy, liver biopsy and wedge (segment 8) + - Laparoscopic, hand assisted low anterior resection with end descending colostomy and Laparoscopic complete mobilization of the splenic flexure + - Right video assisted thorascopic upper lobectomy and mediastinal lymphadenectomy + - Open Reduction and Internal Fixation (ORIF) of the right proximal humerus fracture and curettage and cementing of right proximal humerus + - T4 and T5 laminectomy for epidural tumor resection + - Port Placement, Laparoscopic loop colostomy and mobilization of the splenic flexure + - Right sub trochanteric fracture ORIF w/ TFN + Req: 'Yes' + surgical_procedure_date: + Desc: The date upon which the surgical procedure in question was performed. EXPRESSED HOW?
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: date + Req: Preferred + surgical_procedure_anatomical_location: + Desc: The anatomical site or sites at which the surgical procedure in question was performed.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + surgical_procedure_therapeutic: + Desc: An indication as to whether the surgical procedure in question was performed with therapeutic intent.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - 'Yes' + - 'No' + Req: 'Yes' + surgical_procedure_findings: + Desc: A narrative description of any significant findings observed during the surgical procedure in question.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: Preferred + extent_of_residual_disease: + Desc: text
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: Preferred + # radiotherapy + radiological_procedure_id: + Desc: A unique identifier of each radiological procedure record, used to identify the correct radiological procedure record during data updates. The value of this property will generally be based largely upon the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + radiological_procedure: # RAD_PROC_NAME in CMB Data Submission - 5a_NonTargetedRadiationSupplement + Desc: The name and/or a brief narrative description of the radiological procedure performed upon the subject.
Acceptable values to be used for generation of mock data taken from RAD_PROC_NAME in CMB Version 2 DS 5a_NonTargetedRadiationSupplement
CDE ID = 6411539 + Term: + - Origin: caDSR + Code: '6411539' + Value: Procedure Name + Src: CMB + #Type: string + Enum: + - LOW DOSE RADIATION + - HIGH DOSE RADIATION + - SBRT + - Yttrium 90 radio embolization Y90 Microspheres injection + - Radiotherapy + - stereotactic ablative radiotherapy + - Salvage radiotherapy + - External beam radiation delivered in 2 fractions to T5-T8. + - CyberKnife + - Palliative Radiotherapy + - Daily Radiation Treatment to R Lung/Nodes + - Palliative radiation to the pelvic mass + - Whole Brain Radiation + - External Beam Radiation + - Radiation Treatment + - External Beam Radiation Therapy + - IGRT + - Palliative Radiation Therapy + - 3D Conformal Radiation Therapy + - IMRT + - CyberKnife SBRT + - Prophylactic Cranial Radiation + Req: 'Yes' + radiological_procedure_anatomical_location: + Desc: The anatomical site or sites subject to the radiological procedure in question.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + radiation_dose: + Desc: The dose at which the radiotherapy in question was administered, inclusive of the appropriate dosage units.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + radiation_frequency: + Desc: The frequency at which the radiotherapy in question was administered.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + radiation_extent: + Desc: text + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + radiotherapy_start_and_end: + Desc: The timeframe over which the radiotherapy in question was administered, expressed as the number of days before or after study enrollment the radiotherapy was started, and the number of days before or after study enrollment the radiotherapy ended.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: integer + Req: 'Yes' + best_response_to_radiotherapy: + Desc: An indication as to the best overall response to the radiotherapy in question.
Acceptable values currently in place are taken directly from ICDC, if only to drive a discussion as to what values would be more/most appropriate.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Complete Response + - Partial Response + - Less than partial response + - Stable Disease + - Progressive Disease + - Not Assessed + - Too Early + - Not Evaluable + - Unknown + Req: 'Yes' + # subject_status + subject_status_id: + Desc: A unique identifier of each subject_status record, used to identify the correct subject_status records during data updates. The value of this property will generally be the same as the value of the subject_id property.
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + survival_status: + Desc: In CMB Data Submission - 5a_FollowUp
The participant's survival state as being living or deceased.
NEW CDE ID = 2847330
OLD CDE ID = 7050072 + Term: + - Origin: caDSR-NCI Standard + Code: '2847330' # formerly '7050072' + Value: Survival Status + Src: DSS # previously CMB + Enum: + - ALIVE + - DEAD + - UNKNOWN + - Alive with Disease # from CMB Data Submission - 5a FollowUp + - Alive with No Evidence of Disease # from CMB Data Submission - 5a FollowUp + - Alive, Disease Status Unknown # from CMB Data Submission - 5a FollowUp + - Dead # from CMB Data Submission - 5a FollowUp + Req: 'Yes' + primary_cause_of_death: + Desc: In CMB Data Submission - 5a_DeathSummary
CDE ID = 6421593 + Term: + - Origin: caDSR + Code: '6421593' + Value: Primary Cause of Death + Src: CMB + Type: string + Req: 'No' + off_study: + Desc: Infer from OFF_STUDY_DATE in CMB Data Submission - 5a_OffStudy?
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: value + Enum: + - 'Yes' + - 'No' + - Unknown + Req: 'Yes' + off_study_reason: + Desc: In CMB Data Submission - 5a_OffStudy
Enumerated via a long list of ~40 acceptable terms, but do these in fact represent the values CMB has/uses?
CDE ID = 6355981 + Term: + - Origin: caDSR + Code: '6355981' + Value: Disposition Event Dictionary-Derived/Standardized Term + Src: value + Enum: + - ADVERSE EVENT + - APPROVED DRUG AVAILABLE FOR INDICATION + - COMPLETED + - DEATH + - DISEASE RELAPSE + - FAILURE TO MEET CONTINUATION CRITERIA + - FAILURE TO MEET RANDOMIZATION CRITERIA + - LACK OF EFFICACY + - LOGISTICAL PROBLEM + - LOST TO FOLLOW-UP + - MET ELIGIBILITY CRITERIA BUT NOT NEEDED + - NEVER DOSED + - NO LONGER CLINICALLY BENEFITING + - NON-COMPLIANCE WITH NON-STUDY DEVICE + - NON-COMPLIANCE WITH STUDY DEVICE + - NON-COMPLIANCE WITH STUDY DRUG + - NON-COMPLIANCE WITH STUDY SCHEDULE + - OTHER + - PARTNER PREGNANCY + - PHYSICIAN DECISION + - PREGNANCY + - PROGRESSIVE DISEASE + - PROTOCOL DEVIATION + - PROTOCOL VIOLATION + - PROTOCOL-SPECIFIED WITHDRAWAL CRITERION MET + - RANDOMIZED BY MISTAKE + - RANDOMIZED BY MISTAKE WITH STUDY TREATMENT + - RANDOMIZED BY MISTAKE WITHOUT STUDY TREATMENT + - RECOVERY + - REQUIRES PROHIBITED MEDICATION + - SCREEN FAILURE + - SCREENING NOT COMPLETED + - SITE TERMINATED BY SPONSOR + - SPONSOR REQUEST + - STUDY TERMINATED BY SPONSOR + - TECHNICAL PROBLEMS + - WITHDRAWAL BY PARENT/GUARDIAN + - WITHDRAWAL BY SUBJECT + - WITHDRAWAL OF ASSENT + - WITHDRAWAL OF CONSENT + Req: 'No' + # specimen + parent_specimen_id: # this will be the exact same value that the CMB uses for parent_biospecimen_id + Desc: CMB's "Parent Biospecimen ID"
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'Yes' + Key: true + specimen_id: # this will be the exact same value that the CMB uses for biospecimen_id + Desc: CMB's "Biospecimen ID"
NO CDE REQUIRED?
This property is used as the key via which child records, e.g. data files generated via Oncomine panel analyses, can be associated with the appropriate specimen during data loading, and to identify the correct records during data updates. + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Type: string + Req: 'No' + parent_specimen_type: # this refers to the nature of the specimen originally isolated from the participant, and from which various aliquots and/or derivative biospecimens were subsequently isolated + Desc: Acceptable values taken straight from the CMB Catalog Site.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Bone Marrow Aspirate + - Cell Block + - EDTA Blood + - FFPE Block + - Formalin Bone Marrow Biopsy + - Formalin Fixed Tissue + - Snap Frozen Tissue + - Stained Bone Marrow Aspirate Slide + - Streck Blood + - Unstained Bone Marrow Aspirate Slide + - Unstained Slide + Req: 'Yes' + specimen_type: # this refers to the nature of the sub-specimen that was actually subject to downstream analysis + Desc: Acceptable values taken straight from the CMB Catalog Site.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - BMMC + - Blood Pellet + - Cells + - Curl + - DNA + - FFPE Block + - Glass H&E Slide + - Glass Slide Smear + - Glass Unstained Slide + - PBMC + - Plasma + - Plasma-cf + - RNA + - Snap Frozen Unfixed Tissue + - Tissue Curl + - cDNA + Req: 'Yes' + # obi_specimen_type: + # Desc: This doesn't match the caDSR record of "Specimen Material OBIB Source" for CDE ID = 11253427
The kind of material that forms the sample as captured in the Ontology for Biobanking (OBIB), a specimen subset of the Ontology for Biomedical Investigations (OBI). + # Term: + # - Origin: caDSR - CRDC + # Code: '11253427' + # Value: Specimen Type Text + # Src: DSS + # Type: integer + # Req: 'No' + specimen_category: + Desc: This is NOT the CMB Catalog Site's "Tissue Category", the indicator as to normal vs primary vs metastatic. Not used within the CMB Catalog site itself, but defined within at least one of the associated Data Dictionaries.
CDE ID = 7069877 + Term: + - Origin: caDSR + Code: '7069877D' + Value: Data Element Name + Src: CMB DD + Enum: + - BLOOD + - BONE MARROW + - BUCCAL CELL SAMPLE + - CEREBROSPINAL FLUID + - FORMALIN FIXED PARAFFIN EMBEDDED TISSUE + - FORMALIN FIXED TISSUE + - FRESH TISSUE + - FROZEN TISSUE + - OTHER BODILY FLUIDs + - SALIVA + - STOOL + - URINE + Req: 'Yes' + anatomical_collection_site: + Desc: This is CMB's version of "sample site".
Acceptable values currently specified, none of which are sourced from actual data, are to be used only for generation of mock data.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + #Type: string + Enum: + - Bone Marrow + - Brain + - Bronchus + - Colon + - Duodenum + - Epidermis + - Esophagus + - Kidney + - Liver + - Lung + - Lymph Node + - Pancreas + - Prostate + - Rectum + - Stomach + Req: 'Yes' + type_of_tissue: + Desc: This is caDSR's version of CMB Catalog's "Tissue Category".
Acceptable values taken from the caDSR's definition of the data element.
CDE ID = 7003892 + Term: + - Origin: caDSR + Code: '7003892' + Value: Tissue Type + Src: CMB DD + Enum: + - ANT # Adjacent Normal Tissue + - NORMAL + - METASTATIC + - OTHER + - PRIMARY + Req: 'Yes' + tissue_category: + Desc: Based upon the values in use, this is CMB Catalog's version of caDSR "Tissue Type" aka CTDC "type_of_tissue".
Acceptable values taken straight from the CMB Catalog Site.
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: CMB + Enum: + - Metastatic + - Normal + - Primary + Req: 'Yes' + # icd_o_3_tissue_morphology: + # Desc: The coded result of analyzing the microscopic anatomy of normal and abnormal cells and tissues of the specimen by examining a thin slice (section) under a light (optical) or electron microscope. The code represents the histology of the disease using the third edition of the International Classification of Diseases for Oncology.
Acceptable values currently specified, none of which are sourced from actual data, are to be used only for generation of mock data.
CDE ID = 11326261 + # Term: + # - Origin: caDSR - CRDC + # Code: '11326261' + # Value: Specimen Tumor Tissue ICD-O-3 Morphology Code + # Src: DSS + # #Type: string + # # actually enumerated to a long list of ICD-O-3 codes, which is too long to specify here; we should instead just include the codes that will be seen in CMB data? + # Enum: + # - Acute Myeloid Leukemia - M3 eosinophilic variant + # - Acute Myeloid Leukemia - poorly-differentiated M1 variant + # - Acute Myeloid Leukemia - M5 granulocytic variant + # - Adenocarcinoma + # - Adenosquamous Carcinoma + # - High-grade Carcinoma + # - Invasive Adenocarcinoma + # - Large Cell Carcinoma + # - Metastatic Adenocarcinoma + # - Metastatic Adenosquamous Carcinoma + # - Metastatic High-grade Carcinoma + # - Metastatic Invasive Adenocarcinoma + # - Metastatic Large Cell Carcinoma + # - Metastatic Melanoma + # - Metastatic Non-Small Cell Carcinoma + # - Metastatic Prostate Carcinoma + # - Metastatic Small Cell Carcinoma + # - Metastatic Squamous Cell Carcinoma + # - Melanoma + # - Multiple Myeloma + # - Non-Small Cell Carcinoma + # - Plasma Cell Myeloma + # - Prostate Carcinoma + # - Small Cell Carcinoma + # - Squamous Cell Carcinoma + # Req: 'No' + assessment_timepoint: + Desc: CMB Catalog Site's "Collection Timepoint".
CDE ID = 7065963 + Term: + - Origin: caDSR + Code: '7065963' + Value: Data Element Name + Src: CMB + Enum: + - ARCHIVAL + - BASELINE + - ON TREATMENT + - PROGRESSION + Req: 'Yes' + collection_date: + Desc: From the CMB perspective, this may be equivalent to "days_from_diagnosis_to_specimen_collection" (now block commented out) (CDE ID = 11253404), with collection date indexed to enrollment date.
CDE ID 6401821 + Term: + - Origin: caDSR + Code: '6401821' + Value: Data Element Name + Src: CMB + Type: + units: + - days + value_type: number + Req: 'Yes' + # collection_method: + # Desc: text + # Term: + # - Origin: caDSR + # Code: 'code/ID' + # Value: Data Element Name + # Src: CMB + # Type: string + # Req: 'Yes' + # fixative: + # Desc: text + # Term: + # - Origin: caDSR + # Code: 'code/ID' + # Value: Data Element Name + # Src: CMB + # Enum: + # - EDTA + # - FFPE + # - Formalin + # - Streck + # Req: 'Yes' + # days_from_diagnosis_to_specimen_collection: + # Desc: The number of days from the date a sample was collected to the date of the initial pathologic diagnosis.
CDE ID = 11253404 ALL OF THIS IS INCORRECT BECAUSE THE INCORRECT CDE WAS REFERENCED; PER THE CORRECT CDE ID OF 11248865 THE PROPERTY DESCRIPTION SHOULD BE The number of days from the date the diagnosis was made to the date a sample was collected. + # Term: + # - Origin: caDSR - CRDC + # Code: '11253404' THIS IS INCORRECT, AND AS A RESULT, SO IS THE DESCRIPTION THE CORRECT CDE ID IS 11248865 + # Value: Specimen Diagnosis To Specimen Collection Day Count + # Src: DSS + # Type: + # units: + # - days + # value_type: number + # Req: 'No' + # days_from_first_subject_visit_to_specimen_collection: + # Desc: The number of days from the date the subject first visited to the date a sample was collected.
CDE ID = 11248874 + # Term: + # - Origin: caDSR - CRDC + # Code: '11248874' + # Value: Specimen First Subject Visit To Specimen Collection Day Count + # Src: DSS + # Type: + # units: + # - days + # value_type: number + # Req: 'No' + # days_from_first_treatment_to_specimen_collection: + # Desc: The number of days from the date the first treatment was administered to the date a sample was collected.
CDE ID = 11250807 + # Term: + # - Origin: caDSR - CRDC + # Code: '11250807' + # Value: Specimen First Treatment To Specimen Collection Day Count + # Src: DSS + # Type: + # units: + # - days + # value_type: number + # Req: 'No' + # days_from_initial_genomic_sequencing_to_specimen_collection: + # Desc: The number of days from the date the initial genomic sequencing was done to the date a sample was collected.
CDE ID = 11251130 + # Term: + # - Origin: caDSR - CRDC + # Code: '11251130' + # Value: Specimen Initial Genomic Sequencing To Specimen Collection Day Count + # Src: DSS + # Type: + # units: + # - days + # value_type: number + # Req: 'No' + # days_from_recurrence_to_specimen_collection: + # Desc: The number of days from the date the disease recurrence was identified to the date a sample was collected.
CDE ID = 11251133 + # Term: + # - Origin: caDSR-CRDC + # Code: '11251133' + # Value: Specimen Recurrence To Specimen Collection Day Count + # Src: DSS + # Type: + # units: + # - days + # value_type: number + # Req: 'No' + # days_from_specimen_collection_to_initial_pathologic_diagnosis: + # Desc: The number of days from the date a sample was collected to the date of the initial pathologic diagnosis.
CDE ID = 11253404 + # Term: + # - Origin: caDSR - CRDC + # Code: '11253404' + # Value: Specimen Collection To Initial Pathologic Diagnosis Day Count + # Src: DSS + # Type: + # units: + # - days + # value_type: number + # Req: 'No' + # data_file + data_file_name: + Desc: The literal label for an electronic data file.
CDE ID = 11284037 + Term: + - Origin: caDSR - CRDC + Code: '11284037' + Value: Electronic Data File Name + Src: DSS + Type: string + Req: 'Yes' + data_file_type: + Desc: A curated indicator as to the type of content represented by the data file
NOT CURRENTLY ASSIGNED ANY CDE + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Enum: + - Clinical Report + - Variant Call File + #- Investigator Report + - DNA Methylation Analysis File + - Index File + - RNA Sequence File + - Whole Exome Sequence File + Req: 'Yes' + data_file_description: + Desc: A free text field that can be used to document the content and other details about an electronic file that may not be captured elsewhere. For example, the file's derivation from a specimen of normal tissue versus tumor tissue.
CDE ID = 11280338 + Term: + - Origin: caDSR - CRDC + Code: '11280338' + Value: Electronic Data File Description Text + Src: DSS + Type: string + Req: 'Yes' + data_file_format: + Desc: A defined organization or layout representing and structuring data in a computer file; the electronic format of the file, as derived during file validation
With the actual values of this property being loader derived, the acceptable values currently specified are in place only to support the generation of mock data
CDE ID = 11416926 + Term: + - Origin: caDSR - CRDC + Code: '11416926' + Value: Electronic Data File Format Type + Src: DSS + #Type: string + Enum: + - bam + - bai + - vcf + - pdf + - docx + - xlsx + - tbi + - csv + - gz + Req: 'Yes' + data_file_size: + Desc: The measure, expressed in bytes, as to how much space a data file takes up on a storage medium.
CDE ID = 11479876 + Term: + - Origin: caDSR - CRDC + Code: '11479876' + Value: Electronic Data File Size Integer + Src: DSS + Type: number + Req: 'Yes' + data_file_checksum_value: + Desc: A small-sized block of data derived from a file for the purpose of detecting errors that may have been introduced during file transmission and/or storage and used to verify data integrity.
CDE ID = 11480133 + Term: + - Origin: caDSR - CRDC + Code: '11480133' + Value: Electronic Data File Checksum Value + Src: DSS + Type: string + Req: 'Yes' + data_file_checksum_type: + Desc: The method by which the file checksum was calculated.
CDE ID = 11475057 + Term: + - Origin: caDSR - CRDC + Code: '11475057' + Value: Electronic Data File Checksum Type + Src: DSS + Enum: + - md5sum + - sha1 + - sha256 + Req: 'Yes' + data_file_compression_status: + Desc: The state of data when saved to storage space or during data transmission.
CDE ID = 11387114 + Term: + - Origin: caDSR-CRDC + Code: '11387114' + Value: Electronic Data File Compression Type + Src: DSS + Enum: + - Compressed + - Uncompressed + - Unknown + Req: Preferred + data_file_uuid: + Desc: text NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' + data_file_location: + Desc: The specific location within the S3 storage bucket at which the file is stored, expressed in terms of a unique url
NO CDE REQUIRED + Term: + - Origin: caDSR + Code: 'code/ID' + Value: Data Element Name + Src: FNL + Type: string + Req: 'Yes' \ No newline at end of file diff --git a/config/es_indices.yml b/config/es_indices.yml new file mode 100644 index 00000000..c829bcce --- /dev/null +++ b/config/es_indices.yml @@ -0,0 +1,1165 @@ +# Indices settings +Indices: + # First index + # Name of the index to be created, existing index with same name will be deleted + - index_name: study + type: neo4j + # type mapping for each property of the index + mapping: + study_short_name: + type: keyword + study_id: + type: keyword + study_name: + type: keyword + study_description: + type: keyword + study_type: + type: keyword + dates_of_conduct: + type: keyword + subjects: + type: nested + properties: + biomarker_results_available: + type: keyword + histology_images_available: + type: keyword + radiology_images_available: + type: keyword + radiology_report_available: + type: keyword + subject_id: + type: keyword + participant_count: + type: keyword + associated_links: + type: nested + properties: + associated_link_name: + type: keyword + associated_link_url: + type: keyword + associated_link_id: + type: keyword + image_collection: + type: nested + properties: + image_collection_name: + type: keyword + image_type_included: + type: keyword + image_collection_url: + type: keyword + repository_name: + type: keyword + collection_access: + type: keyword + + + + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_query: " + MATCH (s:study) + optional MATCH (s)<-[:associated_with]-(ic:image_collection) + optional MATCH (s)<-[:associated_with]-(al:associated_link) + optional MATCH (s)<-[:belongs_to]-(subject) + RETURN DISTINCT + s.study_name as study_name, + s.study_short_name as study_short_name, + s.study_id as study_id, + s.study_description as study_description, + s.study_type as study_type, + s.dates_of_conduct as dates_of_conduct, + COLLECT(DISTINCT{ + 
associated_link_name: al.associated_link_name, + associated_link_url: al.associated_link_url, + associated_link_id: al.associated_link_id + }) AS associated_links, + COLLECT(DISTINCT{ + image_collection_name: ic.image_collection_name, + image_type_included: ic.image_type_included, + image_collection_url: ic.image_collection_url, + repository_name: ic.repository_name, + collection_access: ic.collection_access + }) AS image_collection, + COLLECT(DISTINCT{ + biomarker_results_available: subject.biomarker_results_available, + histology_images_available: subject.histology_images_available, + radiology_images_available: subject.radiology_images_available, + radiology_report_available: subject.radiology_report_available, + subject_id: subject.subject_id + })as subjects, + COUNT( DISTINCT subject) as participant_count + " + + - index_name: study_data_file + type: neo4j + # type mapping for each property of the index + mapping: + study_short_name: + type: keyword + list_type: + type: keyword + study_data_files: + type: nested + properties: + data_file_uuid: + type: keyword + data_file_name: + type: keyword + data_file_type: + type: keyword + data_file_description: + type: keyword + data_file_format: + type: keyword + data_file_size: + type: keyword + data_file_checksum_value: + type: keyword + data_file_checksum_type: + type: keyword + data_file_compression_status: + type: keyword + data_file_location: + type: keyword + data_files: + type: nested + properties: + data_file_uuid: + type: keyword + data_file_name: + type: keyword + data_file_type: + type: keyword + data_file_description: + type: keyword + data_file_format: + type: keyword + data_file_size: + type: keyword + data_file_checksum_value: + type: keyword + data_file_checksum_type: + type: keyword + data_file_compression_status: + type: keyword + data_file_location: + type: keyword + + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_query: " + MATCH 
(s:study)<-[:belongs_to]-(sb:subject) + MATCH (s:study)<-[:associated_with]-(study_file:data_file) + optional MATCH (sb:subject)<-[*..2]-(df:data_file) + RETURN DISTINCT + s.study_short_name as study_short_name, + COLLECT(DISTINCT{ + data_file_uuid: study_file.data_file_uuid, + data_file_name: study_file.data_file_name, + data_file_type: study_file.data_file_type, + data_file_description: study_file.data_file_description, + data_file_format: study_file.data_file_format, + data_file_size: study_file.data_file_size, + data_file_checksum_value: study_file.data_file_checksum_value, + data_file_checksum_type: study_file.data_file_checksum_type, + data_file_compression_status: study_file.data_file_compression_status, + data_file_location: study_file.data_file_location + }) AS study_data_files, + COLLECT(DISTINCT{ + data_file_uuid: df.data_file_uuid, + data_file_name: df.data_file_name, + data_file_type: df.data_file_type, + data_file_description: df.data_file_description, + data_file_format: df.data_file_format, + data_file_size: df.data_file_size, + data_file_checksum_value: df.data_file_checksum_value, + data_file_checksum_type: df.data_file_checksum_type, + data_file_compression_status: df.data_file_compression_status, + data_file_location: df.data_file_location + }) AS data_files, + COLLECT(DISTINCT df.data_file_type) as list_type + " + # Name of the index to be created, existing index with same name will be deleted + - index_name: study_specimen + type: neo4j + # type mapping for each property of the index + mapping: + study_short_name: + type: keyword + specimen_types: + type: nested + properties: + group: + type: keyword + count: + type: keyword + specimen_timepoints: + type: nested + properties: + group: + type: keyword + count: + type: keyword + sample_count: + type: keyword + specimen: + type: nested + properties: + specimen_id: + type: keyword + parent_specimen_id: + type: keyword + collection_date: + type: keyword + icd_o_3_tissue_morphology: + type: keyword 
+ obi_specimen_type: + type: keyword + type_of_tissue: + type: keyword + anatomical_collection_site: + type: keyword + parent_specimen_type: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + cypher_query: " + MATCH (s:study)<-[:belongs_to]-(subject)<-[:of_subject]-(sp:specimen) + WITH COUNT(sp) AS sample_count + MATCH (s:study)<-[:belongs_to]-(subject)<-[:of_subject]-(sp:specimen) + WITH DISTINCT sp.specimen_type AS biospecimen_type, COUNT(sp) AS count1, sample_count + + MATCH (s:study)<-[:belongs_to]-(subject)<-[:of_subject]-(sp:specimen) + WITH DISTINCT sp.assessment_timepoint AS assessment_timepoint, COUNT(sp) AS count2, biospecimen_type, count1, sample_count + MATCH (s:study)<-[:belongs_to]-(subject)<-[:of_subject]-(sp:specimen) + WITH sample_count, biospecimen_type, count1, assessment_timepoint, count2, s, sp + RETURN DISTINCT + s.study_short_name AS study_short_name, + COLLECT(DISTINCT{group: assessment_timepoint, count: count2}) AS specimen_timepoints, + COLLECT(DISTINCT{group: biospecimen_type, count: count1}) AS specimen_types, + COLLECT(DISTINCT{ + specimen_id:sp.specimen_id, + parent_specimen_id:sp.parent_specimen_id, + collection_date:sp.collection_date, + icd_o_3_tissue_morphology: sp.icd_o_3_tissue_morphology, + obi_specimen_type: sp.obi_specimen_type, + type_of_tissue: sp.type_of_tissue, + anatomical_collection_site: sp.anatomical_collection_site, + parent_specimen_type: sp.parent_specimen_type, + specimen_type: sp.specimen_type, + tissue_category: sp.tissue_category, + assessment_timepoint: sp.assessment_timepoint + }) as specimen, + sample_count + " + + - index_name: study_diagnosis + type: neo4j + # type mapping for each property of the index + mapping: + study_short_name: + type: keyword + diagnosis: + type: nested + properties: + ctep_disease_term: + type: keyword + diagnosis_date: + type: keyword + diagnosis_date_original: + type: keyword + diagnosis_date_original_unit: 
+ type: keyword + diagnosis_date_unit: + type: keyword + diagnosis_id: + type: keyword + icd_10_disease_code: + type: keyword + icd_o_primary_site: + type: keyword + meddra_disease_code: + type: keyword + primary_disease_site: + type: keyword + snomed_disease_code: + type: keyword + stage_of_disease: + type: keyword + subject_age_at_diagnosis: + type: keyword + subject_age_at_diagnosis_original: + type: keyword + subject_age_at_diagnosis_original_unit: + type: keyword + subject_age_at_diagnosis_unit: + type: keyword + tumor_grade: + type: keyword + ctep_disease_terms: + type: keyword + + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_query: " + MATCH (s:study) + optional MATCH (s:study)<-[:belongs_to]-(subject)<-[:of_subject]-(sd:diagnosis) + RETURN DISTINCT + s.study_short_name AS study_short_name, + COLLECT(DISTINCT sd.ctep_disease_term) as ctep_disease_terms, + COLLECT(DISTINCT{ctep_disease_term: sd.ctep_disease_term, + diagnosis_date: sd.diagnosis_date, + diagnosis_date_original: sd.diagnosis_date_original, + diagnosis_date_original_unit: sd.diagnosis_date_original_unit, + diagnosis_date_unit: sd.diagnosis_date_unit, + diagnosis_id: sd.diagnosis_id, + icd_10_disease_code: sd.icd_10_disease_code, + icd_o_primary_site: sd.icd_o_primary_site, + meddra_disease_code: sd.meddra_disease_code, + primary_disease_site: sd.primary_disease_site, + snomed_disease_code: sd.snomed_disease_code, + stage_of_disease: sd.stage_of_disease, + subject_age_at_diagnosis: sd.subject_age_at_diagnosis, + subject_age_at_diagnosis_original: sd.subject_age_at_diagnosis_original, + subject_age_at_diagnosis_original_unit: sd.subject_age_at_diagnosis_original_unit, + subject_age_at_diagnosis_unit: sd.subject_age_at_diagnosis_unit, + tumor_grade: sd.tumor_grade}) as diagnosis + " + # Supplies Hero Image information for the front page + - index_name: home_page + type: neo4j + # type mapping for each property of the index + mapping: + 
numberOfParticipants: + type: keyword + numberOfDiagnoses: + type: keyword + numberOfTargeted: + type: keyword + numberOfNonTargeted: + type: keyword + numberOfTherapies: + type: keyword + dataFileCount: + type: keyword + specimenCountbyStageOfDisease: + type: nested + properties: + group: + type: keyword + subjects: + type: keyword + dataFileByType: + type: nested + properties: + group: + type: keyword + subjects: + type: keyword + + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_query: " + MATCH (sub:subject)<-[*..2]-(df:data_file) + WITH + { + group: df.data_file_type, + subjects:count(df.data_file_type)} + as dataFileByType + MATCH (df:data_file) WHERE df.data_file_format <> 'zip' with dataFileByType , count(df) as dataFileCount + MATCH (sb:subject) WITH count(sb) as subjectcount,dataFileByType,dataFileCount + MATCH (target:targeted_therapy) WHERE target.targeted_therapy <> '' WITH target,subjectcount,dataFileByType,dataFileCount + MATCH (untargeted:non_targeted_therapy) WITH count(DISTINCT(untargeted.targeted_therapy)) as untargetedTherapyCount ,subjectcount,target,dataFileByType,dataFileCount + WITH subjectcount,count(distinct(target.targeted_therapy)) as targetTherapyCount,dataFileByType,dataFileCount,untargetedTherapyCount + + MATCH (diag:diagnosis) with + count(distinct(diag.ctep_disease_term)) as countofDiagnoses ,untargetedTherapyCount,targetTherapyCount,subjectcount,untargetedTherapyCount + targetTherapyCount as TotalTherapy,dataFileByType,dataFileCount + + MATCH (sp:specimen)-[:of_subject]->(sb:subject)<-[:of_subject]-(diag:diagnosis) + WITH targetTherapyCount,countofDiagnoses,subjectcount,untargetedTherapyCount,TotalTherapy,dataFileByType,dataFileCount, + { + group: diag.stage_of_disease, + subjects:count(diag.stage_of_disease)} + as specimenCountbyStageOfDisease + + RETURN DISTINCT + + TotalTherapy as numberOfTherapies, + subjectcount as numberOfParticipants, + countofDiagnoses as numberOfDiagnoses, + 
untargetedTherapyCount as numberOfNonTargeted, + targetTherapyCount as numberOfTargeted, + dataFileCount as dataFileCount, + apoc.coll.toSet(COLLECT(specimenCountbyStageOfDisease)) as specimenCountbyStageOfDisease, + apoc.coll.toSet(COLLECT(dataFileByType)) as dataFileByType + " + + - index_name: widgets_facets_counts + type: neo4j + # Widgets,Facets,and Global Stats bar + mapping: + type: + type: keyword + study_short_name: + type: keyword + ctep_disease_term: + type: keyword + snomed_disease_code: + type: keyword + tumor_grade: + type: keyword + stage_of_disease: + type: keyword + diagnosis_id: + type: keyword + sex: + type: keyword + reported_gender: + type: keyword + race: + type: keyword + ethnicity: + type: keyword + carcinogen_exposure: + type: keyword + targeted_therapy: + type: keyword + targeted_therapy_id: + type: keyword + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + specimen_id: + type: keyword + parent_specimen_type: + type: keyword + data_file_uuid: + type: keyword + data_file_type: + type: keyword + data_file_format: + type: keyword + biospecimen_info: + type: nested + properties: + specimen_id: + type: keyword + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + file_info: + type: nested + properties: + data_file_uuid: + type: keyword + data_file_type: + type: keyword + data_file_format: + type: keyword + subject_id: + type: keyword + # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch + cypher_query: " + MATCH (study:study)<-[:belongs_to]-(sb:subject) + optional MATCH (spec:specimen)-[:of_subject]->(sb) + optional MATCH (sb)<-[*..2]-(data_file:data_file) + optional MATCH (demo:demographic)-[:of_subject]->(sb) + optional MATCH (diag:diagnosis)-[:of_subject]->(sb) + optional MATCH 
(target:targeted_therapy)-[:of_subject]->(sb) + optional MATCH (expose:exposure)-[:of_subject]->(sb) + RETURN DISTINCT + 'participants' as type, + sb.subject_id as subject_id, + COLLECT(DISTINCT study.study_short_name) as study_short_name, + COLLECT(DISTINCT diag.ctep_disease_term) as ctep_disease_term, + COLLECT(DISTINCT diag.snomed_disease_code) as snomed_disease_code, + COLLECT(DISTINCT diag.tumor_grade) as tumor_grade, + COLLECT(DISTINCT diag.stage_of_disease) as stage_of_disease, + COLLECT(DISTINCT diag.diagnosis_id) as diagnosis_id, + COLLECT(DISTINCT demo.sex) as sex, + COLLECT(DISTINCT demo.reported_gender) as reported_gender, + COLLECT(DISTINCT demo.race) as race, + COLLECT(DISTINCT demo.ethnicity) as ethnicity, + COLLECT(DISTINCT expose.carcinogen_exposure) as carcinogen_exposure, + COLLECT(DISTINCT target.targeted_therapy) as targeted_therapy, + COLLECT(DISTINCT target.targeted_therapy_id) as targeted_therapy_id, + COLLECT(DISTINCT spec.anatomical_collection_site) AS anatomical_collection_site, + COLLECT(DISTINCT spec.specimen_type) AS specimen_type, + COLLECT(DISTINCT spec.tissue_category) AS tissue_category, + COLLECT(DISTINCT spec.assessment_timepoint) AS assessment_timepoint, + COLLECT(DISTINCT spec.specimen_id) AS specimen_id, + COLLECT(DISTINCT COALESCE(spec.parent_specimen_type, '')) AS parent_specimen_type, + COLLECT(DISTINCT data_file.data_file_type) AS data_file_type, + COLLECT(DISTINCT data_file.data_file_uuid) AS data_file_uuid, + COLLECT(DISTINCT data_file.data_file_format) AS data_file_format, + COLLECT(DISTINCT{ + specimen_id: spec.specimen_id, + anatomical_collection_site: spec.anatomical_collection_site, + specimen_type: spec.specimen_type, + tissue_category: spec.tissue_category, + assessment_timepoint: spec.assessment_timepoint + }) AS biospecimen_info, + COLLECT(DISTINCT{ + data_file_uuid: data_file.data_file_uuid, + data_file_format: data_file.data_file_format, + data_file_type: data_file.data_file_type + }) AS file_info + " + # 
Participant Table Data + - index_name: tab_participants + type: neo4j + # type mapping for each property of the index + mapping: + type: + type: keyword + subject_id: + type: keyword + study_short_name: + type: keyword + ctep_disease_term: + type: keyword + stage_of_disease: + type: keyword + tumor_grade: + type: keyword + age_at_enrollment: + type: integer + sex: + type: keyword + reported_gender: + type: keyword + race: + type: keyword + ethnicity: + type: keyword + carcinogen_exposure: + type: keyword + targeted_therapy: + type: keyword + data_files: + type: nested + properties: + data_file_uuid: + type: keyword + data_file_name: + type: keyword + data_file_type: + type: keyword + data_file_description: + type: keyword + data_file_format: + type: keyword + data_file_size: + type: keyword + data_file_checksum_value: + type: keyword + data_file_checksum_type: + type: keyword + data_file_compression_status: + type: keyword + data_file_location: + type: keyword + data_file_uuid: + type: keyword + specimen_id: + type: keyword + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + + # Facets section: Data Files + data_file_type: + type: keyword + data_file_format: + type: keyword + cypher_query: " + MATCH (study:study)<-[:belongs_to]-(sb:subject) + optional MATCH (sb:subject)<-[*..2]-(df:data_file) + optional MATCH (sb)<-[:of_subject]-(diag:diagnosis) + optional MATCH (sb)<-[:of_subject]-(spec:specimen) + optional MATCH (sb)<-[:of_subject]-(demo:demographic) + optional MATCH (sb)<-[:of_subject]-(exp:exposure) + optional MATCH (sb)<-[:of_subject]-(tt:targeted_therapy) + RETURN DISTINCT + 'participants' AS type, + sb.subject_id AS subject_id, + study.study_short_name as study_short_name, + diag.ctep_disease_term AS ctep_disease_term, + diag.stage_of_disease AS stage_of_disease, + diag.tumor_grade AS tumor_grade, + demo.age_at_enrollment as age_at_enrollment, + demo.sex 
AS sex, + demo.reported_gender AS reported_gender, + demo.race AS race, + demo.ethnicity AS ethnicity, + COALESCE(exp.carcinogen_exposure,'') AS carcinogen_exposure, + COLLECT(DISTINCT(tt.targeted_therapy)) AS targeted_therapy, + COLLECT(DISTINCT{ + data_file_uuid: df.data_file_uuid, + data_file_name: df.data_file_name, + data_file_type: df.data_file_type, + data_file_description: df.data_file_description, + data_file_format: df.data_file_format, + data_file_size: df.data_file_size, + data_file_checksum_value: df.data_file_checksum_value, + data_file_checksum_type: df.data_file_checksum_type, + data_file_compression_status: df.data_file_compression_status, + data_file_location: df.data_file_location + }) AS data_files, + + COLLECT(DISTINCT(df.data_file_uuid)) AS data_file_uuid, + + COLLECT(DISTINCT(spec.specimen_id)) AS specimen_id, + COLLECT(DISTINCT(spec.anatomical_collection_site)) AS anatomical_collection_site, + COLLECT(DISTINCT(spec.specimen_type)) AS specimen_type, + COLLECT(DISTINCT(spec.tissue_category)) AS tissue_category, + COLLECT(DISTINCT(spec.assessment_timepoint)) AS assessment_timepoint, + + COLLECT(DISTINCT(df.data_file_type)) AS data_file_type, + COLLECT(DISTINCT(df.data_file_format)) AS data_file_format + + ORDER BY COALESCE(sb.subject_id, '') ASC" + + # Biospecimen Table Data + - index_name: tab_biospecimens + type: neo4j + mapping: + type: + type: keyword + subject_id: + type: keyword + study_short_name: + type: keyword + ctep_disease_term: + type: keyword + stage_of_disease: + type: keyword + primary_disease_site: + type: keyword + specimen_id: + type: keyword + parent_specimen_id: + type: keyword + parent_specimen_type: + type: keyword + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + data_files: + type: nested + properties: + data_file_uuid: + type: keyword + data_file_name: + type: keyword + data_file_type: + type: keyword + 
data_file_description: + type: keyword + data_file_format: + type: keyword + data_file_size: + type: keyword + data_file_checksum_value: + type: keyword + data_file_checksum_type: + type: keyword + data_file_compression_status: + type: keyword + data_file_location: + type: keyword + biospecimen_info: + type: nested + properties: + parent_specimen_id: + type: keyword + specimen_id: + type: keyword + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + data_file_uuid: + type: keyword + + tumor_grade: + type: keyword + sex: + type: keyword + reported_gender: + type: keyword + race: + type: keyword + ethnicity: + type: keyword + carcinogen_exposure: + type: keyword + targeted_therapy: + type: keyword + + data_file_type: + type: keyword + data_file_format: + type: keyword + + # Cypher query for biospecimen listing + # TODO: Remove Collection over exposure property once 1:1 + # TODO: Make sure diagnosis is 1:1 with subject + cypher_query: " + MATCH (spec:specimen)-[:of_subject]->(sub:subject) + optional MATCH (study:study)<-[:belongs_to]-(sub) + optional MATCH (sub)<-[:of_subject]-(diag:diagnosis) + optional MATCH (sub)<-[:of_subject]-(demo:demographic) + optional MATCH (sub)<-[:of_subject]-(exp:exposure) + optional MATCH (sub)<-[:of_subject]-(tt:targeted_therapy) + optional MATCH (sub)<-[*..2]-(df:data_file) + RETURN DISTINCT + 'biospecimens' AS type, + sub.subject_id AS subject_id, + diag.ctep_disease_term AS ctep_disease_term, + diag.stage_of_disease AS stage_of_disease, + diag.primary_disease_site AS primary_disease_site, + spec.specimen_id AS specimen_id, + spec.parent_specimen_id AS parent_specimen_id, + spec.parent_specimen_type AS parent_specimen_type, + spec.anatomical_collection_site AS anatomical_collection_site, + spec.specimen_type AS specimen_type, + spec.tissue_category AS tissue_category, + spec.assessment_timepoint AS assessment_timepoint, + 
study.study_short_name as study_short_name, + COLLECT(DISTINCT{ + data_file_uuid: df.data_file_uuid, + data_file_name: df.data_file_name, + data_file_type: df.data_file_type, + data_file_description: df.data_file_description, + data_file_format: df.data_file_format, + data_file_size: df.data_file_size, + data_file_checksum_value: df.data_file_checksum_value, + data_file_checksum_type: df.data_file_checksum_type, + data_file_compression_status: df.data_file_compression_status, + data_file_location: df.data_file_location + }) AS data_files, + COLLECT(DISTINCT{ + parent_specimen_id: spec.parent_specimen_id, + specimen_id: spec.specimen_id, + anatomical_collection_site: spec.anatomical_collection_site, + specimen_type: spec.specimen_type, + tissue_category: spec.tissue_category, + assessment_timepoint: spec.assessment_timepoint + }) AS biospecimen_info, + COLLECT(DISTINCT(df.data_file_uuid)) AS data_file_uuid, + + COLLECT(DISTINCT(diag.tumor_grade )) AS tumor_grade, + demo.sex AS sex, + demo.reported_gender AS reported_gender, + demo.race AS race, + demo.ethnicity AS ethnicity, + COLLECT(DISTINCT(exp.carcinogen_exposure)) AS carcinogen_exposure, + COLLECT(DISTINCT(tt.targeted_therapy)) AS targeted_therapy, + + COLLECT(DISTINCT(df.data_file_type)) AS data_file_type, + COLLECT(DISTINCT(df.data_file_format)) AS data_file_format + + ORDER BY COALESCE(spec.specimen_id, '') ASC" + # File Table Data, Add files into cart (For Participant Tab, Biospecimen Tab, and File Tab) + - index_name: tab_data_files + type: neo4j + # type mapping for each property of the index + mapping: + type: + type: keyword + data_file_name: + type: keyword + data_file_format: + type: keyword + data_file_type: + type: keyword + data_file_size: + type: keyword + data_file_description: + type: keyword + data_file_checksum_value: + type: keyword + data_file_checksum_type: + type: keyword + data_file_location: + type: keyword + data_file_compression_status: + type: keyword + association: + type: keyword + 
subject_id: + type: keyword + specimen_id: + type: keyword + ctep_disease_term: + type: keyword + meddra_disease_code: + type: keyword + histology: + type: keyword + data_file_uuid: + type: keyword + stage_of_disease: + type: keyword + tumor_grade: + type: keyword + sex: + type: keyword + reported_gender: + type: keyword + race: + type: keyword + ethnicity: + type: keyword + age_at_enrollment: + type: keyword + carcinogen_exposure: + type: keyword + targeted_therapy: + type: keyword + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + file_info: + type: nested + properties: + data_file_uuid: + type: keyword + data_file_type: + type: keyword + data_file_format: + type: keyword + parent_specimen_id: + type: keyword + primary_disease_site: + type: keyword + cypher_query: " + MATCH (sub:subject)<-[*..2]-(parent)<--(f:data_file) + optional MATCH (f:data_file)-[:associated_with]->(spec:specimen) + optional MATCH (sub)<-[:of_subject]-(diag:diagnosis) + optional MATCH (sub)<-[:of_subject]-(demo:demographic) + optional MATCH (sub)<-[:of_subject]-(exp:exposure) + optional MATCH (sub)<-[:of_subject]-(tt:targeted_therapy) + RETURN DISTINCT + + 'data file' AS type, + f.data_file_name AS data_file_name, + f.data_file_format AS data_file_format, + f.data_file_type AS data_file_type, + f.data_file_size AS data_file_size, + f.data_file_uuid AS data_file_uuid, + f.data_file_description AS data_file_description, + f.data_file_checksum_value AS data_file_checksum_value, + f.data_file_checksum_type AS data_file_checksum_type, + f.data_file_location AS data_file_location, + f.data_file_compression_status AS data_file_compression_status, + 'biospecimen' AS association, + + + COALESCE(spec.specimen_id, ' ') as specimen_id, + sub.subject_id AS subject_id, + + diag.ctep_disease_term AS ctep_disease_term, + + diag.stage_of_disease AS stage_of_disease, + diag.tumor_grade AS tumor_grade, + 
diag.primary_disease_site as primary_disease_site, + diag.meddra_disease_code as meddra_disease_code, + diag.histology as histology, + + demo.sex AS sex, + demo.reported_gender AS reported_gender, + demo.race AS race, + demo.ethnicity AS ethnicity, + demo.age_at_enrollment AS age_at_enrollment, + + COLLECT(DISTINCT(exp.carcinogen_exposure)) AS carcinogen_exposure, + COLLECT(DISTINCT(tt.targeted_therapy)) AS targeted_therapy, + + spec.anatomical_collection_site AS anatomical_collection_site, + spec.specimen_type AS specimen_type, + spec.tissue_category AS tissue_category, + spec.assessment_timepoint AS assessment_timepoint, + spec.parent_specimen_id AS parent_specimen_id, + + COLLECT(DISTINCT{ + data_file_uuid: f.data_file_uuid, + data_file_format: f.data_file_format, + data_file_type: f.data_file_type + }) AS file_info + + UNION + + MATCH (sub:subject)<-[:associated_with]-(f:data_file) + optional MATCH (f:data_file)-[:associated_with]->(spec:specimen) + optional MATCH (sub)<-[:of_subject]-(diag:diagnosis) + optional MATCH (sub)<-[:of_subject]-(demo:demographic) + optional MATCH (sub)<-[:of_subject]-(exp:exposure) + optional MATCH (sub)<-[:of_subject]-(tt:targeted_therapy) + + RETURN DISTINCT + 'data file' AS type, + f.data_file_name AS data_file_name, + f.data_file_format AS data_file_format, + f.data_file_type AS data_file_type, + f.data_file_size AS data_file_size, + f.data_file_uuid AS data_file_uuid, + f.data_file_description AS data_file_description, + f.data_file_checksum_value AS data_file_checksum_value, + f.data_file_checksum_type AS data_file_checksum_type, + f.data_file_location AS data_file_location, + f.data_file_compression_status AS data_file_compression_status, + 'participant' AS association, + COALESCE(spec.specimen_id, '') as specimen_id, + sub.subject_id AS subject_id, + + + diag.stage_of_disease AS stage_of_disease, + diag.tumor_grade AS tumor_grade, + diag.primary_disease_site as primary_disease_site, + diag.ctep_disease_term as 
ctep_disease_term, + diag.meddra_disease_code as meddra_disease_code, + diag.histology as histology, + + demo.sex AS sex, + demo.reported_gender AS reported_gender, + demo.race AS race, + demo.ethnicity AS ethnicity, + demo.age_at_enrollment AS age_at_enrollment, + + + COLLECT(DISTINCT(exp.carcinogen_exposure)) AS carcinogen_exposure, + COLLECT(DISTINCT(tt.targeted_therapy)) AS targeted_therapy, + + spec.anatomical_collection_site AS anatomical_collection_site, + spec.specimen_type AS specimen_type, + spec.tissue_category AS tissue_category, + spec.assessment_timepoint AS assessment_timepoint, + spec.parent_specimen_id AS parent_specimen_id, + + COLLECT(DISTINCT{ + data_file_uuid: f.data_file_uuid, + data_file_format: f.data_file_format, + data_file_type: f.data_file_type + }) AS file_info " + #Handles datafiles only related to Biospecimen + - index_name: biospecimen_data_file + type: neo4j + # type mapping for each property of the index + mapping: + type: + type: keyword + data_file_name: + type: keyword + data_file_format: + type: keyword + data_file_type: + type: keyword + data_file_size: + type: keyword + data_file_uuid: + type: keyword + data_file_description: + type: keyword + + subject_id: + type: keyword + specimen_id: + type: keyword + ctep_disease_term: + type: keyword + stage_of_disease: + type: keyword + tumor_grade: + type: keyword + sex: + type: keyword + reported_gender: + type: keyword + race: + type: keyword + ethnicity: + type: keyword + carcinogen_exposure: + type: keyword + targeted_therapy: + type: keyword + + # Facets section: Biospecimens + anatomical_collection_site: + type: keyword + specimen_type: + type: keyword + tissue_category: + type: keyword + assessment_timepoint: + type: keyword + cypher_query: " + MATCH (sub:subject)<-[*..2]-(parent)<--(f:data_file) + optional MATCH (f:data_file)-[:associated_with]->(study:study) + optional MATCH (f:data_file)-[:associated_with]->(spec:specimen) + MATCH (sub)<-[:of_subject]-(diag:diagnosis) + 
MATCH (sub)<-[:of_subject]-(demo:demographic) + optional MATCH (sub)<-[:of_subject]-(exp:exposure) + MATCH (sub)<-[:of_subject]-(tt:targeted_therapy) + + RETURN DISTINCT + 'data file' AS type, + f.data_file_name AS data_file_name, + f.data_file_format AS data_file_format, + f.data_file_type AS data_file_type, + f.data_file_size AS data_file_size, + f.data_file_uuid AS data_file_uuid, + f.data_file_description AS data_file_description, + + spec.specimen_id AS specimen_id, + sub.subject_id AS subject_id, + + diag.ctep_disease_term AS ctep_disease_term, + + diag.stage_of_disease AS stage_of_disease, + diag.tumor_grade AS tumor_grade, + demo.sex AS sex, + demo.reported_gender AS reported_gender, + demo.race AS race, + demo.ethnicity AS ethnicity, + + COLLECT(DISTINCT(exp.carcinogen_exposure)) AS carcinogen_exposure, + COLLECT(DISTINCT(tt.targeted_therapy)) AS targeted_therapy, + + spec.anatomical_collection_site AS anatomical_collection_site, + spec.specimen_type AS specimen_type, + spec.tissue_category AS tissue_category, + spec.assessment_timepoint AS assessment_timepoint" + #lists for GS hashmap + - index_name: gs_list + type: neo4j + # type mapping for each property of the index + mapping: + autocomplete_list: + type: keyword + cypher_query: " + MATCH (spec:specimen)-[:of_subject]->(sub:subject) + optional MATCH (study:study)<-[:belongs_to]-(sub) + optional MATCH (sub)<-[:of_subject]-(diag:diagnosis) + WITH COLLECT(diag.stage_of_disease) as stage,COLLECT(diag.ctep_disease_term) as ctep,COLLECT(Distinct(sub.subject_id)) as subjectID,COLLECT(Distinct(spec.specimen_id)) as specimenID + WITH stage,ctep, stage + ctep + subjectID + specimenID as list + UNWIND list as autocomplete_list + RETURN distinct + + autocomplete_list + " + - index_name: therapy_count + type: neo4j + # type mapping for each property of the index + mapping: + targeted_therapy: + type: keyword + cypher_query: " + MATCH (study:study)<-[:belongs_to]-(sb:subject) + optional MATCH 
(sb)<-[:of_subject]-(tt:targeted_therapy) + WHERE tt is not null + with tt + WHERE tt.targeted_therapy <> '' + + RETURN + tt.targeted_therapy as targeted_therapy + " + - index_name: about_page + type: about_file + # type mapping for each property of the index + mapping: + page: + type: keyword + title: + type: keyword + primaryContentImage: + type: text + content: + type: object + + + - index_name: model_nodes + type: model + subtype: node + # type mapping for each property of the index + mapping: + node: + type: keyword + node_kw: + type: keyword + #Handles information stored in model file + + - index_name: model_properties + type: model + subtype: property + # type mapping for each property of the index + mapping: + node: + type: keyword + property: + type: keyword + property_kw: + type: keyword + property_description: + type: keyword + property_required: + type: keyword + property_type: + type: keyword + #Handles information stored in model file + + - index_name: model_values + type: model + subtype: value_kw + # type mapping for each property of the index + mapping: + node: + type: keyword + property: + type: keyword + property_description: + type: keyword + property_required: + type: keyword + property_type: + type: keyword + value_kw: + type: keyword + + + \ No newline at end of file diff --git a/config/es_loader.example.yml b/config/es_loader.example.yml index e053c163..4d613e3f 100644 --- a/config/es_loader.example.yml +++ b/config/es_loader.example.yml @@ -1,6 +1,6 @@ Config: # Neo4j URL with port number - neo4j_uri: "bolt://127.0.0.1:7687" + neo4j_uri: 'bolt://127.0.0.1:7687' # Neo4j user name, default is neo4j neo4j_user: neo4j # Password for Neo4j user @@ -9,9 +9,11 @@ Config: es_host: localhost # Path to about file about_file: path_to_about_yaml_file + # Boolean flag indicating whether to apply formatting cleaning to the content from the about/static page. 
+ clean_about_page_format: True model_files: - bento-model/model-desc/bento_tailorx_model_file.yaml - bento-model/model-desc/bento_tailorx_model_properties.yaml - prop_file: config/props-bento-ext.yml \ No newline at end of file + prop_file: config/props-bento-ext.yml diff --git a/config/es_loader.yml b/config/es_loader.yml new file mode 100644 index 00000000..d681fcff --- /dev/null +++ b/config/es_loader.yml @@ -0,0 +1,16 @@ +Config: + # Neo4j URL with port number + # Neo4j user name, default is neo4j + es_host: localhost + + # Path to about file + about_file: /Users/davenportaw/Downloads/cmb-data/aboutPagesContent.yml + + model_files: + - /Users/davenportaw/Downloads/cmb-data/ctdc_model_file.yaml + - /Users/davenportaw/Downloads/cmb-data/ctdc_model_properties_file.yaml + + prop_file: /Users/davenportaw/Projects/CTDCdataloader/crdc-ctdc-dataloader/config/props-ctdc.yml + + + #./loader.py /Users/davenportaw/Downloads/cmb-data/ctdc-local.yml --data /Users/davenportaw/Downloads/cmb-data/2024-2-5 \ No newline at end of file diff --git a/config/props-ctdc-cmb.yml b/config/props-ctdc-cmb.yml new file mode 100644 index 00000000..bce3901f --- /dev/null +++ b/config/props-ctdc-cmb.yml @@ -0,0 +1,50 @@ +Properties: + domain: trialcommons.cancer.gov + rel_prop_delimiter: "$" + + plurals: + specimen: specimens + study: studies + project: projects + program: programs + associated_link: associated_links + image_collection: image_collections + subject: subjects + demographic: demographics + exposure: exposures + diagnosis: diagnoses + targeted_therapy: targeted_therapies + therapy: therapies + surgical_procedure: surgical_procedures + radiological_procedure: radiological_procedures + subject_status: subject_status + data_file: data_files + principal_investigator: principal_investigators + + type_mapping: + string: String + number: Float + integer: Int + boolean: Boolean + array: Array + object: Object + datetime: DateTime + date: Date + TBD: String + + id_fields: + specimen: 
parent_specimen_id + study: study_short_name + associated_link: associated_link_name + image_collection: image_collection_id + subject: subject_id + demographic: demographic_id + exposure: exposure_id + diagnosis: diagnosis_id + targeted_therapy: targeted_therapy_id + non_targeted_therapy: non_targeted_therapy_id + surgery: surgical_procedure_id + radiotherapy: radiological_procedure_id + subject_status: subject_status_id + data_file: data_file_uuid + principal_investigator: principal_investigator_orcid_id \ No newline at end of file diff --git a/es_loader.py b/es_loader.py index 76f21752..a4e47302 100755 --- a/es_loader.py +++ b/es_loader.py @@ -12,6 +12,7 @@ from bento.common.utils import get_logger, print_config from icdc_schema import ICDC_Schema, PROPERTIES, ENUM, PROP_ENUM, PROP_TYPE, REQUIRED, DESCRIPTION from props import Props +from about_page_content_cleaner import AboutPageContentCleaner logger = get_logger('ESLoader') @@ -93,7 +94,7 @@ def bulk_load(self, index_name, data): successes += 1 if ok else 0 logger.info(f"Indexed {successes}/{total} documents") - def load_about_page(self, index_name, mapping, file_name): + def load_about_page(self, index_name, mapping, file_name, clean_about_page_format): logger.info('Indexing content from about page') if not os.path.isfile(file_name): raise Exception(f'"{file_name} is not a file!') @@ -103,8 +104,16 @@ def load_about_page(self, index_name, mapping, file_name): about_file = yaml.safe_load(file_obj) for page in about_file: logger.info(f'Indexing about page "{page["page"]}"') + cleaned_content = page['content'] + if clean_about_page_format: + cleaned_content = self.remove_formatting_content(page["page"], cleaned_content) + page['content'] = cleaned_content self.index_data(index_name, page, f'page{page["page"]}') - + + def remove_formatting_content(self, page_name, content): + # Call remove_formatting_content from AboutPageContentCleaner + return AboutPageContentCleaner.remove_formatting_content(page_name, 
content) + def read_model(self, model_files, prop_file): for file_name in model_files: if not os.path.isfile(file_name): @@ -207,7 +216,8 @@ def main(): loader.load(index['index_name'], index['mapping'], index['cypher_query']) elif index['type'] == 'about_file': if 'about_file' in config: - loader.load_about_page(index['index_name'], index['mapping'], config['about_file']) + clean_about_page_format = config.get('clean_about_page_format', False) + loader.load_about_page(index['index_name'], index['mapping'], config['about_file'], clean_about_page_format) else: logger.warning(f'"about_file" not set in configuration file, {index["index_name"]} will not be loaded!') elif index['type'] == 'model':