# es_indices_ccdi_model.yml

# Indices settings
Indices:
  # Elasticsearch index fed from Neo4j by the cypher_query defined below.
  - index_name: study_participants
    type: neo4j
    # Field mapping; every key here matches a column returned by the query.
    mapping:
      # id and pid are both filled from p.id on participant rows.
      id:
        type: keyword
      pid:
        type: keyword
      participant_id:
        type: keyword
      # race and ethnicity arrive as lists: the query splits them on ';'.
      race:
        type: keyword
      sex_at_birth:
        type: keyword
      ethnicity:
        type: keyword
      # Nested objects built by the query from diagnosis nodes (dg.*).
      diagnosis_filters:
        type: nested
        properties:
          age_at_diagnosis:
            type: integer
          diagnosis_anatomic_site:
            type: keyword
          disease_phase:
            type: keyword
          diagnosis_classification_system:
            type: keyword
          diagnosis_verification_status:
            type: keyword
          diagnosis_basis:
            type: keyword
          diagnosis_comment:
            type: keyword
          diagnosis_classification:
            type: keyword
      vital_status:
        type: keyword
      # Nested objects combining sample attributes with the attributes of a
      # file; the query concatenates several collected lists into this field.
      sample_file_filters:
        type: nested
        properties:
          sample_anatomic_site:
            type: keyword
          participant_age_at_collection:
            type: integer
          sample_tumor_status:
            type: keyword
          tumor_classification:
            type: keyword
          assay_method:
            type: keyword
          file_type:
            type: keyword
          library_selection:
            type: keyword
          library_source:
            type: keyword
          library_strategy:
            type: keyword
      # Study-level columns (st.*, plus grant_id / institution collected from
      # study_funding and study_personnel nodes).
      study_id:
        type: keyword
      phs_accession:
        type: keyword
      grant_id:
        type: keyword
      institution:
        type: keyword
      study_acronym:
        type: keyword
      study_short_title:
        type: keyword
    # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch.
    #
    # Three sub-queries joined with UNION ALL; all of them return the same
    # column names, which are the keys listed under `mapping` above:
    #   1. Participant rows. sample_file_filters is assembled from four lists:
    #      samples linked through cell_line/pdx nodes, files attached to the
    #      participant's samples, clinical_measure_file nodes and
    #      radiology_file nodes attached to the participant.
    #   2. clinical_measure_file nodes attached directly to a study
    #      (participant columns are null).
    #   3. Samples reached from a study through cell_line/pdx nodes
    #      (participant columns are null).
    #
    # The 'Clinical data' / 'Radiology imaging' labels are guarded with the
    # searched form `CASE WHEN types_1 IS NULL`: a simple `CASE x WHEN null`
    # compares with `=`, and `null = null` is not true, so it cannot be relied
    # on to detect a missing file.
    #
    # The value is a double-quoted scalar, so YAML folds its line breaks into
    # single spaces. Do not add Cypher `//` comments inside the string: after
    # folding they would comment out the remainder of the query.
    cypher_query: "
      MATCH (p:participant)
      optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample)
      WHERE (cl: cell_line or cl: pdx)
      optional Match (sm2)<--(file)
      WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
      with p, case COLLECT(distinct sm1) when [] then []
                      else COLLECT(DISTINCT {
                              sample_anatomic_site: sm1.anatomic_site,
                              participant_age_at_collection: sm1.participant_age_at_collection,
                              sample_tumor_status: sm1.sample_tumor_status,
                              tumor_classification: sm1.tumor_classification,
                              assay_method: CASE LABELS(file)[0]
                                        WHEN 'sequencing_file' THEN 'Sequencing'
                                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                        WHEN 'pathology_file' THEN 'Pathology imaging'
                                        WHEN 'methylation_array_file' THEN 'Methylation array'
                                        ELSE null END,
                              file_type: CASE LABELS(file)[0]
                                        When null then null
                                        else file.file_type end,
                              library_selection: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_selection
                                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                            ELSE null END,
                              library_source: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_source
                                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                                            ELSE null END,
                              library_strategy: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_strategy
                                            WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                            ELSE null END
                          }) end AS sample1,
                          case COLLECT(distinct sm2)
                          when [] then []
                          else COLLECT(DISTINCT {
                              sample_anatomic_site: sm2.anatomic_site,
                              participant_age_at_collection: sm2.participant_age_at_collection,
                              sample_tumor_status: sm2.sample_tumor_status,
                              tumor_classification: sm2.tumor_classification,
                              assay_method: CASE LABELS(file)[0]
                                        WHEN 'sequencing_file' THEN 'Sequencing'
                                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                        WHEN 'pathology_file' THEN 'Pathology imaging'
                                        WHEN 'methylation_array_file' THEN 'Methylation array'
                                        ELSE null END,
                              file_type: CASE LABELS(file)[0]
                                        When null then null
                                        else file.file_type end,
                              library_selection: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_selection
                                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                            ELSE null END,
                              library_source: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_source
                                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                                            ELSE null END,
                              library_strategy: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_strategy
                                            WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                            ELSE null END
                          }) end AS sample2
      with p, apoc.coll.union(sample1,sample2) as cell_line_pdx_file_filters
      OPTIONAL MATCH (p)<-[:of_sample]-(sm:sample)<--(file)
      WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
      with p, cell_line_pdx_file_filters, COLLECT(DISTINCT {
                    sample_anatomic_site: sm.anatomic_site,
                    participant_age_at_collection: sm.participant_age_at_collection,
                    sample_tumor_status: sm.sample_tumor_status,
                    tumor_classification: sm.tumor_classification,
                    assay_method: CASE LABELS(file)[0]
                              WHEN 'sequencing_file' THEN 'Sequencing'
                              WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                              WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                              WHEN 'pathology_file' THEN 'Pathology imaging'
                              WHEN 'methylation_array_file' THEN 'Methylation array' END,
                    file_type: file.file_type,
                    library_selection: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN file.library_selection
                                  WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                  ELSE null END,
                    library_source: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN file.library_source
                                  WHEN 'single_cell_sequencing_file' THEN file.library_source
                                  ELSE null END,
                    library_strategy: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN file.library_strategy
                                  WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                  ELSE null END
                }) AS general_file_filters
      OPTIONAL Match (p)<-[:of_sample]-(sm:sample)
      OPTIONAL MATCH (p)<-[:of_clinical_measure_file]-(file1:clinical_measure_file)
      with p, cell_line_pdx_file_filters, general_file_filters,sm, COLLECT(DISTINCT file1.file_type) as file1_types
      UNWIND (case file1_types when [] then [null] else file1_types end)  AS types_1
      with p, cell_line_pdx_file_filters, general_file_filters, COLLECT(DISTINCT {
                sample_anatomic_site: sm.anatomic_site,
                participant_age_at_collection: sm.participant_age_at_collection,
                sample_tumor_status: sm.sample_tumor_status,
                tumor_classification: sm.tumor_classification,
                assay_method: CASE WHEN types_1 IS NULL THEN null ELSE 'Clinical data' END,
                file_type: types_1,
                library_selection: null,
                library_source: null,
                library_strategy: null
        }) as participant_clinical_measure_file_filters
      OPTIONAL Match (p)<-[:of_sample]-(sm:sample)
      OPTIONAL MATCH (p)<-[:of_radiology_file]-(file1:radiology_file)
      with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, sm, COLLECT(DISTINCT file1.file_type) as file1_types
      UNWIND (case file1_types when [] then [null] else file1_types end)  AS types_1
      with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, COLLECT(DISTINCT {
                sample_anatomic_site: sm.anatomic_site,
                participant_age_at_collection: sm.participant_age_at_collection,
                sample_tumor_status: sm.sample_tumor_status,
                tumor_classification: sm.tumor_classification,
                assay_method: CASE WHEN types_1 IS NULL THEN null ELSE 'Radiology imaging' END,
                file_type: types_1,
                library_selection: null,
                library_source: null,
                library_strategy: null
        }) as participant_radiology_file_filters
      OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
      OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
      OPTIONAL MATCH (st:study)<-[:of_participant]-(p)
      OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
      OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
      WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, participant_radiology_file_filters, fu, st, stf, stp, dg
      RETURN DISTINCT
        p.id as id,
        p.id as pid,
        p.participant_id as participant_id,
        apoc.text.split(p.race, ';') as race,
        p.sex_at_birth as sex_at_birth,
        apoc.text.split(p.ethnicity, ';') as ethnicity,
        COLLECT(DISTINCT {
            age_at_diagnosis: dg.age_at_diagnosis,
            diagnosis_anatomic_site: dg.anatomic_site,
            disease_phase: dg.disease_phase,
            diagnosis_classification_system: dg.diagnosis_classification_system,
            diagnosis_verification_status: dg.diagnosis_verification_status,
            diagnosis_basis: dg.diagnosis_basis,
            diagnosis_comment: dg.diagnosis_comment,
            diagnosis_classification: dg.diagnosis_classification
        }) AS diagnosis_filters,
        COLLECT(DISTINCT fu.vital_status) as vital_status,
        apoc.coll.union(cell_line_pdx_file_filters, general_file_filters) + participant_clinical_measure_file_filters + participant_radiology_file_filters AS sample_file_filters,
        st.study_id as study_id,
        st.phs_accession as phs_accession,
        COLLECT(DISTINCT stf.grant_id) as grant_id,
        COLLECT(DISTINCT stp.institution) as institution,
        st.study_acronym as study_acronym,
        st.study_short_title as study_short_title
        Union All
        MATCH (st:study)
        MATCH (st)<-[:of_clinical_measure_file]-(file:clinical_measure_file)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        WITH file, st, stf, stp
        RETURN DISTINCT
          null as id,
          null as pid,
          null as participant_id,
          null as race,
          null as sex_at_birth,
          null as ethnicity,
          [] AS diagnosis_filters,
          null as vital_status,
          COLLECT(DISTINCT {
              sample_anatomic_site: null,
              participant_age_at_collection: null,
              sample_tumor_status: null,
              tumor_classification: null,
              assay_method: 'Clinical data',
              file_type: file.file_type,
              library_selection: null,
              library_source: null,
              library_strategy: null
          }) AS sample_file_filters,
          st.study_id as study_id,
          st.phs_accession as phs_accession,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title
        Union All
        MATCH (st:study)
        MATCH (st)<-[:of_cell_line|of_pdx]-(cl)<--(sm:sample)
        Where (cl: cell_line or cl: pdx)
        optional Match (sm)<--(file)
        WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        WITH file, sm, st, stf, stp
        RETURN DISTINCT
          null as id,
          null as pid,
          null as participant_id,
          null as race,
          null as sex_at_birth,
          null as ethnicity,
          [] AS diagnosis_filters,
          null as vital_status,
          COLLECT(DISTINCT {
              sample_anatomic_site: sm.anatomic_site,
              participant_age_at_collection: sm.participant_age_at_collection,
              sample_tumor_status: sm.sample_tumor_status,
              tumor_classification: sm.tumor_classification,
              assay_method: CASE LABELS(file)[0]
                              WHEN 'sequencing_file' THEN 'Sequencing'
                              WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                              WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                              WHEN 'pathology_file' THEN 'Pathology imaging'
                              WHEN 'methylation_array_file' THEN 'Methylation array'
                              ELSE null END,
              file_type: CASE LABELS(file)[0]
                        When null then null
                        ELSE file.file_type end,
              library_selection: CASE LABELS(file)[0]
                        WHEN 'sequencing_file' THEN file.library_selection
                        WHEN 'single_cell_sequencing_file' THEN file.library_selection
                        ELSE null END,
              library_source: CASE LABELS(file)[0]
                        WHEN 'sequencing_file' THEN file.library_source
                        WHEN 'single_cell_sequencing_file' THEN file.library_source
                        ELSE null END,
              library_strategy: CASE LABELS(file)[0]
                        WHEN 'sequencing_file' THEN file.library_strategy
                        WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                        ELSE null END
          }) AS sample_file_filters,
          st.study_id as study_id,
          st.phs_accession as phs_accession,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title
      "

  # Elasticsearch index fed from Neo4j by the cypher_query defined below.
  - index_name: participants
    type: neo4j
    # Field mapping; every key here matches a column returned by the query.
    mapping:
      id:
        type: keyword
      participant_id:
        type: keyword
      # race / ethnicity hold the value split on ';' by the query, while
      # race_str / ethnicity_str hold the unsplit p.race / p.ethnicity value.
      race:
        type: keyword
      race_str:
        type: keyword
      sex_at_birth:
        type: keyword
      ethnicity:
        type: keyword
      ethnicity_str:
        type: keyword
      alternate_participant_id:
        type: keyword
      # Nested objects built by the query from diagnosis nodes (dg.*).
      diagnosis_filters:
        type: nested
        properties:
          age_at_diagnosis:
            type: integer
          diagnosis_anatomic_site:
            type: keyword
          disease_phase:
            type: keyword
          diagnosis_classification_system:
            type: keyword
          diagnosis_verification_status:
            type: keyword
          diagnosis_basis:
            type: keyword
          diagnosis_comment:
            type: keyword
          diagnosis_classification:
            type: keyword
      vital_status:
        type: keyword
      # Nested objects combining sample attributes with the attributes of a
      # file; the query concatenates several collected lists into this field.
      sample_file_filters:
        type: nested
        properties:
          sample_anatomic_site:
            type: keyword
          participant_age_at_collection:
            type: integer
          sample_tumor_status:
            type: keyword
          tumor_classification:
            type: keyword
          assay_method:
            type: keyword
          file_type:
            type: keyword
          library_selection:
            type: keyword
          library_source:
            type: keyword
          library_strategy:
            type: keyword
      # Study-level columns (st.*, plus grant_id / institution collected from
      # study_funding and study_personnel nodes).
      study_id:
        type: keyword
      phs_accession:
        type: keyword
      grant_id:
        type: keyword
      institution:
        type: keyword
      study_acronym:
        type: keyword
      study_short_title:
        type: keyword
      # file_count is COUNT(DISTINCT file.id); files is the matching list of
      # distinct file ids, indexed as text with a `keyword` sub-field.
      file_count:
        type: integer
      files:
        type: text
        fields:
          keyword:
            type: keyword
    # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch.
    #
    # A single query over participant nodes. sample_file_filters is assembled
    # from four lists: samples linked through cell_line/pdx nodes, files
    # attached to the participant's samples, clinical_measure_file nodes and
    # radiology_file nodes attached to the participant. file_count / files
    # are taken from file nodes that reach the participant over at most four
    # incoming relationships (`(p)<-[*..4]-(file)`).
    #
    # The 'Clinical data' / 'Radiology imaging' labels are guarded with the
    # searched form `CASE WHEN types_1 IS NULL`: a simple `CASE x WHEN null`
    # compares with `=`, and `null = null` is not true, so it cannot be relied
    # on to detect a missing file.
    #
    # The value is a double-quoted scalar, so YAML folds its line breaks into
    # single spaces. Do not add Cypher `//` comments inside the string: after
    # folding they would comment out the remainder of the query.
    cypher_query: "
      MATCH (p:participant)
      optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample)
      WHERE (cl: cell_line or cl: pdx)
      optional Match (sm2)<--(file)
      WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
      with p, case COLLECT(distinct sm1) when [] then []
                      else COLLECT(DISTINCT {
                              sample_anatomic_site: sm1.anatomic_site,
                              participant_age_at_collection: sm1.participant_age_at_collection,
                              sample_tumor_status: sm1.sample_tumor_status,
                              tumor_classification: sm1.tumor_classification,
                              assay_method: CASE LABELS(file)[0]
                                        WHEN 'sequencing_file' THEN 'Sequencing'
                                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                        WHEN 'pathology_file' THEN 'Pathology imaging'
                                        WHEN 'methylation_array_file' THEN 'Methylation array'
                                        ELSE null END,
                              file_type: CASE LABELS(file)[0]
                                        When null then null
                                        else file.file_type end,
                              library_selection: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_selection
                                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                            ELSE null END,
                              library_source: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_source
                                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                                            ELSE null END,
                              library_strategy: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_strategy
                                            WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                            ELSE null END
                          }) end AS sample1,
                          case COLLECT(distinct sm2)
                          when [] then []
                          else COLLECT(DISTINCT {
                              sample_anatomic_site: sm2.anatomic_site,
                              participant_age_at_collection: sm2.participant_age_at_collection,
                              sample_tumor_status: sm2.sample_tumor_status,
                              tumor_classification: sm2.tumor_classification,
                              assay_method: CASE LABELS(file)[0]
                                        WHEN 'sequencing_file' THEN 'Sequencing'
                                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                        WHEN 'pathology_file' THEN 'Pathology imaging'
                                        WHEN 'methylation_array_file' THEN 'Methylation array'
                                        ELSE null END,
                              file_type: CASE LABELS(file)[0]
                                        When null then null
                                        else file.file_type end,
                              library_selection: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_selection
                                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                            ELSE null END,
                              library_source: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_source
                                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                                            ELSE null END,
                              library_strategy: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_strategy
                                            WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                            ELSE null END
                          }) end AS sample2
      with p, apoc.coll.union(sample1,sample2) as cell_line_pdx_file_filters
      OPTIONAL MATCH (p)<-[:of_sample]-(sm:sample)<--(file)
      WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
      with p, cell_line_pdx_file_filters, COLLECT(DISTINCT {
                    sample_anatomic_site: sm.anatomic_site,
                    participant_age_at_collection: sm.participant_age_at_collection,
                    sample_tumor_status: sm.sample_tumor_status,
                    tumor_classification: sm.tumor_classification,
                    assay_method: CASE LABELS(file)[0]
                              WHEN 'sequencing_file' THEN 'Sequencing'
                              WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                              WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                              WHEN 'pathology_file' THEN 'Pathology imaging'
                              WHEN 'methylation_array_file' THEN 'Methylation array' END,
                    file_type: file.file_type,
                    library_selection: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN file.library_selection
                                  WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                  ELSE null END,
                    library_source: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN file.library_source
                                  WHEN 'single_cell_sequencing_file' THEN file.library_source
                                  ELSE null END,
                    library_strategy: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN file.library_strategy
                                  WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                  ELSE null END
                }) AS general_file_filters
      OPTIONAL Match (p)<-[:of_sample]-(sm:sample)
      OPTIONAL MATCH (p)<-[:of_clinical_measure_file]-(file1:clinical_measure_file)
      with p, cell_line_pdx_file_filters, general_file_filters,sm, COLLECT(DISTINCT file1.file_type) as file1_types
      UNWIND (case file1_types when [] then [null] else file1_types end)  AS types_1
      with p, cell_line_pdx_file_filters, general_file_filters, COLLECT(DISTINCT {
                sample_anatomic_site: sm.anatomic_site,
                participant_age_at_collection: sm.participant_age_at_collection,
                sample_tumor_status: sm.sample_tumor_status,
                tumor_classification: sm.tumor_classification,
                assay_method: CASE WHEN types_1 IS NULL THEN null ELSE 'Clinical data' END,
                file_type: types_1,
                library_selection: null,
                library_source: null,
                library_strategy: null
        }) as participant_clinical_measure_file_filters
      OPTIONAL Match (p)<-[:of_sample]-(sm:sample)
      OPTIONAL MATCH (p)<-[:of_radiology_file]-(file1:radiology_file)
      with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, sm, COLLECT(DISTINCT file1.file_type) as file1_types
      UNWIND (case file1_types when [] then [null] else file1_types end)  AS types_1
      with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, COLLECT(DISTINCT {
                sample_anatomic_site: sm.anatomic_site,
                participant_age_at_collection: sm.participant_age_at_collection,
                sample_tumor_status: sm.sample_tumor_status,
                tumor_classification: sm.tumor_classification,
                assay_method: CASE WHEN types_1 IS NULL THEN null ELSE 'Radiology imaging' END,
                file_type: types_1,
                library_selection: null,
                library_source: null,
                library_strategy: null
        }) as participant_radiology_file_filters
      OPTIONAL MATCH (p)<-[*..4]-(file)
      WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
      OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
      OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
      OPTIONAL MATCH (st:study)<-[:of_participant]-(p)
      OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
      OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
      WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, participant_radiology_file_filters, file, fu, st, stf, stp, dg
      RETURN DISTINCT
        p.id as id,
        p.participant_id as participant_id,
        apoc.text.split(p.race, ';') as race,
        p.race as race_str,
        p.sex_at_birth as sex_at_birth,
        apoc.text.split(p.ethnicity, ';') as ethnicity,
        p.ethnicity as ethnicity_str,
        p.alternate_participant_id as alternate_participant_id,
        COLLECT(DISTINCT {
            age_at_diagnosis: dg.age_at_diagnosis,
            diagnosis_anatomic_site: dg.anatomic_site,
            disease_phase: dg.disease_phase,
            diagnosis_classification_system: dg.diagnosis_classification_system,
            diagnosis_verification_status: dg.diagnosis_verification_status,
            diagnosis_basis: dg.diagnosis_basis,
            diagnosis_comment: dg.diagnosis_comment,
            diagnosis_classification: dg.diagnosis_classification
        }) AS diagnosis_filters,
        COLLECT(DISTINCT fu.vital_status) as vital_status,
        apoc.coll.union(cell_line_pdx_file_filters, general_file_filters) + participant_clinical_measure_file_filters + participant_radiology_file_filters AS sample_file_filters,
        st.study_id as study_id,
        st.phs_accession as phs_accession,
        COLLECT(DISTINCT stf.grant_id) as grant_id,
        COLLECT(DISTINCT stp.institution) as institution,
        st.study_acronym as study_acronym,
        st.study_short_title as study_short_title,
        COUNT(DISTINCT file.id) as file_count,
        COLLECT(DISTINCT file.id) as files
    "

  # Elasticsearch index fed from Neo4j by the cypher_query defined below.
  - index_name: diagnosis
    type: neo4j
    # Field mapping for the diagnosis index.
    mapping:
      id:
        type: keyword
      pid:
        type: keyword
      diagnosis_id:
        type: keyword
      participant_id:
        type: keyword
      # Diagnosis attributes are top-level fields in this index.
      diagnosis_classification:
        type: keyword
      disease_phase:
        type: keyword
      diagnosis_classification_system:
        type: keyword
      diagnosis_verification_status:
        type: keyword
      diagnosis_basis:
        type: keyword
      diagnosis_comment:
        type: keyword
      diagnosis_anatomic_site:
        type: keyword
      age_at_diagnosis:
        type: integer
      race:
        type: keyword
      sex_at_birth:
        type: keyword
      ethnicity:
        type: keyword
      phs_accession:
        type: keyword
      study_id:
        type: keyword
      study_acronym:
        type: keyword
      study_short_title:
        type: keyword
      last_vital_status:
        type: keyword
      vital_status:
        type: keyword
      # Nested objects with the same properties as sample_file_filters in the
      # other indices of this file.
      sample_file_filters:
        type: nested
        properties:
          sample_anatomic_site:
            type: keyword
          participant_age_at_collection:
            type: integer
          sample_tumor_status:
            type: keyword
          tumor_classification:
            type: keyword
          assay_method:
            type: keyword
          file_type:
            type: keyword
          library_selection:
            type: keyword
          library_source:
            type: keyword
          library_strategy:
            type: keyword
      grant_id:
        type: keyword
      institution:
        type: keyword
      # Indexed as text with a `keyword` sub-field.
      files:
        type: text
        fields:
          keyword:
            type: keyword
    # Cypher query used to retrieve data from Neo4j and index it into Elasticsearch.
    # The value is a double-quoted YAML scalar, so its line breaks are folded into
    # single spaces: do not add Cypher `//` comments inside it.
    # Stages, in order:
    #   1. cell_line_pdx_file_filters: for samples linked through a cell_line or
    #      pdx node, pair the sample attributes (sm1 and sm2) with the files
    #      attached to sm2; each list is [] when no such sample exists.
    #   2. general_file_filters: samples attached directly to the participant and
    #      the files attached to those samples.
    #   3. participant_clinical_measure_file_filters and
    #      participant_radiology_file_filters: every participant sample combined
    #      with each distinct file_type of the participant-level clinical measure
    #      or radiology files.
    #   4. One row per diagnosis (dg) of the participant. The diagnosis MATCH
    #      clauses are mandatory, so a participant without a diagnosis yields no
    #      document.
    #   5. RETURN: dg.id is the document id; sample_file_filters concatenates the
    #      lists built in stages 1-3.
    # NOTE(review): last_vital_status takes the head of the collected vital
    # statuses, so it depends on the earlier `order by fu.age_at_follow_up desc`
    # still being in effect at RETURN time - confirm.
    cypher_query: "
        MATCH (p:participant)
        optional MATCH (p)<-[:of_sample]-(sm1:sample)<--(cl)<--(sm2:sample)
        WHERE (cl: cell_line or cl: pdx)
        optional Match (sm2)<--(file)
        WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file) 
        with p, case COLLECT(distinct sm1) when [] then []
                      else COLLECT(DISTINCT {
                              sample_anatomic_site: sm1.anatomic_site,
                              participant_age_at_collection: sm1.participant_age_at_collection,
                              sample_tumor_status: sm1.sample_tumor_status,
                              tumor_classification: sm1.tumor_classification,
                              assay_method: CASE LABELS(file)[0]
                                        WHEN 'sequencing_file' THEN 'Sequencing'
                                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                        WHEN 'pathology_file' THEN 'Pathology imaging'
                                        WHEN 'methylation_array_file' THEN 'Methylation array'
                                        ELSE null END,
                              file_type: CASE LABELS(file)[0]
                                        When null then null
                                        else file.file_type end,
                              library_selection: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_selection
                                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                            ELSE null END,
                              library_source: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_source
                                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                                            ELSE null END,
                              library_strategy: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_strategy
                                            WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                            ELSE null END
                          }) end AS sample1,
                          case COLLECT(distinct sm2) 
                          when [] then []
                          else COLLECT(DISTINCT {
                              sample_anatomic_site: sm2.anatomic_site,
                              participant_age_at_collection: sm2.participant_age_at_collection,
                              sample_tumor_status: sm2.sample_tumor_status,
                              tumor_classification: sm2.tumor_classification,
                              assay_method: CASE LABELS(file)[0]
                                        WHEN 'sequencing_file' THEN 'Sequencing'
                                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                        WHEN 'pathology_file' THEN 'Pathology imaging'
                                        WHEN 'methylation_array_file' THEN 'Methylation array'
                                        ELSE null END,
                              file_type: CASE LABELS(file)[0]
                                        When null then null
                                        else file.file_type end,
                              library_selection: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_selection
                                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                            ELSE null END,
                              library_source: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_source
                                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                                            ELSE null END,
                              library_strategy: CASE LABELS(file)[0]
                                            WHEN 'sequencing_file' THEN file.library_strategy
                                            WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                            ELSE null END
                          }) end AS sample2
        with p, apoc.coll.union(sample1,sample2) as cell_line_pdx_file_filters
        OPTIONAL MATCH (p)<-[:of_sample]-(sm:sample)<--(file)
        WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        with p, cell_line_pdx_file_filters, COLLECT(DISTINCT {
                      sample_anatomic_site: sm.anatomic_site,
                      participant_age_at_collection: sm.participant_age_at_collection,
                      sample_tumor_status: sm.sample_tumor_status,
                      tumor_classification: sm.tumor_classification,
                      assay_method: CASE LABELS(file)[0]
                                WHEN 'sequencing_file' THEN 'Sequencing'
                                WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                WHEN 'pathology_file' THEN 'Pathology imaging'
                                WHEN 'methylation_array_file' THEN 'Methylation array' END,
                      file_type: file.file_type,
                      library_selection: CASE LABELS(file)[0]
                                    WHEN 'sequencing_file' THEN file.library_selection
                                    WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                    ELSE null END,
                      library_source: CASE LABELS(file)[0]
                                    WHEN 'sequencing_file' THEN file.library_source
                                    WHEN 'single_cell_sequencing_file' THEN file.library_source
                                    ELSE null END,
                      library_strategy: CASE LABELS(file)[0]
                                    WHEN 'sequencing_file' THEN file.library_strategy
                                    WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                    ELSE null END
                  }) AS general_file_filters
        OPTIONAL Match (p)<-[:of_sample]-(sm:sample)
        OPTIONAL MATCH (p)<-[:of_clinical_measure_file]-(file1:clinical_measure_file)
        with p, cell_line_pdx_file_filters, general_file_filters,sm, COLLECT(DISTINCT file1.file_type) as file1_types
        UNWIND (case file1_types when [] then [null] else file1_types end)  AS types_1
        with p, cell_line_pdx_file_filters, general_file_filters, COLLECT(DISTINCT {
                  sample_anatomic_site: sm.anatomic_site,
                  participant_age_at_collection: sm.participant_age_at_collection,
                  sample_tumor_status: sm.sample_tumor_status,
                  tumor_classification: sm.tumor_classification,
                  assay_method: CASE types_1 when null then null else 'Clinical data' end,
                  file_type: types_1,
                  library_selection: null,
                  library_source: null,
                  library_strategy: null
          }) as participant_clinical_measure_file_filters
        OPTIONAL Match (p)<-[:of_sample]-(sm:sample)
        OPTIONAL MATCH (p)<-[:of_radiology_file]-(file1:radiology_file)
        with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, sm, COLLECT(DISTINCT file1.file_type) as file1_types
        UNWIND (case file1_types when [] then [null] else file1_types end)  AS types_1
        with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters, COLLECT(DISTINCT {
                  sample_anatomic_site: sm.anatomic_site,
                  participant_age_at_collection: sm.participant_age_at_collection,
                  sample_tumor_status: sm.sample_tumor_status,
                  tumor_classification: sm.tumor_classification,
                  assay_method: CASE types_1 when null then null else 'Radiology imaging' end,
                  file_type: types_1,
                  library_selection: null,
                  library_source: null,
                  library_strategy: null
          }) as participant_radiology_file_filters
        MATCH (dg:diagnosis)
        MATCH (p)<-[:of_diagnosis]-(dg)
        OPTIONAL MATCH (p)<-[*..4]-(file)
        WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        with p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, dg, file, fu order by fu.age_at_follow_up desc
        OPTIONAL MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        WITH p, cell_line_pdx_file_filters, general_file_filters, participant_clinical_measure_file_filters,participant_radiology_file_filters, file, fu, st, stf, stp, dg
        RETURN DISTINCT
          dg.id as id,
          p.id as pid,
          dg.diagnosis_id as diagnosis_id,
          dg.diagnosis_classification as diagnosis_classification,
          dg.disease_phase as disease_phase,
          dg.diagnosis_classification_system as diagnosis_classification_system,
          dg.diagnosis_verification_status as diagnosis_verification_status,
          dg.diagnosis_basis as diagnosis_basis,
          dg.diagnosis_comment as diagnosis_comment,
          dg.anatomic_site as diagnosis_anatomic_site,
          dg.age_at_diagnosis as age_at_diagnosis,
          p.participant_id as participant_id,
          apoc.text.split(p.race, ';') as race,
          p.sex_at_birth as sex_at_birth,
          apoc.text.split(p.ethnicity, ';') as ethnicity,
          st.study_id as study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          head(collect(distinct fu.vital_status)) as last_vital_status,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          apoc.coll.union(cell_line_pdx_file_filters, general_file_filters) + participant_clinical_measure_file_filters + participant_radiology_file_filters AS sample_file_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          COLLECT(DISTINCT file.id) as files
       "

  - index_name: studies
    type: neo4j
    # Field definitions for the studies index. Fields that carry a single
    # attribute are written as compact flow mappings.
    mapping:
      id: {type: keyword}
      study_id: {type: keyword}
      grant_id: {type: keyword}
      pubmed_ids: {type: keyword}
      phs_accession: {type: keyword}
      study_short_title: {type: keyword}
      study_acronym: {type: keyword}
      PIs: {type: keyword}
      num_of_participants: {type: integer}
      # Text fields that also expose a `keyword` sub-field.
      diagnosis_cancer:
        type: text
        fields:
          keyword: {type: keyword}
      diagnosis_anatomic_site:
        type: text
        fields:
          keyword: {type: keyword}
      file_types:
        type: text
        fields:
          keyword: {type: keyword}
      num_of_samples: {type: integer}
      num_of_files: {type: integer}
      files:
        type: text
        fields:
          keyword: {type: keyword}
    # Cypher query used to retrieve data from Neo4j and index it into Elasticsearch.
    # The value is a double-quoted YAML scalar, so its line breaks are folded into
    # single spaces: do not add Cypher `//` comments inside it.
    # Stages, in order (each WITH carries the earlier results forward):
    #   1. num_p: number of participants per study.
    #   2. cancers / sites: diagnosis_classification and diagnosis anatomic_site
    #      values formatted as 'value (count)', ordered by count descending.
    #   3. file_types: file_type values of the file nodes found within five
    #      incoming hops of the study, formatted the same way; num_files is the
    #      sum of those counts.
    #   4. num_samples: distinct sample_id values of samples attached to a
    #      participant, cell_line or pdx of the study, plus those of samples
    #      linked to a participant sample through a cell_line or pdx node.
    #   5. RETURN: grant ids, PubMed ids and the names of personnel whose
    #      personnel_type is 'PI' are each joined with ';'.
    # Stages 1-3 use mandatory MATCH clauses, so a study without participants,
    # diagnoses or files yields no document.
    cypher_query: "
        MATCH (st:study)<-[:of_participant]-(p:participant)
        with st, count(p) as num_p
        MATCH (st:study)<-[:of_participant]-(participant)<-[:of_diagnosis]-(dg:diagnosis)
        with st, num_p, dg.diagnosis_classification as dg_cancers, count(dg.diagnosis_classification) as num_cancers
        ORDER BY num_cancers desc
        with st, num_p, collect(dg_cancers + ' (' + toString(num_cancers) + ')') as cancers
        MATCH (st)<-[:of_participant]-(pa:participant)<-[:of_diagnosis]-(diag:diagnosis)
        with st, num_p, cancers, diag.anatomic_site as dg_sites, count(diag.anatomic_site) as num_sites
        ORDER BY num_sites desc
        with st, num_p, cancers, collect(dg_sites + ' (' + toString(num_sites) + ')') as sites
        MATCH (st)<-[*..5]-(fl)
        WHERE (fl:clinical_measure_file OR fl: sequencing_file OR fl:pathology_file OR fl:radiology_file OR fl:methylation_array_file OR fl:single_cell_sequencing_file OR fl:cytogenomic_file)
        with st, num_p, cancers, sites, fl.file_type as ft, count(fl.file_type) as num_ft
        ORDER BY num_ft desc
        with st, num_p, cancers, sites, collect(ft + ' (' + toString(num_ft) + ')') as file_types, sum(num_ft) as num_files
        OPTIONAL MATCH (st)<-[:of_participant|of_cell_line|of_pdx]-(pcp)<-[:of_sample]-(sm1:sample)
        WHERE (pcp:participant or pcp:cell_line or pcp:pdx)
        WITH st, num_p, cancers, sites, file_types, num_files, count(distinct sm1.sample_id) as num_samples_1
        OPTIONAL MATCH (st)<-[:of_participant]-(participant)<-[:of_sample]-(sm1:sample)<--(cp)<--(sm2:sample)
        WHERE (cp:cell_line or cp:pdx)
        WITH st, num_p, cancers, sites, file_types, num_files, num_samples_1, count(distinct sm2.sample_id) as num_samples_2
        WITH st, num_p, cancers, sites, file_types, num_files, num_samples_1 + num_samples_2 as num_samples
        MATCH (st)<-[*..5]-(file)
        WHERE (file:clinical_measure_file OR file: sequencing_file OR file:pathology_file OR file:radiology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        OPTIONAL MATCH (st)<-[:of_publication]-(pub:publication)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        WHERE stp.personnel_type = 'PI'
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        WITH st, num_p, cancers, sites, file_types, num_files, num_samples, file.id as file_id, stf, stp, pub
        RETURN DISTINCT
          st.id as id,
          st.study_id as study_id,
          apoc.text.join(COLLECT(DISTINCT stf.grant_id), ';') as grant_id,
          apoc.text.join(COLLECT(DISTINCT pub.pubmed_id), ';') as pubmed_ids,
          st.phs_accession as phs_accession,
          st.study_short_title as study_short_title,
          st.study_acronym as study_acronym,
          apoc.text.join(COLLECT(DISTINCT stp.personnel_name), ';') as PIs,
          num_p as num_of_participants,
          cancers as diagnosis_cancer,
          sites as diagnosis_anatomic_site,
          file_types as file_types,
          num_samples as num_of_samples,
          num_files as num_of_files,
          COLLECT(DISTINCT file_id) as files
       "

  - index_name: samples
    type: neo4j
    # Field definitions for the samples index.
    mapping:
      # Identifiers: id/sample_id come from the sample node, pid/participant_id
      # from the participant node.
      id:
        type: keyword
      pid:
        type: keyword
      sample_id:
        type: keyword
      participant_id:
        type: keyword
      # Participant demographics.
      race:
        type: keyword
      sex_at_birth:
        type: keyword
      ethnicity:
        type: keyword
      # Attributes read from the sample node itself.
      sample_anatomic_site:
        type: keyword
      sample_diagnosis_classification:
        type: keyword
      sample_diagnosis_classification_system:
        type: keyword
      sample_diagnosis_verification_status:
        type: keyword
      sample_diagnosis_basis:
        type: keyword
      sample_diagnosis_comment:
        type: keyword
      participant_age_at_collection:
        type: integer
      sample_tumor_status:
        type: keyword
      tumor_classification:
        type: keyword
      # Study attributes.
      study_id:
        type: keyword
      phs_accession:
        type: keyword
      study_acronym:
        type: keyword
      study_short_title:
        type: keyword
      # One nested object per distinct diagnosis attribute combination
      # collected by the query.
      diagnosis_filters:
        type: nested
        properties:
          age_at_diagnosis:
            type: integer
          diagnosis_anatomic_site:
            type: keyword
          disease_phase:
            type: keyword
          diagnosis_classification_system:
            type: keyword
          diagnosis_verification_status:
            type: keyword
          diagnosis_basis:
            type: keyword
          diagnosis_comment:
            type: keyword
          diagnosis_classification:
            type: keyword
      vital_status:
        type: keyword
      # One nested object per distinct file attribute combination collected
      # by the query.
      file_filters:
        type: nested
        properties:
          assay_method:
            type: keyword
          file_type:
            type: keyword
          library_selection:
            type: keyword
          library_source:
            type: keyword
          library_strategy:
            type: keyword
      # Study funding grant ids and study personnel institutions.
      grant_id:
        type: keyword
      institution:
        type: keyword
      # Number and ids of the distinct files matched for the sample.
      file_count:
        type: integer
      files:
        type: text
        fields:
          keyword:
            type: keyword
    # Cypher query used to retrieve data from Neo4j and index it into Elasticsearch.
    # The value is a double-quoted YAML scalar, so its line breaks are folded into
    # single spaces: do not add Cypher `//` comments inside it.
    # Two result sets are concatenated with `union all`:
    #   1. Samples that reach a participant within three hops along relationships
    #      pointing toward the participant, where that participant belongs to a
    #      study. Files are searched within three incoming hops of the sample.
    #   2. Samples attached to a cell_line or pdx node that is itself attached to
    #      a study. Participant columns are returned as null; diagnosis and
    #      follow-up values are collected from every participant of that study,
    #      and the study must have at least one participant (mandatory MATCH).
    # NOTE(review): only the second branch returns [] for file_filters when the
    # sample has no file; the first branch has no such guard and collects a map
    # of null values instead - confirm whether that difference is intended.
    cypher_query: "
        MATCH (sm:sample)
        OPTIONAL MATCH (p:participant)<-[*..3]-(sm)
        optional match (sm)<-[*..3]-(file)
        WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        WITH file, fu, p, st, sm, stf, stp, dg
        RETURN DISTINCT
          sm.id as id,
          p.id as pid,
          sm.sample_id as sample_id,
          p.participant_id as participant_id,
          apoc.text.split(p.race, ';') as race,
          p.sex_at_birth as sex_at_birth,
          apoc.text.split(p.ethnicity, ';') as ethnicity,
          sm.anatomic_site as sample_anatomic_site,
          sm.diagnosis_classification as sample_diagnosis_classification,
          sm.diagnosis_classification_system as sample_diagnosis_classification_system,
          sm.diagnosis_verification_status as sample_diagnosis_verification_status,
          sm.diagnosis_basis as sample_diagnosis_basis,
          sm.diagnosis_comment as sample_diagnosis_comment,
          sm.participant_age_at_collection as participant_age_at_collection,
          sm.sample_tumor_status as sample_tumor_status,
          sm.tumor_classification as tumor_classification,
          st.study_id as study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          COLLECT(DISTINCT {
              assay_method: CASE LABELS(file)[0]
                        WHEN 'sequencing_file' THEN 'Sequencing'
                        WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                        WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                        WHEN 'pathology_file' THEN 'Pathology imaging'
                        WHEN 'methylation_array_file' THEN 'Methylation array' 
                        ELSE null END,
              file_type: file.file_type,
              library_selection: CASE LABELS(file)[0]
                            WHEN 'sequencing_file' THEN file.library_selection
                            WHEN 'single_cell_sequencing_file' THEN file.library_selection
                            ELSE null END,
              library_source: CASE LABELS(file)[0]
                            WHEN 'sequencing_file' THEN file.library_source
                            WHEN 'single_cell_sequencing_file' THEN file.library_source
                            ELSE null END,
              library_strategy: CASE LABELS(file)[0]
                            WHEN 'sequencing_file' THEN file.library_strategy
                             WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                            ELSE null END
          }) AS file_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          COUNT(DISTINCT file.id) as file_count,
          COLLECT(DISTINCT file.id) as files
        union all
        MATCH (sm:sample)
        MATCH (st:study)<-[:of_cell_line|of_pdx]-(cl)<--(sm)
        Where (cl:cell_line or cl:pdx)
        optional Match (sm)<--(file)
        WHERE (file: sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        MATCH (st)<-[:of_participant]-(p:participant)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        WITH sm, file, fu, st, stf, stp, dg
        RETURN DISTINCT
          sm.id as id,
          null as pid,
          sm.sample_id as sample_id,
          null as participant_id,
          null as race,
          null as sex_at_birth,
          null as ethnicity,
          sm.anatomic_site as sample_anatomic_site,
          sm.diagnosis_classification as sample_diagnosis_classification,
          sm.diagnosis_classification_system as sample_diagnosis_classification_system,
          sm.diagnosis_verification_status as sample_diagnosis_verification_status,
          sm.diagnosis_basis as sample_diagnosis_basis,
          sm.diagnosis_comment as sample_diagnosis_comment,
          sm.participant_age_at_collection as participant_age_at_collection,
          sm.sample_tumor_status as sample_tumor_status,
          sm.tumor_classification as tumor_classification,
          st.study_id as study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          CASE COLLECT(file) WHEN [] THEN []
                    ELSE COLLECT(DISTINCT {
                        assay_method: CASE LABELS(file)[0]
                                  WHEN 'sequencing_file' THEN 'Sequencing'
                                  WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                                  WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                                  WHEN 'pathology_file' THEN 'Pathology imaging'
                                  WHEN 'methylation_array_file' THEN 'Methylation array' 
                                  ELSE null END,
                        file_type: file.file_type,
                        library_selection: CASE LABELS(file)[0]
                                      WHEN 'sequencing_file' THEN file.library_selection
                                      WHEN 'single_cell_sequencing_file' THEN file.library_selection
                                      ELSE null END,
                        library_source: CASE LABELS(file)[0]
                                      WHEN 'sequencing_file' THEN file.library_source
                                      WHEN 'single_cell_sequencing_file' THEN file.library_source
                                      ELSE null END,
                        library_strategy: CASE LABELS(file)[0]
                                      WHEN 'sequencing_file' THEN file.library_strategy
                                      WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                                      ELSE null END
                    }) END AS file_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          COUNT(DISTINCT file.id) as file_count,
          COLLECT(DISTINCT file.id) as files
      "

  - index_name: files
    type: neo4j
    # Field definitions for the files index.
    mapping:
      # File identity and descriptive attributes.
      id:
        type: keyword
      pid:
        type: keyword
      file_id:
        type: keyword
      guid:
        type: keyword
      file_name:
        type: keyword
      file_category:
        type: keyword
      file_type:
        type: keyword
      file_description:
        type: keyword
      file_size:
        type: long
      md5sum:
        type: keyword
      # Study, participant and sample the file is reported under.
      study_id:
        type: keyword
      phs_accession:
        type: keyword
      study_acronym:
        type: keyword
      study_short_title:
        type: keyword
      participant_id:
        type: keyword
      sample_id:
        type: keyword
      files:
        type: text
        fields:
          keyword:
            type: keyword
      # Nested bundle that keeps participant attributes together with that
      # bundle's own nested diagnosis and sample lists.
      combined_filters:
        type: nested
        properties:
          participant_id:
            type: keyword
          race:
            type: keyword
          sex_at_birth:
            type: keyword
          ethnicity:
            type: keyword
          diagnosis:
            type: nested
            properties:
              age_at_diagnosis:
                type: integer
              diagnosis_anatomic_site:
                type: keyword
              disease_phase:
                type: keyword
              diagnosis_classification_system:
                type: keyword
              diagnosis_verification_status:
                type: keyword
              diagnosis_basis:
                type: keyword
              diagnosis_comment:
                type: keyword
              diagnosis_classification:
                type: keyword
          vital_status:
            type: keyword
          samples:
            type: nested
            properties:
              sample_anatomic_site:
                type: keyword
              participant_age_at_collection:
                type: integer
              sample_tumor_status:
                type: keyword
              tumor_classification:
                type: keyword
      # Flat (per-document) nested filter lists.
      participant_filters:
        type: nested
        properties:
          race:
            type: keyword
          sex_at_birth:
            type: keyword
          ethnicity:
            type: keyword
      diagnosis_filters:
        type: nested
        properties:
          age_at_diagnosis:
            type: integer
          diagnosis_anatomic_site:
            type: keyword
          disease_phase:
            type: keyword
          diagnosis_classification_system:
            type: keyword
          diagnosis_verification_status:
            type: keyword
          diagnosis_basis:
            type: keyword
          diagnosis_comment:
            type: keyword
          diagnosis_classification:
            type: keyword
      vital_status:
        type: keyword
      sample_filters:
        type: nested
        properties:
          sample_anatomic_site:
            type: keyword
          participant_age_at_collection:
            type: integer
          sample_tumor_status:
            type: keyword
          tumor_classification:
            type: keyword
      # Study funding grant ids and study personnel institutions.
      grant_id:
        type: keyword
      institution:
        type: keyword
      # Sequencing library attributes.
      library_selection:
        type: keyword
      library_source:
        type: keyword
      library_strategy:
        type: keyword
    # Cypher query will be used to retrieve data from Neo4j, and index into Elasticsearch
    cypher_query: "
        MATCH (file:clinical_measure_file)
        MATCH (p:participant)-[:of_clinical_measure_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_sample]-(sm:sample)
        OPTIONAL MATCH (st)<-[:of_publication]-(pub:publication)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        UNWIND apoc.text.split(p.ethnicity, ';') AS ethnicities
        UNWIND apoc.text.split(p.race, ';') AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.clinical_measure_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Clinical data' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          null AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          COLLECT(DISTINCT {
              sample_anatomic_site: sm.anatomic_site,
              participant_age_at_collection: sm.participant_age_at_collection,
              sample_tumor_status: sm.sample_tumor_status,
              tumor_classification: sm.tumor_classification
          }) AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,      
          null AS library_selection,
          null AS library_source,
          null AS library_strategy
        UNION ALL
        /* Clinical measure files attach to the study rather than to a participant or
           sample, so per-participant facets are nested under combined_filters and the
           flat participant, diagnosis, vital_status and sample filter columns are null.
           Keep comments in block form: YAML folds this quoted scalar onto one line, so
           a double-slash comment would swallow the rest of the query. */
        MATCH (file:clinical_measure_file)
        MATCH (st:study)<-[:of_clinical_measure_file]-(file)
        OPTIONAL MATCH (st)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_sample]-(sm:sample)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* p is optional and apoc.text.split returns null for a null input; UNWIND of
           null emits no rows and would drop the file, so fall back to one null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        WITH file, st, p, races, ethnicities, COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters, COLLECT(DISTINCT {
              sample_anatomic_site: sm.anatomic_site,
              participant_age_at_collection: sm.participant_age_at_collection,
              sample_tumor_status: sm.sample_tumor_status,
              tumor_classification: sm.tumor_classification
          }) AS sample_filters, COLLECT(DISTINCT fu.vital_status) as vital_status, stf, stp
        RETURN DISTINCT
          file.id as id,
          null as pid,
          file.clinical_measure_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Clinical data' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          null AS participant_id,
          null AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          COLLECT(DISTINCT {
              participant_id: p.participant_id,
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities,
              diagnosis: diagnosis_filters,
              vital_status: vital_status,
              samples: sample_filters
          }) as combined_filters,
          null as participant_filters,
          null as diagnosis_filters,
          null as vital_status,
          null as sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          null AS library_selection,
          null AS library_source,
          null AS library_strategy
        UNION ALL
        /* Methylation array files reached through a sample: sm is the sample the file is
           attached to and sm1 is the participant-level sample zero to two hops above it.
           When both carry the same sample_id only that sample is reported; otherwise the
           two ids are joined with a comma and both samples feed sample_filters. */
        MATCH (file:methylation_array_file)
        MATCH (p:participant)<-[:of_sample]-(sm1:sample)<-[*0..2]-(sm:sample)<-[:of_methylation_array_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* apoc.text.split returns null for a missing property and UNWIND of null emits
           no rows, which would drop the file; fall back to a single null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.methylation_array_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Methylation array' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          CASE sm1.sample_id WHEN sm.sample_id THEN sm.sample_id
                    ELSE sm1.sample_id + ',' + sm.sample_id END AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          CASE sm1.sample_id WHEN sm.sample_id THEN COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })
                    ELSE apoc.coll.union(COLLECT(DISTINCT {
                                      sample_anatomic_site: sm1.anatomic_site,
                                      participant_age_at_collection: sm1.participant_age_at_collection,
                                      sample_tumor_status: sm1.sample_tumor_status,
                                      tumor_classification: sm1.tumor_classification
                                  }), COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })) END AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          null AS library_selection,
          null AS library_source,
          null AS library_strategy
        UNION ALL
        /* Pathology files reached through a sample: sm is the sample the file is attached
           to and sm1 is the participant-level sample zero to two hops above it.
           When both carry the same sample_id only that sample is reported; otherwise the
           two ids are joined with a comma and both samples feed sample_filters.
           Library columns are null here, matching how the cell_line/pdx branch of this
           query treats pathology_file.
           NOTE(review): assumes pathology_file nodes carry no library_* properties - confirm. */
        MATCH (file:pathology_file)
        MATCH (p:participant)<-[:of_sample]-(sm1:sample)<-[*0..2]-(sm:sample)<-[:of_pathology_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* apoc.text.split returns null for a missing property and UNWIND of null emits
           no rows, which would drop the file; fall back to a single null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.pathology_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Pathology imaging' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          CASE sm1.sample_id WHEN sm.sample_id THEN sm.sample_id
                    ELSE sm1.sample_id + ',' + sm.sample_id END AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          CASE sm1.sample_id WHEN sm.sample_id THEN COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })
                    ELSE apoc.coll.union(COLLECT(DISTINCT {
                                      sample_anatomic_site: sm1.anatomic_site,
                                      participant_age_at_collection: sm1.participant_age_at_collection,
                                      sample_tumor_status: sm1.sample_tumor_status,
                                      tumor_classification: sm1.tumor_classification
                                  }), COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })) END AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          null AS library_selection,
          null AS library_source,
          null AS library_strategy
        UNION ALL
        /* Radiology files attach directly to the participant, so no single sample is
           resolved: sample_id is null and sample_filters gathers every sample of that
           participant. */
        MATCH (file:radiology_file)
        MATCH (p:participant)<-[:of_radiology_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_sample]-(sm:sample)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* apoc.text.split returns null for a missing property and UNWIND of null emits
           no rows, which would drop the file; fall back to a single null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.radiology_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Radiology imaging' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          null AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          COLLECT(DISTINCT {
              sample_anatomic_site: sm.anatomic_site,
              participant_age_at_collection: sm.participant_age_at_collection,
              sample_tumor_status: sm.sample_tumor_status,
              tumor_classification: sm.tumor_classification
          }) AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          null AS library_selection,
          null AS library_source,
          null AS library_strategy
        UNION ALL
        /* Single cell sequencing files reached through a sample: sm is the sample the file
           is attached to and sm1 is the participant-level sample zero to two hops above it.
           When both carry the same sample_id only that sample is reported; otherwise the
           two ids are joined with a comma and both samples feed sample_filters. */
        MATCH (file:single_cell_sequencing_file)
        MATCH (p:participant)<-[:of_sample]-(sm1:sample)<-[*0..2]-(sm:sample)<-[:of_single_cell_sequencing_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* apoc.text.split returns null for a missing property and UNWIND of null emits
           no rows, which would drop the file; fall back to a single null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.single_cell_sequencing_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Single Cell Sequencing' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          CASE sm1.sample_id WHEN sm.sample_id THEN sm.sample_id
                    ELSE sm1.sample_id + ',' + sm.sample_id END AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          CASE sm1.sample_id WHEN sm.sample_id THEN COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })
                    ELSE apoc.coll.union(COLLECT(DISTINCT {
                                      sample_anatomic_site: sm1.anatomic_site,
                                      participant_age_at_collection: sm1.participant_age_at_collection,
                                      sample_tumor_status: sm1.sample_tumor_status,
                                      tumor_classification: sm1.tumor_classification
                                  }), COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })) END AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          file.library_selection AS library_selection,
          file.library_source AS library_source,
          file.library_strategy AS library_strategy
        UNION ALL
        /* Sequencing files reached through a sample: sm is the sample the file is attached
           to and sm1 is the participant-level sample zero to two hops above it.
           When both carry the same sample_id only that sample is reported; otherwise the
           two ids are joined with a comma and both samples feed sample_filters. */
        MATCH (file:sequencing_file)
        MATCH (p:participant)<-[:of_sample]-(sm1:sample)<-[*0..2]-(sm:sample)<-[:of_sequencing_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* apoc.text.split returns null for a missing property and UNWIND of null emits
           no rows, which would drop the file; fall back to a single null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.sequencing_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Sequencing' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          CASE sm1.sample_id WHEN sm.sample_id THEN sm.sample_id
                    ELSE sm1.sample_id + ',' + sm.sample_id END AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          CASE sm1.sample_id WHEN sm.sample_id THEN COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })
                    ELSE apoc.coll.union(COLLECT(DISTINCT {
                                      sample_anatomic_site: sm1.anatomic_site,
                                      participant_age_at_collection: sm1.participant_age_at_collection,
                                      sample_tumor_status: sm1.sample_tumor_status,
                                      tumor_classification: sm1.tumor_classification
                                  }), COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })) END AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          file.library_selection AS library_selection,
          file.library_source AS library_source,
          file.library_strategy AS library_strategy
        UNION ALL
        /* Cytogenomic files reached through a sample: sm is the sample the file is attached
           to and sm1 is the participant-level sample zero to two hops above it.
           When both carry the same sample_id only that sample is reported; otherwise the
           two ids are joined with a comma and both samples feed sample_filters. */
        MATCH (file:cytogenomic_file)
        MATCH (p:participant)<-[:of_sample]-(sm1:sample)<-[*0..2]-(sm:sample)<-[:of_cytogenomic_file]-(file)
        MATCH (st:study)<-[:of_participant]-(p)
        OPTIONAL MATCH (p)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        /* apoc.text.split returns null for a missing property and UNWIND of null emits
           no rows, which would drop the file; fall back to a single null entry. */
        UNWIND coalesce(apoc.text.split(p.ethnicity, ';'), [null]) AS ethnicities
        UNWIND coalesce(apoc.text.split(p.race, ';'), [null]) AS races
        RETURN DISTINCT
          file.id as id,
          p.id as pid,
          file.cytogenomic_file_id AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          'Cytogenomic' AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          p.participant_id AS participant_id,
          CASE sm1.sample_id WHEN sm.sample_id THEN sm.sample_id
                    ELSE sm1.sample_id + ',' + sm.sample_id END AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          COLLECT(DISTINCT {
              race: races,
              sex_at_birth: p.sex_at_birth,
              ethnicity: ethnicities
          }) AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          CASE sm1.sample_id WHEN sm.sample_id THEN COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })
                    ELSE apoc.coll.union(COLLECT(DISTINCT {
                                      sample_anatomic_site: sm1.anatomic_site,
                                      participant_age_at_collection: sm1.participant_age_at_collection,
                                      sample_tumor_status: sm1.sample_tumor_status,
                                      tumor_classification: sm1.tumor_classification
                                  }), COLLECT(DISTINCT {
                                      sample_anatomic_site: sm.anatomic_site,
                                      participant_age_at_collection: sm.participant_age_at_collection,
                                      sample_tumor_status: sm.sample_tumor_status,
                                      tumor_classification: sm.tumor_classification
                                  })) END AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          null AS library_selection,
          null AS library_source,
          null AS library_strategy
        UNION ALL
        /* Files whose sample hangs off a cell_line or pdx node that is attached directly
           to the study. No participant is resolved on this path, so pid, participant_id
           and participant_filters are null; file_id, file_category and the library
           columns are picked per file label.
           NOTE(review): LABELS(file)[0] assumes each file node carries a single label - confirm.
           NOTE(review): follow_up is matched through the p bound by the diagnosis match, so
           vital_status only covers participants that have a diagnosis - confirm this is intended. */
        MATCH (file)
        WHERE (file:sequencing_file OR file:pathology_file OR file:methylation_array_file OR file:single_cell_sequencing_file OR file:cytogenomic_file)
        MATCH (st:study)<-[:of_cell_line|of_pdx]-(cl)<--(sm:sample)
        WHERE (cl: cell_line or cl: pdx)
        MATCH (sm)<--(file)
        OPTIONAL MATCH (st)<--(p:participant)<-[:of_diagnosis]-(dg:diagnosis)
        OPTIONAL MATCH (st)<--(p)<-[:of_follow_up]-(fu:follow_up)
        OPTIONAL MATCH (st)<-[:of_study_personnel]-(stp:study_personnel)
        OPTIONAL MATCH (st)<-[:of_study_funding]-(stf:study_funding)
        RETURN DISTINCT
          file.id as id,
          null as pid,
          CASE LABELS(file)[0]
                WHEN 'sequencing_file' THEN file.sequencing_file_id
                WHEN 'single_cell_sequencing_file' THEN file.single_cell_sequencing_file_id
                WHEN 'cytogenomic_file' THEN file.cytogenomic_file_id
                WHEN 'pathology_file' THEN file.pathology_file_id
                WHEN 'methylation_array_file' THEN file.methylation_array_file_id END AS file_id,
          file.dcf_indexd_guid AS guid,
          file.file_name AS file_name,
          CASE LABELS(file)[0]
                WHEN 'sequencing_file' THEN 'Sequencing'
                WHEN 'single_cell_sequencing_file' THEN 'Single Cell Sequencing'
                WHEN 'cytogenomic_file' THEN 'Cytogenomic'
                WHEN 'pathology_file' THEN 'Pathology imaging'
                WHEN 'methylation_array_file' THEN 'Methylation array' END AS file_category,
          file.file_type AS file_type,
          file.file_description AS file_description,
          file.file_size AS file_size,
          file.md5sum AS md5sum,
          st.study_id AS study_id,
          st.phs_accession as phs_accession,
          st.study_acronym as study_acronym,
          st.study_short_title as study_short_title,
          null AS participant_id,
          sm.sample_id AS sample_id,
          COLLECT(DISTINCT file.id) as files,
          null as combined_filters,
          null AS participant_filters,
          COLLECT(DISTINCT {
              age_at_diagnosis: dg.age_at_diagnosis,
              diagnosis_anatomic_site: dg.anatomic_site,
              disease_phase: dg.disease_phase,
              diagnosis_classification_system: dg.diagnosis_classification_system,
              diagnosis_verification_status: dg.diagnosis_verification_status,
              diagnosis_basis: dg.diagnosis_basis,
              diagnosis_comment: dg.diagnosis_comment,
              diagnosis_classification: dg.diagnosis_classification
          }) AS diagnosis_filters,
          COLLECT(DISTINCT fu.vital_status) as vital_status,
          COLLECT(DISTINCT {
              sample_anatomic_site: sm.anatomic_site,
              participant_age_at_collection: sm.participant_age_at_collection,
              sample_tumor_status: sm.sample_tumor_status,
              tumor_classification: sm.tumor_classification
          }) AS sample_filters,
          COLLECT(DISTINCT stf.grant_id) as grant_id,
          COLLECT(DISTINCT stp.institution) as institution,
          CASE LABELS(file)[0]
                    WHEN 'sequencing_file' THEN file.library_selection
                    WHEN 'single_cell_sequencing_file' THEN file.library_selection
                    ELSE null END AS library_selection,
          CASE LABELS(file)[0]
                    WHEN 'sequencing_file' THEN file.library_source
                    WHEN 'single_cell_sequencing_file' THEN file.library_source
                    ELSE null END AS library_source,
          CASE LABELS(file)[0]
                    WHEN 'sequencing_file' THEN file.library_strategy
                    WHEN 'single_cell_sequencing_file' THEN file.library_strategy
                    ELSE null END AS library_strategy
      "