diff --git a/Example/xml/Consistent_Config_XMLs_LOCAL/ConfigurationOPENGDC_2019_07.xml b/Example/xml/Consistent_Config_XMLs_LOCAL/ConfigurationOPENGDC_2019_07.xml new file mode 100644 index 00000000..830d0aad --- /dev/null +++ b/Example/xml/Consistent_Config_XMLs_LOCAL/ConfigurationOPENGDC_2019_07.xml @@ -0,0 +1,182 @@ + + + + + /Users/abernasconi/Documents/gitProjects/GMQL-Importer/Example/examples_meta + /Users/abernasconi/Documents/gitProjects/GMQL-Importer/src/main/resources/application.conf + TCGA + jdbc:postgresql://localhost/gmql_metadata_anna + geco + geco78 + org.postgresql.Driver + /Users/abernasconi/Documents/gitProjects/GMQL-Importer/Example/flattener_rule_base.txt + false + false + false + true + false + false + false + false + + + + + bioinformatics.iasi.cnr.it + GRCh38_TCGA_OPENGDC + it.polimi.genomics.metadata.downloader_transformer.default.FtpDownloader + it.polimi.genomics.metadata.downloader_transformer.default.NullTransformer + it.polimi.genomics.metadata.step.GMQLLoader + true + true + true + true + true + true + true + + + For metadata key replacement have to refer another file. Should be big file and would not be useful + metadata_replacement + xml/metadataReplacementTcga.xml + + + It is to know which user in GMQL is going to run the import + gmql_user + public + + + For logging in ftp on tcga2bed we need this anonymous user + username + anonymous + + + And also the corresponding null password + password + + + + decides the separator char for metadata names. 
+ metadata_name_separation_char + __ + + + Mappings + mappings + /Users/abernasconi/Documents/gitProjects/GMQL-Importer/Example/xml/settingsTCGAOpenGDC.xml + + + + + + copy_number_segment + ftp://bioinformatics.iasi.cnr.it/opengdc/bed/tcga/tcga-acc/copy_number_segment/header.schema + true + true + true + true + true + true + true + + + The dataset name on the server + loading_name + GRCh38_TCGA_copy_number_2019_07 + + + The dataset description on the server, when clicking Show Info + loading_description + This dataset contains samples of copy number variations (copy number segment).<br> + A copy number variation (CNV) is a variation in the number of copies of a given genomic segment per cell.<br> + The considered experiments include both germline and somatic CNVs.<br> + <br> + It is directly derived from data available on the Genomic Data Commons portal and transformed into GDM format through the openGDC pipeline.<br> + Documentation is available at http://bioinf.iasi.cnr.it/opengdc/data/OpenGDC_format_definition.pdf.<br> + More details are described in the GDC Data User's Guide available at https://docs.gdc.cancer.gov/Data/PDF/Data_UG.pdf and at https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/CNV_Pipeline/.<br> + <br> + The dataset includes tab separated BED files, in which the CNV file is converted, with the following fields:<br> + <ol> + <li>chrom (equal to the 2. field of the GDC CNV file, e.g., "1")</li> + <li>start (equal to the 3. field of the GDC CNV file, e.g., 61735)</li> + <li>end (equal to the 4. field of the GDC CNV file, e.g., 1628826)</li> + <li>strand (unknown, set to '*')</li> + <li>num_probes (equal to the 5. field of the GDC CNV file, e.g., 229)</li> + <li>segment_mean (equal to the 6. 
field of the GDC CNV file, e.g., 0.1756)</li> + </ol> + + + We look for all the copy_number_segment folders inside the bed directory + folder_regex + ^/opengdc/bed/tcga/.*/copy_number_segment + + + We look for all the .bed and .meta files to download + files_regex + .*\.bed(\.meta)?$ + + + + + gene_expression_quantification + ftp://bioinformatics.iasi.cnr.it/opengdc/bed/tcga/tcga-acc/gene_expression_quantification/header.schema + true + true + true + true + true + true + true + + + The dataset name on the server + loading_name + GRCh38_TCGA_gene_expression_2019_07 + + + The dataset description on the server, when clicking Show Info + loading_description + This dataset contains data on gene expression quantification.<br> + It is directly derived from data available on the Genomic Data Commons portal and transformed into GDM format through the openGDC pipeline.<br> + Documentation is available at http://bioinf.iasi.cnr.it/opengdc/data/OpenGDC_format_definition.pdf.<br> + <br> + GDC provides gene expression quantification data in three files for each aliquot:<br> + - FPKM (i.e., Fragments Per Kilobase of transcript per Million mapped reads)<br> + - FPKM-UQ (i.e., Upper Quartile normalized FPKM values)<br> + - counts (i.e., raw mapping counts of reads mapped to each gene)<br> + <br> + More details are described in the GDC Data User's Guide available at https://docs.gdc.cancer.gov/Data/PDF/Data_UG.pdf and at https://gdc.cancer.gov/about-data/data-harmonization-and-generation/genomic-data-harmonization/high-level-data-generation/rna-seq-quantification.<br> + <br> + The three original GDC files have been merged into one single BED file with the following fields:<br> + <ol> + <li>chrom (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, completed with "chr", e.g., "chr2")</li> + <li>start (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., 32277910)</li> + <li>end (retrieved 
from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., 32316594)</li> + <li>strand (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., '+')</li> + <li>ensembl_gene_id (equal to the 1. field of any of the GDC gene expression quantification files, e.g., "ENSG00000119820.9")</li> + <li>entrez_gene_id (retrieved from HUGO Gene Nomenclature Committee (HGNC) according to the Ensembl ID of the gene, e.g., "84272")</li> + <li>gene_symbol (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., "YIPF4")</li> + <li>type (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., "gene")</li> + <li>htseq_count (equal to the 2. field of the GDC counts file, e.g., 1320)</li> + <li>fpkm_uq (equal to the 2. field of the GDC FPKM-UQ file, e.g., 88737.5390983)</li> + <li>fpkm (equal to the 2. field of the GDC FPKM file, e.g., 2.44783943057)</li> + </ol> + + + We look for all the gene_expression_quantification folders inside the bed directory + folder_regex + ^/opengdc/bed/tcga/.*/gene_expression_quantification + + + We look for all the .bed and .meta files to download + files_regex + .*\.bed(\.meta)?$ + + + + + + + diff --git a/src/main/scala/it/polimi/genomics/metadata/mapper/REP/Table/ExperimentTypeREP.scala b/src/main/scala/it/polimi/genomics/metadata/mapper/REP/Table/ExperimentTypeREP.scala index 0eacfbcd..b27738a5 100755 --- a/src/main/scala/it/polimi/genomics/metadata/mapper/REP/Table/ExperimentTypeREP.scala +++ b/src/main/scala/it/polimi/genomics/metadata/mapper/REP/Table/ExperimentTypeREP.scala @@ -20,7 +20,12 @@ class ExperimentTypeREP(repTableId: REPTableId) extends REPTable(repTableId) wit case _ => this.technique = insertMethod(this.technique, param) } case "FEATURE" => this.feature = insertMethod(this.feature,param) - case "TARGET" => this.target = insertMethod(this.target,param) + + case 
"TARGET" => param.toUpperCase() match { + case "DNASE" => this.target = null + case _ => this.target = insertMethod(this.target,param) + } + case "ANTIBODY" => this.antibody = insertMethod(this.antibody,param) case "ONTOLOGICALCODE" => this.ontologicalCode = insertMethod(this.ontologicalCode,param) case "ORIGINALKEY" => this.originalKey = insertMethod(this.originalKey, param)