forked from DEIB-GECO/Metadata-Manager
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fixed Roadmap DNase target which should be null
- Loading branch information
Showing
2 changed files
with
188 additions
and
1 deletion.
There are no files selected for viewing
182 changes: 182 additions & 0 deletions
182
Example/xml/Consistent_Config_XMLs_LOCAL/ConfigurationOPENGDC_2019_07.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
<?xml version="1.0"?> | ||
<root xmlns="http://polimi.it/GDMImporter" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://polimi.it/GDMImporter /Example/xml/configurationSchema.xsd"> | ||
<settings> | ||
<!--BASE WORKING FOLDER FOR THE IMPORTER--> | ||
<base_working_directory>/Users/abernasconi/Documents/gitProjects/GMQL-Importer/Example/examples_meta</base_working_directory> | ||
<gcm_config_file>/Users/abernasconi/Documents/gitProjects/GMQL-Importer/src/main/resources/application.conf</gcm_config_file> | ||
<mapper_source>TCGA</mapper_source> | ||
<database_connection_url>jdbc:postgresql://localhost/gmql_metadata_anna</database_connection_url> | ||
<database_connection_user>geco</database_connection_user> | ||
<database_connection_pw>geco78</database_connection_pw> | ||
<database_connection_driver>org.postgresql.Driver</database_connection_driver> | ||
<flattener_rule_base>/Users/abernasconi/Documents/gitProjects/GMQL-Importer/Example/flattener_rule_base.txt</flattener_rule_base> | ||
<download_enabled>false</download_enabled> | ||
<transform_enabled>false</transform_enabled> | ||
<cleaner_enabled>false</cleaner_enabled> | ||
<mapper_enabled>true</mapper_enabled> | ||
<enricher_enabled>false</enricher_enabled> | ||
<flattener_enabled>false</flattener_enabled> | ||
<load_enabled>false</load_enabled> | ||
<parallel_execution>false</parallel_execution> | ||
</settings> | ||
<source_list> | ||
<source name="GRCh38_TCGA_OPENGDC"> | ||
<!--url is the host in the FTPDownloader--> | ||
<url>bioinformatics.iasi.cnr.it</url> | ||
<source_working_directory>GRCh38_TCGA_OPENGDC</source_working_directory> | ||
<downloader>it.polimi.genomics.metadata.downloader_transformer.default.FtpDownloader</downloader> | ||
<transformer>it.polimi.genomics.metadata.downloader_transformer.default.NullTransformer</transformer> | ||
<loader>it.polimi.genomics.metadata.step.GMQLLoader</loader> | ||
<download_enabled>true</download_enabled> | ||
<transform_enabled>true</transform_enabled> | ||
<cleaner_enabled>true</cleaner_enabled> | ||
<mapper_enabled>true</mapper_enabled> | ||
<enricher_enabled>true</enricher_enabled> | ||
<flattener_enabled>true</flattener_enabled> | ||
<load_enabled>true</load_enabled> | ||
<parameter_list> | ||
<parameter> | ||
<description>For metadata key replacement, refer to another file: the replacement list would be large and not useful to include here</description> | ||
<key>metadata_replacement</key> | ||
<value>xml/metadataReplacementTcga.xml</value> | ||
</parameter> | ||
<parameter> | ||
<description>Specifies which GMQL user is going to run the import</description> | ||
<key>gmql_user</key> | ||
<value>public</value> | ||
</parameter> | ||
<parameter> | ||
<description>For logging in ftp on tcga2bed we need this anonymous user</description> | ||
<key>username</key> | ||
<value>anonymous</value> | ||
</parameter> | ||
<parameter> | ||
<description>And also the corresponding null password</description> | ||
<key>password</key> | ||
<value/> | ||
</parameter> | ||
<parameter> | ||
<description>decides the separator char for metadata names.</description> | ||
<key>metadata_name_separation_char</key> | ||
<value>__</value> | ||
</parameter> | ||
<parameter> | ||
<description>Mappings</description> | ||
<key>mappings</key> | ||
<value>/Users/abernasconi/Documents/gitProjects/GMQL-Importer/Example/xml/settingsTCGAOpenGDC.xml</value> | ||
</parameter> | ||
</parameter_list> | ||
<dataset_list> | ||
<!--THE SCHEMA URL HAS TO BE REFERRED FROM ROOT PATH EX: ROOT PATH/SCHEMA PATH/schema.xml--> | ||
<dataset name="copy_number_segment"> | ||
<dataset_working_directory>copy_number_segment</dataset_working_directory> | ||
<schema_url location="http">ftp://bioinformatics.iasi.cnr.it/opengdc/bed/tcga/tcga-acc/copy_number_segment/header.schema</schema_url> | ||
<download_enabled>true</download_enabled> | ||
<transform_enabled>true</transform_enabled> | ||
<cleaner_enabled>true</cleaner_enabled> | ||
<mapper_enabled>true</mapper_enabled> | ||
<enricher_enabled>true</enricher_enabled> | ||
<flattener_enabled>true</flattener_enabled> | ||
<load_enabled>true</load_enabled> | ||
<parameter_list> | ||
<parameter> | ||
<description>The dataset name on the server</description> | ||
<key>loading_name</key> | ||
<value>GRCh38_TCGA_copy_number_2019_07</value> | ||
</parameter> | ||
<parameter> | ||
<description>The dataset description on the server, when clicking Show Info</description> | ||
<key>loading_description</key> | ||
<value>This dataset contains samples of copy number variations (copy number segment).<br> | ||
A copy number variation (CNV) is a variation in the number of copies of a given genomic segment per cell.<br> | ||
The considered experiments include both germline and somatic CNVs.<br> | ||
<br> | ||
It is directly derived from data available on the Genomic Data Commons portal and transformed into GDM format through the openGDC pipeline.<br> | ||
Documentation is available at http://bioinf.iasi.cnr.it/opengdc/data/OpenGDC_format_definition.pdf.<br> | ||
More details are described in the GDC Data User's Guide available at https://docs.gdc.cancer.gov/Data/PDF/Data_UG.pdf and at https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/CNV_Pipeline/.<br> | ||
<br> | ||
The dataset includes tab separated BED files, in which the CNV file is converted, with the following fields:<br> | ||
<ol> | ||
<li>chrom (equal to the 2. field of the GDC CNV file, e.g., "1")</li> | ||
<li>start (equal to the 3. field of the GDC CNV file, e.g., 61735)</li> | ||
<li>end (equal to the 4. field of the GDC CNV file, e.g., 1628826)</li> | ||
<li>strand (unknown, set to '*')</li> | ||
<li>num_probes (equal to the 5. field of the GDC CNV file, e.g., 229)</li> | ||
<li>segment_mean (equal to the 6. field of the GDC CNV file, e.g., 0.1756)</li> | ||
</ol></value> | ||
</parameter> | ||
<parameter> | ||
<description>We look for all the copy_number_segment folders inside the bed directory</description> | ||
<key>folder_regex</key> | ||
<value>^/opengdc/bed/tcga/.*/copy_number_segment</value> | ||
</parameter> | ||
<parameter> | ||
<description>We look for all the .bed and .meta files to download</description> | ||
<key>files_regex</key> | ||
<value>.*\.bed(\.meta)?$</value> | ||
</parameter> | ||
</parameter_list> | ||
</dataset> | ||
<dataset name="gene_expression_quantification"> | ||
<dataset_working_directory>gene_expression_quantification</dataset_working_directory> | ||
<schema_url location="http">ftp://bioinformatics.iasi.cnr.it/opengdc/bed/tcga/tcga-acc/gene_expression_quantification/header.schema</schema_url> | ||
<download_enabled>true</download_enabled> | ||
<transform_enabled>true</transform_enabled> | ||
<cleaner_enabled>true</cleaner_enabled> | ||
<mapper_enabled>true</mapper_enabled> | ||
<enricher_enabled>true</enricher_enabled> | ||
<flattener_enabled>true</flattener_enabled> | ||
<load_enabled>true</load_enabled> | ||
<parameter_list> | ||
<parameter> | ||
<description>The dataset name on the server</description> | ||
<key>loading_name</key> | ||
<value>GRCh38_TCGA_gene_expression_2019_07</value> | ||
</parameter> | ||
<parameter> | ||
<description>The dataset description on the server, when clicking Show Info</description> | ||
<key>loading_description</key> | ||
<value>This dataset contains data on gene expression quantification.<br> | ||
It is directly derived from data available on the Genomic Data Commons portal and transformed into GDM format through the openGDC pipeline.<br> | ||
Documentation is available at http://bioinf.iasi.cnr.it/opengdc/data/OpenGDC_format_definition.pdf.<br> | ||
<br> | ||
GDC provides gene expression quantification data in three files for each aliquot:<br> | ||
- FPKM (i.e., Fragments Per Kilobase of transcript per Million mapped reads)<br> | ||
- FPKM-UQ (i.e., Upper Quartile normalized FPKM values)<br> | ||
- counts (i.e., raw mapping counts of reads mapped to each gene)<br> | ||
<br> | ||
More details are described in the GDC Data User's Guide available at https://docs.gdc.cancer.gov/Data/PDF/Data_UG.pdf and at https://gdc.cancer.gov/about-data/data-harmonization-and-generation/genomic-data-harmonization/high-level-data-generation/rna-seq-quantification.<br> | ||
<br> | ||
The three original GDC files have been merged into one single BED file with the following fields:<br> | ||
<ol> | ||
<li>chrom (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, completed with "chr", e.g., "chr2")</li> | ||
<li>start (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., 32277910)</li> | ||
<li>end (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., 32316594)</li> | ||
<li>strand (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., '+')</li> | ||
<li>ensembl_gene_id (equal to the 1. field of any of the GDC gene expression quantification files, e.g., "ENSG00000119820.9")</li> | ||
<li>entrez_gene_id (retrieved from HUGO Gene Nomenclature Committee (HGNC) according to the Ensembl ID of the gene, e.g., "84272")</li> | ||
<li>gene_symbol (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., "YIPF4")</li> | ||
<li>type (retrieved from GDC.h38 GENCODE v22 GTF annotation file according to the Ensembl ID of the gene, e.g., "gene")</li> | ||
<li>htseq_count (equal to the 2. field of the GDC counts file, e.g., 1320)</li> | ||
<li>fpkm_uq (equal to the 2. field of the GDC FPKM-UQ file, e.g., 88737.5390983)</li> | ||
<li>fpkm (equal to the 2. field of the GDC FPKM file, e.g., 2.44783943057)</li> | ||
</ol></value> | ||
</parameter> | ||
<parameter> | ||
<description>We look for all the gene_expression_quantification folders inside the bed directory</description> | ||
<key>folder_regex</key> | ||
<value>^/opengdc/bed/tcga/.*/gene_expression_quantification</value> | ||
</parameter> | ||
<parameter> | ||
<description>We look for all the .bed and .meta files to download</description> | ||
<key>files_regex</key> | ||
<value>.*\.bed(\.meta)?$</value> | ||
</parameter> | ||
</parameter_list> | ||
</dataset> | ||
</dataset_list> | ||
</source> | ||
</source_list> | ||
</root> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters