From 7233e25430982588711e11f379d787a5ccb7cab5 Mon Sep 17 00:00:00 2001 From: AmitBinf Date: Tue, 11 Mar 2025 10:08:41 -0500 Subject: [PATCH 1/2] Updating import yaml files to fix fastq.gz.md5 import issues --- configs/import-mt.yaml | 2 +- configs/import.yaml | 2 +- tests/test_import_mapper.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/import-mt.yaml b/configs/import-mt.yaml index 0536caa9..789b13ed 100644 --- a/configs/import-mt.yaml +++ b/configs/import-mt.yaml @@ -128,7 +128,7 @@ Data Objects: - data_object_type: Metagenome Raw Reads description: Metagenome Raw Reads for {id} name: Raw sequencer read data - import_suffix: .[A-Z]+-[A-Z]+.fastq.gz + import_suffix: \.[ACGT]+-[ACGT]+\.fastq\.gz$ nmdc_suffix: .fastq.gz input_to: [nmdc:ReadQcAnalysis] output_of: nmdc:NucleotideSequencing diff --git a/configs/import.yaml b/configs/import.yaml index 50622faf..56941ade 100644 --- a/configs/import.yaml +++ b/configs/import.yaml @@ -159,7 +159,7 @@ Data Objects: - data_object_type: Metagenome Raw Reads description: Metagenome Raw Reads for {id} name: Raw sequencer read data - import_suffix: .[A,C,G,T]+-[A,C,G,T]+.fastq.gz + import_suffix: \.[ACGT]+-[ACGT]+\.fastq\.gz$ nmdc_suffix: .fastq.gz input_to: [nmdc:ReadQcAnalysis] output_of: nmdc:NucleotideSequencing diff --git a/tests/test_import_mapper.py b/tests/test_import_mapper.py index 8ce51096..d0455560 100644 --- a/tests/test_import_mapper.py +++ b/tests/test_import_mapper.py @@ -31,6 +31,8 @@ def mock_minted_ids(): def test_update_do_mappings_from_import_files(import_mapper_instance): import_mapper_instance.update_do_mappings_from_import_files() + for fm_all in import_mapper_instance.mappings: + print(fm_all, "\n\n") assert len(import_mapper_instance.mappings) == 22 From 5883a2853b867c0d3a4dee07d61a2c569d7d7cec Mon Sep 17 00:00:00 2001 From: AmitBinf Date: Thu, 13 Mar 2025 09:39:14 -0500 Subject: [PATCH 2/2] Do not pick files ending in .md5 --- configs/import-mt.yaml | 72 ++++++------ 
configs/import.yaml | 90 +++++++-------- ...CT-TACACGCT.filter-METAGENOME.fastq.gz.md5 | 1 + .../Ga0597026_bins_1.tar.gz.md5 | 1 + .../Ga0597026_bins_2.tar.gz.md5 | 1 + .../Ga0597026_cath_funfam.gff.md5 | 1 + .../Ga0597026_cds_proteins.faa.md5 | 1 + .../import_project_dir/Ga0597026_cog.gff.md5 | 1 + .../Ga0597026_contig_names_mapping.tsv.md5 | 1 + .../import_project_dir/Ga0597026_crt.gff.md5 | 1 + tests/import_project_dir/Ga0597026_ec.tsv.md5 | 1 + .../Ga0597026_functional_annotation.gff.md5 | 1 + .../Ga0597026_genemark.gff.md5 | 1 + .../Ga0597026_genemark_proteins.faa.md5 | 1 + tests/import_project_dir/Ga0597026_ko.tsv.md5 | 1 + .../import_project_dir/Ga0597026_pfam.gff.md5 | 1 + .../Ga0597026_prodigal.gff.md5 | 1 + .../Ga0597026_prodigal_proteins.faa.md5 | 1 + .../Ga0597026_proteins.faa.md5 | 1 + .../Ga0597026_smart.gff.md5 | 1 + .../Ga0597026_structural_annotation.gff.md5 | 1 + .../Ga0597026_supfam.gff.md5 | 1 + .../Ga0597026_tigrfam.gff.md5 | 1 + .../import_project_dir/Ga0597026_trna.gff.md5 | 1 + tests/import_project_dir/README.txt.md5 | 1 + .../nmdc:omprc-11-importT_minted_ids.json.md5 | 1 + .../pairedMapped.sam.gz.md5 | 1 + .../pairedMapped_sorted.bam.cov.md5 | 1 + tests/import_test.yaml | 106 ++++++++++-------- tests/test_import_mapper.py | 5 +- 30 files changed, 168 insertions(+), 131 deletions(-) create mode 100644 tests/import_project_dir/52710.1.424012.TACACGCT-TACACGCT.filter-METAGENOME.fastq.gz.md5 create mode 100644 tests/import_project_dir/Ga0597026_bins_1.tar.gz.md5 create mode 100644 tests/import_project_dir/Ga0597026_bins_2.tar.gz.md5 create mode 100644 tests/import_project_dir/Ga0597026_cath_funfam.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_cds_proteins.faa.md5 create mode 100644 tests/import_project_dir/Ga0597026_cog.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_contig_names_mapping.tsv.md5 create mode 100644 tests/import_project_dir/Ga0597026_crt.gff.md5 create mode 100644 
tests/import_project_dir/Ga0597026_ec.tsv.md5 create mode 100644 tests/import_project_dir/Ga0597026_functional_annotation.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_genemark.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_genemark_proteins.faa.md5 create mode 100644 tests/import_project_dir/Ga0597026_ko.tsv.md5 create mode 100644 tests/import_project_dir/Ga0597026_pfam.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_prodigal.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_prodigal_proteins.faa.md5 create mode 100644 tests/import_project_dir/Ga0597026_proteins.faa.md5 create mode 100644 tests/import_project_dir/Ga0597026_smart.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_structural_annotation.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_supfam.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_tigrfam.gff.md5 create mode 100644 tests/import_project_dir/Ga0597026_trna.gff.md5 create mode 100644 tests/import_project_dir/README.txt.md5 create mode 100644 tests/import_project_dir/nmdc:omprc-11-importT_minted_ids.json.md5 create mode 100644 tests/import_project_dir/pairedMapped.sam.gz.md5 create mode 100644 tests/import_project_dir/pairedMapped_sorted.bam.cov.md5 diff --git a/configs/import-mt.yaml b/configs/import-mt.yaml index 789b13ed..d73a9cc2 100644 --- a/configs/import-mt.yaml +++ b/configs/import-mt.yaml @@ -137,7 +137,7 @@ Data Objects: - data_object_type: Annotation Amino Acid FASTA description: FASTA Amino Acid File for {id} name: FASTA amino acid file for annotated proteins - import_suffix: _proteins.faa + import_suffix: "^(?!.*_(cds|genemark|prodigal)_proteins\\.faa$).*proteins\\.faa$" nmdc_suffix: _proteins.faa input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -146,7 +146,7 @@ Data Objects: - data_object_type: Contig Mapping File description: Contig mapping file for {id} name: Contig mappings between old and new contig names - import_suffix: 
_contig_names_mapping.tsv + import_suffix: "_contig_names_mapping\\.tsv$" nmdc_suffix: _contig_names_mapping.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -155,7 +155,7 @@ Data Objects: - data_object_type: Structural Annotation GFF description: Structural Annotation for {id} name: GFF3 format file with structural annotations - import_suffix: _structural_annotation.gff + import_suffix: _structural_annotation\.gff$ nmdc_suffix: _structural_annotation.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -164,7 +164,7 @@ Data Objects: - data_object_type: Functional Annotation GFF description: Functional Annotation for {id} name: GFF3 format file with functional annotations - import_suffix: _functional_annotation.gff + import_suffix: _functional_annotation\.gff$ nmdc_suffix: _functional_annotation.gff input_to: [nmdc:MetatranscriptomeExpressionAnalysis] output_of: nmdc:MetatranscriptomeAnnotation @@ -173,7 +173,7 @@ Data Objects: - data_object_type: Annotation KEGG Orthology description: KEGG Orthology for {id} name: Tab delimited file for KO annotation - import_suffix: _ko.tsv + import_suffix: _ko\.tsv$ nmdc_suffix: _ko.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -182,7 +182,7 @@ Data Objects: - data_object_type: Annotation Enzyme Commission description: EC Annotations for {id} name: Tab delimited file for EC annotation - import_suffix: _ec.tsv + import_suffix: _ec\.tsv$ nmdc_suffix: _ec.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -191,7 +191,7 @@ Data Objects: - data_object_type Scaffold Lineage tsv description: Scaffold Lineage tsv for {id} name: Phylogeny at the scaffold level - import_suffix: _scaffold_lineage.tsv + import_suffix: _scaffold_lineage\.tsv$ nmdc_suffix: _scaffold_lineage.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -199,7 +199,7 @@ Data Objects: - data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF description: COGs for {id} name: GFF3 format file 
with COGs - import_suffix: _cog.gff + import_suffix: _cog\.gff$ nmdc_suffix: _cog.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -208,7 +208,7 @@ Data Objects: - data_object_type: Pfam Annotation GFF description: Pfam Annotation for {id} name: GFF3 format file with Pfam - import_suffix: _pfam.gff + import_suffix: _pfam\.gff$ nmdc_suffix: _pfam.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -217,7 +217,7 @@ Data Objects: - data_object_type: TIGRFam Annotation GFF description: TIGRFam for {id} name: GFF3 format file with TIGRfam - import_suffix: _tigrfam.gff + import_suffix: _tigrfam\.gff$ nmdc_suffix: _tigrfam.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -226,7 +226,7 @@ Data Objects: - data_object_type: SMART Annotation GFF description: SMART Annotations for {id} name: GFF3 format file with SMART - import_suffix: _smart.gff + import_suffix: _smart\.gff$ nmdc_suffix: _smart.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -235,7 +235,7 @@ Data Objects: - data_object_type: SUPERFam Annotation GFF description: SUPERFam Annotations for {id} name: GFF3 format file with SUPERFam - import_suffix: _supfam.gff + import_suffix: _supfam\.gff$ nmdc_suffix: _supfam.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -244,7 +244,7 @@ Data Objects: - data_object_type: CATH FunFams (Functional Families) Annotation GFF description: CATH FunFams for {id} name: GFF3 format file with CATH FunFams - import_suffix: _cath_funfam.gff + import_suffix: _cath_funfam\.gff$ nmdc_suffix: _cath_funfam.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -253,7 +253,7 @@ Data Objects: - data_object_type: CRT Annotation GFF description: CRT Annotations for {id} name: GFF3 format file with CRT - import_suffix: _crt.gff + import_suffix: _crt\.gff$ nmdc_suffix: _crt.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -262,7 +262,7 @@ Data Objects: - data_object_type: Genemark Annotation GFF 
description: Genemark Annotations for {id} name: GFF3 format file with Genemark - import_suffix: _genemark.gff + import_suffix: _genemark\.gff$ nmdc_suffix: _genemark.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -271,7 +271,7 @@ Data Objects: - data_object_type: Prodigal Annotation GFF description: Prodigal Annotations {id} name: GFF3 format file with Prodigal - import_suffix: _prodigal.gff + import_suffix: _prodigal\.gff$ nmdc_suffix: _prodigal.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -280,7 +280,7 @@ Data Objects: - data_object_type: TRNA Annotation GFF description: TRNA Annotations {id} name: GFF3 format file with TRNA - import_suffix: _trna.gff + import_suffix: _trna\.gff$ nmdc_suffix: _trna.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -289,7 +289,7 @@ Data Objects: - data_object_type: RFAM Annotation GFF description: RFAM Annotations for {id} name: GFF3 format file with RFAM - import_suffix: _rfam.gff + import_suffix: _rfam\.gff$ nmdc_suffix: _rfam.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -298,7 +298,7 @@ Data Objects: - data_object_type: KO_EC Annotation GFF description: KO_EC Annotations for {id} name: GFF3 format file with KO_EC - import_suffix: _ko_ec.gff + import_suffix: _ko_ec\.gff$ nmdc_suffix: _ko_ec.gff input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -307,7 +307,7 @@ Data Objects: - data_object_type: Product Names description: Product names for {id} name: Product names file - import_suffix: _product_names.tsv + import_suffix: _product_names\.tsv$ nmdc_suffix: _product_names.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -316,7 +316,7 @@ Data Objects: - data_object_type: Gene Phylogeny tsv description: Gene Phylogeny for {id} name: Gene Phylogeny file - import_suffix: _gene_phylogeny.tsv + import_suffix: _gene_phylogeny\.tsv$ nmdc_suffix: _gene_phylogeny.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -325,7 +325,7 @@ Data 
Objects: - data_object_type: Crispr Terms description: Crispr Terms for {id} name: Crispr Terms - import_suffix: _crt.crisprs + import_suffix: _crt\.crisprs$ nmdc_suffix: _crt.crisprs input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -334,7 +334,7 @@ Data Objects: - data_object_type: Annotation Statistics description: Annotation Stats for {id} name: Annotation statistics report - import_suffix: _stats.tsv + import_suffix: _stats\.tsv$ nmdc_suffix: _stats.tsv input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -343,7 +343,7 @@ Data Objects: - data_object_type: Annotation Info File description: Annotation Info File for {id} name: File containing annotation info - import_suffix: _imgap.info + import_suffix: _imgap\.info$ nmdc_suffix: _imgap.info input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -351,7 +351,7 @@ Data Objects: action: rename - data_object_type: Assembly Contigs description: Assembly contigs (remapped) for {id} - import_suffix: _contigs.fna + import_suffix: _contigs\.fna$ nmdc_suffix: _renamed_contigs.fna input_to: [] output_of: nmdc:MetatranscriptomeAnnotation @@ -359,7 +359,7 @@ Data Objects: - data_object_type: Filtered Sequencing Reads description: Reads QC for {id} name: Reads QC result fastq (clean data) - import_suffix: filter-MTF.fastq.gz + import_suffix: filter-MTF\.fastq\.gz$ nmdc_suffix: _filtered.fastq.gz input_to: [nmdc:MetatranscriptomeAssembly] output_of: nmdc:ReadQcAnalysis @@ -368,7 +368,7 @@ Data Objects: - data_object_type: rRNA Filtered Sequencing Reads description: Reads QC rRNA reads file for {id} name: Reads QC rRNA reads result fastq (clean data) - import_suffix: .rRNA.fastq.gz + import_suffix: \.rRNA\.fastq\.gz$ nmdc_suffix: _rRNA.fastq.gz input_to: [] output_of: nmdc:ReadQcAnalysis @@ -377,7 +377,7 @@ Data Objects: - data_object_type: QC Statistics description: Reads QC summary for {id} name: Reads QC summary statistics - import_suffix: .filtered-report.txt + import_suffix: \.filtered-report\.txt$ 
nmdc_suffix: _filterStats.txt input_to: [] output_of: nmdc:ReadQcAnalysis @@ -386,7 +386,7 @@ Data Objects: - data_object_type: Read Filtering Info File description: Read Filtering Info File for {id} name: File containing read filtering information - import_suffix: .filter_cmd-MTF.sh + import_suffix: \.filter_cmd-MTF\.sh$ nmdc_suffix: _readsQC.info input_to: [] output_of: nmdc:ReadQcAnalysis @@ -395,7 +395,7 @@ Data Objects: - data_object_type: Assembly Contigs description: Assembly contigs for {id} name: Final assembly contigs fasta - import_suffix: assembly.contigs.fasta + import_suffix: assembly\.contigs\.fasta$ nmdc_suffix: _contigs.fna input_to: [nmdc:MetatranscriptomeAnnotation] output_of: nmdc:MetatranscriptomeAssembly @@ -404,7 +404,7 @@ Data Objects: - data_object_type: Assembly Info File description: Assembly info file for {id} name: File containing assembly information - import_suffix: README.txt + import_suffix: README\.txt$ nmdc_suffix: _metaAsm.info input_to: [] output_of: nmdc:MetatranscriptomeAssembly @@ -414,7 +414,7 @@ Data Objects: description: Coverage Stats for {id} name: Assembled contigs coverage information - import_suffix: pairedMapped_sorted.bam.cov + import_suffix: pairedMapped_sorted\.bam\.cov$ nmdc_suffix: _covstats.txt input_to: [] output_of: nmdc:MetatranscriptomeAssembly multiple: false @@ -422,7 +422,7 @@ Data Objects: - data_object_type: Assembly Coverage BAM description: Sorted Bam for {id} name: Sorted bam file of reads mapping back to the final assembly - import_suffix: pairedMapped.bam.gz + import_suffix: pairedMapped\.bam\.gz$ nmdc_suffix: _pairedMapped_sorted.bam.gz input_to: [nmdc:MetatranscriptomeExpressionAnalysis] output_of: nmdc:MetatranscriptomeAssembly @@ -431,7 +431,7 @@ Data Objects: - data_object_type: BAI File description: Alignment index file for {id} name: BAM index file - import_suffix: _pairedMapped_sorted.bam.bai + import_suffix: _pairedMapped_sorted\.bam\.bai$ nmdc_suffix: _pairedMapped_sorted.bam.bai input_to: [] output_of:
nmdc:MetatranscriptomeAssembly @@ -440,7 +440,7 @@ Data Objects: - data_object_type: Metatranscriptome Expression description: Expression counts for {id} name: Expression counts file - import_suffix: .rnaseq_gea.txt + import_suffix: \.rnaseq_gea\.txt$ nmdc_suffix: _rnaseq_gea.txt input_to: [] output_of: nmdc:MetatranscriptomeExpressionAnalysis @@ -449,7 +449,7 @@ Data Objects: - data_object_type: Metatranscriptome Expression Intergenic description: Expression intergenic counts for {id} name: Expression intergenic counts file - import_suffix: .rnaseq_gea.intergenic.txt + import_suffix: \.rnaseq_gea\.intergenic\.txt$ nmdc_suffix: _rnaseq_gea.intergenic.txt input_to: [] output_of: nmdc:MetatranscriptomeExpressionAnalysis diff --git a/configs/import.yaml b/configs/import.yaml index 56941ade..3888f61f 100644 --- a/configs/import.yaml +++ b/configs/import.yaml @@ -168,7 +168,7 @@ Data Objects: - data_object_type: CheckM Statistics description: CheckM for {id} name: CheckM statistics report - import_suffix: _checkm_qa.out + import_suffix: _checkm_qa\.out$ nmdc_suffix: _checkm_qa.out input_to: [] output_of: nmdc:MagsAnalysis @@ -177,7 +177,7 @@ Data Objects: - data_object_type: GTDBTK Bacterial Summary description: Bacterial Summary for {id} name: GTDBTK bacterial summary - import_suffix: _gtdbtk.bac122.summary.tsv + import_suffix: _gtdbtk\.bac122\.summary\.tsv$ nmdc_suffix: _gtdbtk.bac122.summary.tsv input_to: [] output_of: nmdc:MagsAnalysis @@ -186,7 +186,7 @@ Data Objects: - data_object_type: GTDBTK Archaeal Summary description: Archaeal Summary for {id} name: GTDBTK archaeal summary - import_suffix: _gtdbtk.ar122.summary.tsv + import_suffix: _gtdbtk\.ar122\.summary\.tsv$ nmdc_suffix: _gtdbtk.ar122.summary.tsv input_to: [] output_of: nmdc:MagsAnalysis @@ -204,7 +204,7 @@ Data Objects: - data_object_type: Contig Mapping File description: Contig mapping file for {id} name: Contig mappings between contigs and scaffolds - import_suffix: _contig_names_mapping.tsv + 
import_suffix: "_contig_names_mapping\\.tsv$" nmdc_suffix: _contig_names_mapping.tsv input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -213,7 +213,7 @@ Data Objects: - data_object_type: Structural Annotation GFF description: Structural Annotation for {id} name: GFF3 format file with structural annotations - import_suffix: _structural_annotation.gff + import_suffix: _structural_annotation\.gff$ nmdc_suffix: _structural_annotation.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -222,7 +222,7 @@ Data Objects: - data_object_type: Functional Annotation GFF description: Functional Annotation for {id} name: GFF3 format file with functional annotations - import_suffix: _functional_annotation.gff + import_suffix: _functional_annotation\.gff$ nmdc_suffix: _functional_annotation.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -231,7 +231,7 @@ Data Objects: - data_object_type: Annotation KEGG Orthology description: KEGG Orthology for {id} name: Tab delimited file for KO annotation - import_suffix: _ko.tsv + import_suffix: _ko\.tsv$ nmdc_suffix: _ko.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -240,7 +240,7 @@ Data Objects: - data_object_type: Annotation Enzyme Commission description: EC Annotations for {id} name: Tab delimited file for EC annotation - import_suffix: _ec.tsv + import_suffix: _ec\.tsv$ nmdc_suffix: _ec.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -249,7 +249,7 @@ Data Objects: - data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF description: COGs for {id} name: GFF3 format file with COGs - import_suffix: _cog.gff + import_suffix: _cog\.gff$ nmdc_suffix: _cog.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -258,7 +258,7 @@ Data Objects: - data_object_type: Pfam Annotation GFF description: Pfam Annotation for {id} name: GFF3 format file with Pfam - import_suffix: _pfam.gff + import_suffix: _pfam\.gff$ nmdc_suffix: _pfam.gff 
input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -267,7 +267,7 @@ Data Objects: - data_object_type: TIGRFam Annotation GFF description: TIGRFam for {id} name: GFF3 format file with TIGRfam - import_suffix: _tigrfam.gff + import_suffix: _tigrfam\.gff$ nmdc_suffix: _tigrfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -276,7 +276,7 @@ Data Objects: - data_object_type: SMART Annotation GFF description: SMART Annotations for {id} name: GFF3 format file with SMART - import_suffix: _smart.gff + import_suffix: _smart\.gff$ nmdc_suffix: _smart.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -285,7 +285,7 @@ Data Objects: - data_object_type: SUPERFam Annotation GFF description: SUPERFam Annotations for {id} name: GFF3 format file with SUPERFam - import_suffix: _supfam.gff + import_suffix: _supfam\.gff$ nmdc_suffix: _supfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -294,7 +294,7 @@ Data Objects: - data_object_type: CATH FunFams (Functional Families) Annotation GFF description: CATH FunFams for {id} name: GFF3 format file with CATH FunFams - import_suffix: _cath_funfam.gff + import_suffix: _cath_funfam\.gff$ nmdc_suffix: _cath_funfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -303,7 +303,7 @@ Data Objects: - data_object_type: CRT Annotation GFF description: CRT Annotations for {id} name: GFF3 format file with CRT - import_suffix: _crt.gff + import_suffix: _crt\.gff$ nmdc_suffix: _crt.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -312,7 +312,7 @@ Data Objects: - data_object_type: Genemark Annotation GFF description: Genemark Annotations for {id} name: GFF3 format file with Genemark - import_suffix: _genemark.gff + import_suffix: _genemark\.gff$ nmdc_suffix: _genemark.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -321,7 +321,7 @@ Data Objects: - data_object_type: Prodigal Annotation GFF description: Prodigal Annotations 
{id} name: GFF3 format file with Prodigal - import_suffix: _prodigal.gff + import_suffix: _prodigal\.gff$ nmdc_suffix: _prodigal.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -330,7 +330,7 @@ Data Objects: - data_object_type: TRNA Annotation GFF description: TRNA Annotations {id} name: GFF3 format file with TRNA - import_suffix: _trna.gff + import_suffix: _trna\.gff$ nmdc_suffix: _trna.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -339,7 +339,7 @@ Data Objects: - data_object_type: RFAM Annotation GFF description: RFAM Annotations for {id} name: GFF3 format file with RFAM - import_suffix: _rfam.gff + import_suffix: _rfam\.gff$ nmdc_suffix: _rfam.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -348,7 +348,7 @@ Data Objects: - data_object_type: KO_EC Annotation GFF description: KO_EC Annotations for {id} name: GFF3 format file with KO_EC - import_suffix: _ko_ec.gff + import_suffix: _ko_ec\.gff$ nmdc_suffix: _ko_ec.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -357,7 +357,7 @@ Data Objects: - data_object_type: Product Names description: Product names for {id} name: Product names file - import_suffix: _product_names.tsv + import_suffix: _product_names\.tsv$ nmdc_suffix: _product_names.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -366,7 +366,7 @@ Data Objects: - data_object_type: Gene Phylogeny tsv description: Gene Phylogeny for {id} name: Gene Phylogeny file - import_suffix: _gene_phylogeny.tsv + import_suffix: _gene_phylogeny\.tsv$ nmdc_suffix: _gene_phylogeny.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -375,7 +375,7 @@ Data Objects: - data_object_type: Crispr Terms description: Crispr Terms for {id} name: Crispr Terms - import_suffix: _crt.crisprs + import_suffix: _crt\.crisprs$ nmdc_suffix: _crt.crisprs input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -384,7 +384,7 @@ Data Objects: - data_object_type: Annotation Statistics description: Annotation Stats for {id} 
name: Annotation statistics report - import_suffix: _stats.tsv + import_suffix: _stats\.tsv$ nmdc_suffix: _stats.tsv input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -393,7 +393,7 @@ Data Objects: - data_object_type: Annotation Info File description: Annotation Info File for {id} name: File containing annotation info - import_suffix: _imgap.info + import_suffix: _imgap\.info$ nmdc_suffix: _imgap.info input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -402,7 +402,7 @@ Data Objects: - data_object_type: Filtered Sequencing Reads description: Reads QC for {id} name: Reads QC result fastq (clean data) - import_suffix: filter-METAGENOME.fastq.gz + import_suffix: filter-METAGENOME\.fastq\.gz$ nmdc_suffix: _filtered.fastq.gz input_to: [nmdc:ReadBasedTaxonomyAnalysis,nmdc:MetagenomeAssembly] output_of: nmdc:ReadQcAnalysis @@ -411,7 +411,7 @@ Data Objects: - data_object_type: QC Statistics description: Reads QC summary for {id} name: Reads QC summary statistics - import_suffix: .filtered-report.txt + import_suffix: \.filtered-report\.txt$ nmdc_suffix: _filterStats.txt input_to: [] output_of: nmdc:ReadQcAnalysis @@ -420,7 +420,7 @@ Data Objects: - data_object_type: Read Filtering Info File description: Read Filtering Info File for {id} name: File containing read filtering information - import_suffix: _readsQC.info + import_suffix: _readsQC\.info$ nmdc_suffix: _readsQC.info input_to: [] output_of: nmdc:ReadQcAnalysis @@ -429,7 +429,7 @@ Data Objects: - data_object_type: Assembly Contigs description: Assembly contigs for {id} name: Final assembly contigs fasta - import_suffix: assembly.contigs.fasta + import_suffix: assembly\.contigs\.fasta$ nmdc_suffix: _contigs.fna input_to: [nmdc:MetagenomeAnnotation,nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAssembly @@ -438,7 +438,7 @@ Data Objects: - data_object_type: Assembly Scaffolds description: Assembly scaffolds for {id} name: Final assembly scaffolds fasta - import_suffix: _scaffolds.fna + import_suffix: _scaffolds\.fna$ 
nmdc_suffix: _scaffolds.fna input_to: [] output_of: nmdc:MetagenomeAssembly @@ -447,7 +447,7 @@ Data Objects: - data_object_type: Assembly Info File description: Assembly info file for {id} name: File containing assembly information - import_suffix: README.txt + import_suffix: README\.txt$ nmdc_suffix: _metaAsm.info input_to: [] output_of: nmdc:MetagenomeAssembly @@ -456,7 +456,7 @@ Data Objects: - data_object_type: Assembly Coverage Stats description: Coverage Stats for {id} name: Assembled contigs coverage information - import_suffix: pairedMapped_sorted.bam.cov + import_suffix: pairedMapped_sorted\.bam\.cov$ nmdc_suffix: _covstats.txt input_to: [] output_of: nmdc:MetagenomeAssembly @@ -465,7 +465,7 @@ Data Objects: - data_object_type: Assembly AGP description: AGP for {id} name: An AGP format file that describes the assembly - import_suffix: _assembly.agp + import_suffix: _assembly\.agp$ nmdc_suffix: _assembly.agp input_to: [] output_of: nmdc:MetagenomeAssembly @@ -474,7 +474,7 @@ Data Objects: - data_object_type: Assembly Coverage BAM description: Sorted Bam for {id} name: Sorted bam file of reads mapping back to the final assembly - import_suffix: pairedMapped.sam.gz + import_suffix: pairedMapped\.sam\.gz$ nmdc_suffix: _pairedMapped_sorted.sam.gz input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAssembly @@ -483,7 +483,7 @@ Data Objects: - data_object_type: Error Corrected Reads description: Error correctde reads for {id} name: bbcms error corrected reads - import_suffix: input.corr.fastq.gz + import_suffix: input\.corr\.fastq\.gz$ nmdc_suffix: _input.corr.fastq.gz input_to: [] output_of: nmdc:MetagenomeAssembly @@ -492,7 +492,7 @@ Data Objects: - data_object_type: GOTTCHA2 Report Full description: GOTTCHA2 Full Report for {id} name: GOTTCHA2 report file - import_suffix: _gottcha2_full.tsv + import_suffix: _gottcha2_full\.tsv$ nmdc_suffix: _gottcha2_full.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -501,7 +501,7 @@ Data Objects: - 
data_object_type: GOTTCHA2 Classification Report description: GOTTCHA2 Classification for {id} name: GOTTCHA2 classification report file - import_suffix: _gottcha2_classification.tsv + import_suffix: _gottcha2_classification\.tsv$ nmdc_suffix: _gottcha2_classification.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -510,7 +510,7 @@ Data Objects: - data_object_type: GOTTCHA2 Krona Plot description: GOTTCHA2 Krona for {id} name: GOTTCHA2 krona plot HTML file - import_suffix: _gottcha2_krona.html + import_suffix: _gottcha2_krona\.html$ nmdc_suffix: _gottcha2_krona.html input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -519,7 +519,7 @@ Data Objects: - data_object_type: Centrifuge Taxonomic Classification description: Centrifuge Report for {id} name: Centrifuge output read classification file - import_suffix: _centrifuge_classification.tsv + import_suffix: _centrifuge_classification\.tsv$ nmdc_suffix: _centrifuge_classification.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -528,7 +528,7 @@ Data Objects: - data_object_type: Centrifuge output report file description: Centrifuge output report file for {id} name: Centrifuge Classification Report - import_suffix: _centrifuge_report.tsv + import_suffix: _centrifuge_report\.tsv$ nmdc_suffix: _centrifuge_report.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -537,7 +537,7 @@ Data Objects: - data_object_type: Centrifuge Krona Plot description: Centrifuge Krona for {id} name: Centrifuge krona plot HTML file - import_suffix: _centrifuge_krona.html + import_suffix: _centrifuge_krona\.html$ nmdc_suffix: _centrifuge_krona.html input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -546,7 +546,7 @@ Data Objects: - data_object_type: Kraken2 Classification Report description: Kraken2 report for {id} name: Kraken2 outpur report file - import_suffix: _kraken2_report.tsv + import_suffix: _kraken2_report\.tsv$ nmdc_suffix: _kraken2_report.tsv input_to: [] output_of: 
nmdc:ReadBasedTaxonomyAnalysis @@ -555,7 +555,7 @@ Data Objects: - data_object_type: Kraken2 Taxonomic Classification description: Kraken2 classification for {id} name: Kraken2 output read classification file - import_suffix: _kraken2_classification.tsv + import_suffix: _kraken2_classification\.tsv$ nmdc_suffix: _kraken2_classification.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -564,7 +564,7 @@ Data Objects: - data_object_type: Kraken2 Krona Plot description: Kraken2 Krona plot for {id} name: Kraken2 Krona plot HTML file - import_suffix: _kraken2_krona.html + import_suffix: _kraken2_krona\.html$ nmdc_suffix: _kraken2_krona.html input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -574,7 +574,7 @@ Data Objects: - data_object_type: Metagenome HQMQ Bins Compression File description: Metagenome Bins for {id} name: Metagenome bin tarfiles archive - import_suffix: _[0-9]+.tar.gz + import_suffix: _[0-9]+\.tar\.gz$ nmdc_suffix: _hqmq_bin.zip input_to: [] output_of: nmdc:MagsAnalysis diff --git a/tests/import_project_dir/52710.1.424012.TACACGCT-TACACGCT.filter-METAGENOME.fastq.gz.md5 b/tests/import_project_dir/52710.1.424012.TACACGCT-TACACGCT.filter-METAGENOME.fastq.gz.md5 new file mode 100644 index 00000000..06f48291 --- /dev/null +++ b/tests/import_project_dir/52710.1.424012.TACACGCT-TACACGCT.filter-METAGENOME.fastq.gz.md5 @@ -0,0 +1 @@ +f947d0ae037a73d46294f51c73d06eec diff --git a/tests/import_project_dir/Ga0597026_bins_1.tar.gz.md5 b/tests/import_project_dir/Ga0597026_bins_1.tar.gz.md5 new file mode 100644 index 00000000..614859d7 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_bins_1.tar.gz.md5 @@ -0,0 +1 @@ +e1846a4cf021a3df444cad1b9e0b5937 diff --git a/tests/import_project_dir/Ga0597026_bins_2.tar.gz.md5 b/tests/import_project_dir/Ga0597026_bins_2.tar.gz.md5 new file mode 100644 index 00000000..e85181ca --- /dev/null +++ b/tests/import_project_dir/Ga0597026_bins_2.tar.gz.md5 @@ -0,0 +1 @@ +6a90cf1b28e8279bffa4f0a1ebb25868 diff --git 
a/tests/import_project_dir/Ga0597026_cath_funfam.gff.md5 b/tests/import_project_dir/Ga0597026_cath_funfam.gff.md5 new file mode 100644 index 00000000..71d2962f --- /dev/null +++ b/tests/import_project_dir/Ga0597026_cath_funfam.gff.md5 @@ -0,0 +1 @@ +c173ba6672ac81221a83d463c7b8ff17 diff --git a/tests/import_project_dir/Ga0597026_cds_proteins.faa.md5 b/tests/import_project_dir/Ga0597026_cds_proteins.faa.md5 new file mode 100644 index 00000000..578d4950 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_cds_proteins.faa.md5 @@ -0,0 +1 @@ +dc61b1a05767d7bb3cf9d9db7f0e60ac diff --git a/tests/import_project_dir/Ga0597026_cog.gff.md5 b/tests/import_project_dir/Ga0597026_cog.gff.md5 new file mode 100644 index 00000000..28007b98 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_cog.gff.md5 @@ -0,0 +1 @@ +ae08deb984a060091d55a9e3d7a6b3cd diff --git a/tests/import_project_dir/Ga0597026_contig_names_mapping.tsv.md5 b/tests/import_project_dir/Ga0597026_contig_names_mapping.tsv.md5 new file mode 100644 index 00000000..19923de2 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_contig_names_mapping.tsv.md5 @@ -0,0 +1 @@ +5e12e0f853b3c016f1dad0aa260b9193 diff --git a/tests/import_project_dir/Ga0597026_crt.gff.md5 b/tests/import_project_dir/Ga0597026_crt.gff.md5 new file mode 100644 index 00000000..1c41b68c --- /dev/null +++ b/tests/import_project_dir/Ga0597026_crt.gff.md5 @@ -0,0 +1 @@ +cce394d11b2beebd938b625e93111603 diff --git a/tests/import_project_dir/Ga0597026_ec.tsv.md5 b/tests/import_project_dir/Ga0597026_ec.tsv.md5 new file mode 100644 index 00000000..0b5b4147 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_ec.tsv.md5 @@ -0,0 +1 @@ +8b1929ae2abfdfeb3703fd61820970c0 diff --git a/tests/import_project_dir/Ga0597026_functional_annotation.gff.md5 b/tests/import_project_dir/Ga0597026_functional_annotation.gff.md5 new file mode 100644 index 00000000..ceda3e1c --- /dev/null +++ b/tests/import_project_dir/Ga0597026_functional_annotation.gff.md5 @@ -0,0 +1 @@ 
+1027b02dfc9b3b068fe458d5f96046ce diff --git a/tests/import_project_dir/Ga0597026_genemark.gff.md5 b/tests/import_project_dir/Ga0597026_genemark.gff.md5 new file mode 100644 index 00000000..97b710a2 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_genemark.gff.md5 @@ -0,0 +1 @@ +dfec8637c20ad11b1153ba03863d3c53 diff --git a/tests/import_project_dir/Ga0597026_genemark_proteins.faa.md5 b/tests/import_project_dir/Ga0597026_genemark_proteins.faa.md5 new file mode 100644 index 00000000..578d4950 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_genemark_proteins.faa.md5 @@ -0,0 +1 @@ +dc61b1a05767d7bb3cf9d9db7f0e60ac diff --git a/tests/import_project_dir/Ga0597026_ko.tsv.md5 b/tests/import_project_dir/Ga0597026_ko.tsv.md5 new file mode 100644 index 00000000..3b56609d --- /dev/null +++ b/tests/import_project_dir/Ga0597026_ko.tsv.md5 @@ -0,0 +1 @@ +75178a8f0e61a3d9ac0b8f2cc14af3aa diff --git a/tests/import_project_dir/Ga0597026_pfam.gff.md5 b/tests/import_project_dir/Ga0597026_pfam.gff.md5 new file mode 100644 index 00000000..5d85a889 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_pfam.gff.md5 @@ -0,0 +1 @@ +ced172fbd4e31649421f6ee852a7daaf diff --git a/tests/import_project_dir/Ga0597026_prodigal.gff.md5 b/tests/import_project_dir/Ga0597026_prodigal.gff.md5 new file mode 100644 index 00000000..5cbcc76c --- /dev/null +++ b/tests/import_project_dir/Ga0597026_prodigal.gff.md5 @@ -0,0 +1 @@ +4f09a37b4055963d70f2e89d92fa526f diff --git a/tests/import_project_dir/Ga0597026_prodigal_proteins.faa.md5 b/tests/import_project_dir/Ga0597026_prodigal_proteins.faa.md5 new file mode 100644 index 00000000..578d4950 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_prodigal_proteins.faa.md5 @@ -0,0 +1 @@ +dc61b1a05767d7bb3cf9d9db7f0e60ac diff --git a/tests/import_project_dir/Ga0597026_proteins.faa.md5 b/tests/import_project_dir/Ga0597026_proteins.faa.md5 new file mode 100644 index 00000000..4ed8b6b5 --- /dev/null +++ 
b/tests/import_project_dir/Ga0597026_proteins.faa.md5 @@ -0,0 +1 @@ +13f843362a5b31e61bbd7b6628ecd769 diff --git a/tests/import_project_dir/Ga0597026_smart.gff.md5 b/tests/import_project_dir/Ga0597026_smart.gff.md5 new file mode 100644 index 00000000..6f50c2f8 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_smart.gff.md5 @@ -0,0 +1 @@ +b3082908d12788ac0ccdef7f28e6ffad diff --git a/tests/import_project_dir/Ga0597026_structural_annotation.gff.md5 b/tests/import_project_dir/Ga0597026_structural_annotation.gff.md5 new file mode 100644 index 00000000..58673b31 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_structural_annotation.gff.md5 @@ -0,0 +1 @@ +74bbcaf26ce7c1a3c2cf214d13887c0e diff --git a/tests/import_project_dir/Ga0597026_supfam.gff.md5 b/tests/import_project_dir/Ga0597026_supfam.gff.md5 new file mode 100644 index 00000000..05bc0764 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_supfam.gff.md5 @@ -0,0 +1 @@ +3bbb0939d849c06449ca4127a94ab7a6 diff --git a/tests/import_project_dir/Ga0597026_tigrfam.gff.md5 b/tests/import_project_dir/Ga0597026_tigrfam.gff.md5 new file mode 100644 index 00000000..4aefaece --- /dev/null +++ b/tests/import_project_dir/Ga0597026_tigrfam.gff.md5 @@ -0,0 +1 @@ +43f89034c6e3b3e7f5d509027807ba8b diff --git a/tests/import_project_dir/Ga0597026_trna.gff.md5 b/tests/import_project_dir/Ga0597026_trna.gff.md5 new file mode 100644 index 00000000..442dd407 --- /dev/null +++ b/tests/import_project_dir/Ga0597026_trna.gff.md5 @@ -0,0 +1 @@ +a5352da073f4c99798c5af7cfa775db8 diff --git a/tests/import_project_dir/README.txt.md5 b/tests/import_project_dir/README.txt.md5 new file mode 100644 index 00000000..c65aec2c --- /dev/null +++ b/tests/import_project_dir/README.txt.md5 @@ -0,0 +1 @@ +f2a340c2e37593742818b71fccbe2ce2 diff --git a/tests/import_project_dir/nmdc:omprc-11-importT_minted_ids.json.md5 b/tests/import_project_dir/nmdc:omprc-11-importT_minted_ids.json.md5 new file mode 100644 index 00000000..849f8274 --- /dev/null +++ 
b/tests/import_project_dir/nmdc:omprc-11-importT_minted_ids.json.md5 @@ -0,0 +1 @@ +72d0a8ef10a47295b77ce2e8684f2e91 diff --git a/tests/import_project_dir/pairedMapped.sam.gz.md5 b/tests/import_project_dir/pairedMapped.sam.gz.md5 new file mode 100644 index 00000000..06f48291 --- /dev/null +++ b/tests/import_project_dir/pairedMapped.sam.gz.md5 @@ -0,0 +1 @@ +f947d0ae037a73d46294f51c73d06eec diff --git a/tests/import_project_dir/pairedMapped_sorted.bam.cov.md5 b/tests/import_project_dir/pairedMapped_sorted.bam.cov.md5 new file mode 100644 index 00000000..5b0b7022 --- /dev/null +++ b/tests/import_project_dir/pairedMapped_sorted.bam.cov.md5 @@ -0,0 +1 @@ +e0a11758eb4c30682838f4422bcfa27a diff --git a/tests/import_test.yaml b/tests/import_test.yaml index cd667d9b..1a7ed8d3 100644 --- a/tests/import_test.yaml +++ b/tests/import_test.yaml @@ -158,7 +158,7 @@ Data Objects: - data_object_type: Metagenome Raw Reads description: Metagenome Raw Reads for {id} name: Raw sequencer read data - import_suffix: .[A,C,G,T]+-[A,C,G,T]+.fastq.gz + import_suffix: \.[ACGT]+-[ACGT]+\.fastq\.gz$ nmdc_suffix: .fastq.gz input_to: [nmdc:ReadQcAnalysis] output_of: nmdc:NucleotideSequencing @@ -167,7 +167,7 @@ Data Objects: - data_object_type: CheckM Statistics description: CheckM for {id} name: CheckM statistics report - import_suffix: _checkm_qa.out + import_suffix: _checkm_qa\.out$ nmdc_suffix: _checkm_qa.out input_to: [] output_of: nmdc:MagsAnalysis @@ -176,7 +176,7 @@ Data Objects: - data_object_type: GTDBTK Bacterial Summary description: Bacterial Summary for {id} name: GTDBTK bacterial summary - import_suffix: _gtdbtk.bac122.summary.tsv + import_suffix: _gtdbtk\.bac122\.summary\.tsv$ nmdc_suffix: _gtdbtk.bac122.summary.tsv input_to: [] output_of: nmdc:MagsAnalysis @@ -185,7 +185,7 @@ Data Objects: - data_object_type: GTDBTK Archaeal Summary description: Archaeal Summary for {id} name: GTDBTK archaeal summary - import_suffix: _gtdbtk.ar122.summary.tsv + import_suffix: 
_gtdbtk\.ar122\.summary\.tsv$ nmdc_suffix: _gtdbtk.ar122.summary.tsv input_to: [] output_of: nmdc:MagsAnalysis @@ -203,7 +203,7 @@ Data Objects: - data_object_type: Contig Mapping File description: Contig mapping file for {id} name: Contig mappings between contigs and scaffolds - import_suffix: _contig_names_mapping.tsv + import_suffix: "_contig_names_mapping\\.tsv$" nmdc_suffix: _contig_names_mapping.tsv input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -212,7 +212,7 @@ Data Objects: - data_object_type: Structural Annotation GFF description: Structural Annotation for {id} name: GFF3 format file with structural annotations - import_suffix: _structural_annotation.gff + import_suffix: _structural_annotation\.gff$ nmdc_suffix: _structural_annotation.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -221,7 +221,7 @@ Data Objects: - data_object_type: Functional Annotation GFF description: Functional Annotation for {id} name: GFF3 format file with functional annotations - import_suffix: _functional_annotation.gff + import_suffix: _functional_annotation\.gff$ nmdc_suffix: _functional_annotation.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -230,7 +230,7 @@ Data Objects: - data_object_type: Annotation KEGG Orthology description: KEGG Orthology for {id} name: Tab delimited file for KO annotation - import_suffix: _ko.tsv + import_suffix: _ko\.tsv$ nmdc_suffix: _ko.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -239,7 +239,7 @@ Data Objects: - data_object_type: Annotation Enzyme Commission description: EC Annotations for {id} name: Tab delimited file for EC annotation - import_suffix: _ec.tsv + import_suffix: _ec\.tsv$ nmdc_suffix: _ec.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -248,7 +248,7 @@ Data Objects: - data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF description: COGs for {id} name: GFF3 format file with COGs - import_suffix: _cog.gff + import_suffix: 
_cog\.gff$ nmdc_suffix: _cog.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -257,7 +257,7 @@ Data Objects: - data_object_type: Pfam Annotation GFF description: Pfam Annotation for {id} name: GFF3 format file with Pfam - import_suffix: _pfam.gff + import_suffix: _pfam\.gff$ nmdc_suffix: _pfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -266,7 +266,7 @@ Data Objects: - data_object_type: TIGRFam Annotation GFF description: TIGRFam for {id} name: GFF3 format file with TIGRfam - import_suffix: _tigrfam.gff + import_suffix: _tigrfam\.gff$ nmdc_suffix: _tigrfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -275,7 +275,7 @@ Data Objects: - data_object_type: SMART Annotation GFF description: SMART Annotations for {id} name: GFF3 format file with SMART - import_suffix: _smart.gff + import_suffix: _smart\.gff$ nmdc_suffix: _smart.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -284,7 +284,7 @@ Data Objects: - data_object_type: SUPERFam Annotation GFF description: SUPERFam Annotations for {id} name: GFF3 format file with SUPERFam - import_suffix: _supfam.gff + import_suffix: _supfam\.gff$ nmdc_suffix: _supfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -293,7 +293,7 @@ Data Objects: - data_object_type: CATH FunFams (Functional Families) Annotation GFF description: CATH FunFams for {id} name: GFF3 format file with CATH FunFams - import_suffix: _cath_funfam.gff + import_suffix: _cath_funfam\.gff$ nmdc_suffix: _cath_funfam.gff input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -302,7 +302,7 @@ Data Objects: - data_object_type: CRT Annotation GFF description: CRT Annotations for {id} name: GFF3 format file with CRT - import_suffix: _crt.gff + import_suffix: _crt\.gff$ nmdc_suffix: _crt.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -311,7 +311,7 @@ Data Objects: - data_object_type: Genemark Annotation GFF 
description: Genemark Annotations for {id} name: GFF3 format file with Genemark - import_suffix: _genemark.gff + import_suffix: _genemark\.gff$ nmdc_suffix: _genemark.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -320,7 +320,7 @@ Data Objects: - data_object_type: Prodigal Annotation GFF description: Prodigal Annotations {id} name: GFF3 format file with Prodigal - import_suffix: _prodigal.gff + import_suffix: _prodigal\.gff$ nmdc_suffix: _prodigal.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -329,7 +329,7 @@ Data Objects: - data_object_type: TRNA Annotation GFF description: TRNA Annotations {id} name: GFF3 format file with TRNA - import_suffix: _trna.gff + import_suffix: _trna\.gff$ nmdc_suffix: _trna.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -338,7 +338,7 @@ Data Objects: - data_object_type: RFAM Annotation GFF description: RFAM Annotations for {id} name: GFF3 format file with RFAM - import_suffix: _rfam.gff + import_suffix: _rfam\.gff$ nmdc_suffix: _rfam.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -347,7 +347,7 @@ Data Objects: - data_object_type: KO_EC Annotation GFF description: KO_EC Annotations for {id} name: GFF3 format file with KO_EC - import_suffix: _ko_ec.gff + import_suffix: _ko_ec\.gff$ nmdc_suffix: _ko_ec.gff input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -356,7 +356,7 @@ Data Objects: - data_object_type: Product Names description: Product names for {id} name: Product names file - import_suffix: _product_names.tsv + import_suffix: _product_names\.tsv$ nmdc_suffix: _product_names.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -365,7 +365,7 @@ Data Objects: - data_object_type: Gene Phylogeny tsv description: Gene Phylogeny for {id} name: Gene Phylogeny file - import_suffix: _gene_phylogeny.tsv + import_suffix: _gene_phylogeny\.tsv$ nmdc_suffix: _gene_phylogeny.tsv input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAnnotation @@ -374,7 +374,7 @@ Data Objects: - 
data_object_type: Crispr Terms description: Crispr Terms for {id} name: Crispr Terms - import_suffix: _crt.crisprs + import_suffix: _crt\.crisprs$ nmdc_suffix: _crt.crisprs input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -383,7 +383,7 @@ Data Objects: - data_object_type: Annotation Statistics description: Annotation Stats for {id} name: Annotation statistics report - import_suffix: _stats.tsv + import_suffix: _stats\.tsv$ nmdc_suffix: _stats.tsv input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -392,7 +392,7 @@ Data Objects: - data_object_type: Annotation Info File description: Annotation Info File for {id} name: File containing annotation info - import_suffix: _imgap.info + import_suffix: _imgap\.info$ nmdc_suffix: _imgap.info input_to: [] output_of: nmdc:MetagenomeAnnotation @@ -401,25 +401,25 @@ Data Objects: - data_object_type: Filtered Sequencing Reads description: Reads QC for {id} name: Reads QC result fastq (clean data) - import_suffix: filter-METAGENOME.fastq.gz + import_suffix: filter-METAGENOME\.fastq\.gz$ nmdc_suffix: _filtered.fastq.gz input_to: [nmdc:ReadBasedTaxonomyAnalysis,nmdc:MetagenomeAssembly] output_of: nmdc:ReadQcAnalysis multiple: false action: rename - - data_object_type: QC Statistics - description: Reads QC summary for {id} + - data_object_type: QC Statistics + description: Reads QC summary for {id} name: Reads QC summary statistics - import_suffix: .filtered-report.txt + import_suffix: \.filtered-report\.txt$ nmdc_suffix: _filterStats.txt input_to: [] output_of: nmdc:ReadQcAnalysis multiple: false action: rename - - data_object_type: Read Filtering Info File + - data_object_type: Read Filtering Info File description: Read Filtering Info File for {id} name: File containing read filtering information - import_suffix: _readsQC.info + import_suffix: _readsQC\.info$ nmdc_suffix: _readsQC.info input_to: [] output_of: nmdc:ReadQcAnalysis @@ -428,7 +428,7 @@ Data Objects: - data_object_type: Assembly Contigs description: Assembly contigs 
for {id} name: Final assembly contigs fasta - import_suffix: assembly.contigs.fasta + import_suffix: assembly\.contigs\.fasta$ nmdc_suffix: _contigs.fna input_to: [nmdc:MetagenomeAnnotation,nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAssembly @@ -437,7 +437,7 @@ Data Objects: - data_object_type: Assembly Scaffolds description: Assembly scaffolds for {id} name: Final assembly scaffolds fasta - import_suffix: _scaffolds.fna + import_suffix: _scaffolds\.fna$ nmdc_suffix: _scaffolds.fna input_to: [] output_of: nmdc:MetagenomeAssembly @@ -446,7 +446,7 @@ Data Objects: - data_object_type: Assembly Info File description: Assembly info file for {id} name: File containing assembly information - import_suffix: README.txt + import_suffix: README\.txt$ nmdc_suffix: _metaAsm.info input_to: [] output_of: nmdc:MetagenomeAssembly @@ -455,7 +455,7 @@ Data Objects: - data_object_type: Assembly Coverage Stats description: Coverage Stats for {id} name: Assembled contigs coverage information - import_suffix: pairedMapped_sorted.bam.cov + import_suffix: pairedMapped_sorted\.bam\.cov$ nmdc_suffix: _covstats.txt input_to: [] output_of: nmdc:MetagenomeAssembly @@ -464,7 +464,7 @@ Data Objects: - data_object_type: Assembly AGP description: AGP for {id} name: An AGP format file that describes the assembly - import_suffix: _assembly.agp + import_suffix: _assembly\.agp$ nmdc_suffix: _assembly.agp input_to: [] output_of: nmdc:MetagenomeAssembly @@ -473,16 +473,25 @@ Data Objects: - data_object_type: Assembly Coverage BAM description: Sorted Bam for {id} name: Sorted bam file of reads mapping back to the final assembly - import_suffix: pairedMapped.sam.gz + import_suffix: pairedMapped\.sam\.gz$ nmdc_suffix: _pairedMapped_sorted.sam.gz input_to: [nmdc:MagsAnalysis] output_of: nmdc:MetagenomeAssembly multiple: false action: rename + - data_object_type: Error Corrected Reads + description: Error corrected reads for {id} + name: bbcms error corrected reads + import_suffix: input\.corr\.fastq\.gz$ +
nmdc_suffix: _input.corr.fastq.gz + input_to: [] + output_of: nmdc:MetagenomeAssembly + multiple: false + action: rename - data_object_type: GOTTCHA2 Report Full description: GOTTCHA2 Full Report for {id} name: GOTTCHA2 report file - import_suffix: _gottcha2_full.tsv + import_suffix: _gottcha2_full\.tsv$ nmdc_suffix: _gottcha2_full.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -491,7 +500,7 @@ Data Objects: - data_object_type: GOTTCHA2 Classification Report description: GOTTCHA2 Classification for {id} name: GOTTCHA2 classification report file - import_suffix: _gottcha2_classification.tsv + import_suffix: _gottcha2_classification\.tsv$ nmdc_suffix: _gottcha2_classification.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -500,7 +509,7 @@ Data Objects: - data_object_type: GOTTCHA2 Krona Plot description: GOTTCHA2 Krona for {id} name: GOTTCHA2 krona plot HTML file - import_suffix: _gottcha2_krona.html + import_suffix: _gottcha2_krona\.html$ nmdc_suffix: _gottcha2_krona.html input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -509,7 +518,7 @@ Data Objects: - data_object_type: Centrifuge Taxonomic Classification description: Centrifuge Report for {id} name: Centrifuge output read classification file - import_suffix: _centrifuge_classification.tsv + import_suffix: _centrifuge_classification\.tsv$ nmdc_suffix: _centrifuge_classification.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -518,7 +527,7 @@ Data Objects: - data_object_type: Centrifuge output report file description: Centrifuge output report file for {id} name: Centrifuge Classification Report - import_suffix: _centrifuge_report.tsv + import_suffix: _centrifuge_report\.tsv$ nmdc_suffix: _centrifuge_report.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -527,7 +536,7 @@ Data Objects: - data_object_type: Centrifuge Krona Plot description: Centrifuge Krona for {id} name: Centrifuge krona plot HTML file - import_suffix: _centrifuge_krona.html + 
import_suffix: _centrifuge_krona\.html$ nmdc_suffix: _centrifuge_krona.html input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -536,7 +545,7 @@ Data Objects: - data_object_type: Kraken2 Classification Report description: Kraken2 report for {id} name: Kraken2 outpur report file - import_suffix: _kraken2_report.tsv + import_suffix: _kraken2_report\.tsv$ nmdc_suffix: _kraken2_report.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -545,7 +554,7 @@ Data Objects: - data_object_type: Kraken2 Taxonomic Classification description: Kraken2 classification for {id} name: Kraken2 output read classification file - import_suffix: _kraken2_classification.tsv + import_suffix: _kraken2_classification\.tsv$ nmdc_suffix: _kraken2_classification.tsv input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -554,7 +563,7 @@ Data Objects: - data_object_type: Kraken2 Krona Plot description: Kraken2 Krona plot for {id} name: Kraken2 Krona plot HTML file - import_suffix: _kraken2_krona.html + import_suffix: _kraken2_krona\.html$ nmdc_suffix: _kraken2_krona.html input_to: [] output_of: nmdc:ReadBasedTaxonomyAnalysis @@ -564,13 +573,14 @@ Data Objects: - data_object_type: Metagenome HQMQ Bins Compression File description: Metagenome Bins for {id} name: Metagenome bin tarfiles archive - import_suffix: _[0-9]+.tar.gz + import_suffix: _[0-9]+\.tar\.gz$ nmdc_suffix: _hqmq_bin.zip input_to: [] output_of: nmdc:MagsAnalysis multiple: true action: zip + Workflow Metadata: Execution Resource: JGI Source URL: https://data.microbiomedata.org/data diff --git a/tests/test_import_mapper.py b/tests/test_import_mapper.py index d0455560..50526ed3 100644 --- a/tests/test_import_mapper.py +++ b/tests/test_import_mapper.py @@ -31,9 +31,10 @@ def mock_minted_ids(): def test_update_do_mappings_from_import_files(import_mapper_instance): import_mapper_instance.update_do_mappings_from_import_files() - for fm_all in import_mapper_instance.mappings: - print(fm_all, "\n\n") assert 
len(import_mapper_instance.mappings) == 22 + for fm_all in import_mapper_instance.mappings: + print("Import File:", fm_all.import_file) + assert not fm_all.import_file.endswith(".md5"), f"Unexpected .md5 file found: {fm_all.import_file}" def test_update_do_mapping_from_import_files_correct_protein_file_import(import_mapper_instance):