Skip to content

Commit 5f5229e

Browse files
authored
Merge pull request #426 from microbiomedata/421-import-automation-in-some-cases-picks-up-the-md5-sum-file-instead-of-the-fastq-file
Fixing import to exclude import of .md5 files. This fixes #421.
2 parents 20f63b5 + 5883a28 commit 5f5229e

30 files changed

+170
-131
lines changed

configs/import-mt.yaml

+37-37
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ Data Objects:
128128
- data_object_type: Metagenome Raw Reads
129129
description: Metagenome Raw Reads for {id}
130130
name: Raw sequencer read data
131-
import_suffix: .[A-Z]+-[A-Z]+.fastq.gz
131+
import_suffix: \.[ACGT]+-[ACGT]+\.fastq\.gz$
132132
nmdc_suffix: .fastq.gz
133133
input_to: [nmdc:ReadQcAnalysis]
134134
output_of: nmdc:NucleotideSequencing
@@ -137,7 +137,7 @@ Data Objects:
137137
- data_object_type: Annotation Amino Acid FASTA
138138
description: FASTA Amino Acid File for {id}
139139
name: FASTA amino acid file for annotated proteins
140-
import_suffix: _proteins.faa
140+
import_suffix: "^(?!.*_(cds|genemark|prodigal)_proteins\\.faa$).*proteins\\.faa$"
141141
nmdc_suffix: _proteins.faa
142142
input_to: []
143143
output_of: nmdc:MetatranscriptomeAnnotation
@@ -146,7 +146,7 @@ Data Objects:
146146
- data_object_type: Contig Mapping File
147147
description: Contig mapping file for {id}
148148
name: Contig mappings between old and new contig names
149-
import_suffix: _contig_names_mapping.tsv
149+
import_suffix: "_contig_names_mapping\\.tsv$"
150150
nmdc_suffix: _contig_names_mapping.tsv
151151
input_to: []
152152
output_of: nmdc:MetatranscriptomeAnnotation
@@ -155,7 +155,7 @@ Data Objects:
155155
- data_object_type: Structural Annotation GFF
156156
description: Structural Annotation for {id}
157157
name: GFF3 format file with structural annotations
158-
import_suffix: _structural_annotation.gff
158+
import_suffix: _structural_annotation\.gff$
159159
nmdc_suffix: _structural_annotation.gff
160160
input_to: []
161161
output_of: nmdc:MetatranscriptomeAnnotation
@@ -164,7 +164,7 @@ Data Objects:
164164
- data_object_type: Functional Annotation GFF
165165
description: Functional Annotation for {id}
166166
name: GFF3 format file with functional annotations
167-
import_suffix: _functional_annotation.gff
167+
import_suffix: _functional_annotation\.gff$
168168
nmdc_suffix: _functional_annotation.gff
169169
input_to: [nmdc:MetatranscriptomeExpressionAnalysis]
170170
output_of: nmdc:MetatranscriptomeAnnotation
@@ -173,7 +173,7 @@ Data Objects:
173173
- data_object_type: Annotation KEGG Orthology
174174
description: KEGG Orthology for {id}
175175
name: Tab delimited file for KO annotation
176-
import_suffix: _ko.tsv
176+
import_suffix: _ko\.tsv$
177177
nmdc_suffix: _ko.tsv
178178
input_to: []
179179
output_of: nmdc:MetatranscriptomeAnnotation
@@ -182,7 +182,7 @@ Data Objects:
182182
- data_object_type: Annotation Enzyme Commission
183183
description: EC Annotations for {id}
184184
name: Tab delimited file for EC annotation
185-
import_suffix: _ec.tsv
185+
import_suffix: _ec\.tsv$
186186
nmdc_suffix: _ec.tsv
187187
input_to: []
188188
output_of: nmdc:MetatranscriptomeAnnotation
@@ -191,15 +191,15 @@ Data Objects:
191191
- data_object_type: Scaffold Lineage tsv
192192
description: Scaffold Lineage tsv for {id}
193193
name: Phylogeny at the scaffold level
194-
import_suffix: _scaffold_lineage.tsv
194+
import_suffix: _scaffold_lineage\.tsv$
195195
nmdc_suffix: _scaffold_lineage.tsv
196196
input_to: []
197197
output_of: nmdc:MetatranscriptomeAnnotation
198198
multiple: false
199199
- data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
200200
description: COGs for {id}
201201
name: GFF3 format file with COGs
202-
import_suffix: _cog.gff
202+
import_suffix: _cog\.gff$
203203
nmdc_suffix: _cog.gff
204204
input_to: []
205205
output_of: nmdc:MetatranscriptomeAnnotation
@@ -208,7 +208,7 @@ Data Objects:
208208
- data_object_type: Pfam Annotation GFF
209209
description: Pfam Annotation for {id}
210210
name: GFF3 format file with Pfam
211-
import_suffix: _pfam.gff
211+
import_suffix: _pfam\.gff$
212212
nmdc_suffix: _pfam.gff
213213
input_to: []
214214
output_of: nmdc:MetatranscriptomeAnnotation
@@ -217,7 +217,7 @@ Data Objects:
217217
- data_object_type: TIGRFam Annotation GFF
218218
description: TIGRFam for {id}
219219
name: GFF3 format file with TIGRfam
220-
import_suffix: _tigrfam.gff
220+
import_suffix: _tigrfam\.gff$
221221
nmdc_suffix: _tigrfam.gff
222222
input_to: []
223223
output_of: nmdc:MetatranscriptomeAnnotation
@@ -226,7 +226,7 @@ Data Objects:
226226
- data_object_type: SMART Annotation GFF
227227
description: SMART Annotations for {id}
228228
name: GFF3 format file with SMART
229-
import_suffix: _smart.gff
229+
import_suffix: _smart\.gff$
230230
nmdc_suffix: _smart.gff
231231
input_to: []
232232
output_of: nmdc:MetatranscriptomeAnnotation
@@ -235,7 +235,7 @@ Data Objects:
235235
- data_object_type: SUPERFam Annotation GFF
236236
description: SUPERFam Annotations for {id}
237237
name: GFF3 format file with SUPERFam
238-
import_suffix: _supfam.gff
238+
import_suffix: _supfam\.gff$
239239
nmdc_suffix: _supfam.gff
240240
input_to: []
241241
output_of: nmdc:MetatranscriptomeAnnotation
@@ -244,7 +244,7 @@ Data Objects:
244244
- data_object_type: CATH FunFams (Functional Families) Annotation GFF
245245
description: CATH FunFams for {id}
246246
name: GFF3 format file with CATH FunFams
247-
import_suffix: _cath_funfam.gff
247+
import_suffix: _cath_funfam\.gff$
248248
nmdc_suffix: _cath_funfam.gff
249249
input_to: []
250250
output_of: nmdc:MetatranscriptomeAnnotation
@@ -253,7 +253,7 @@ Data Objects:
253253
- data_object_type: CRT Annotation GFF
254254
description: CRT Annotations for {id}
255255
name: GFF3 format file with CRT
256-
import_suffix: _crt.gff
256+
import_suffix: _crt\.gff$
257257
nmdc_suffix: _crt.gff
258258
input_to: []
259259
output_of: nmdc:MetatranscriptomeAnnotation
@@ -262,7 +262,7 @@ Data Objects:
262262
- data_object_type: Genemark Annotation GFF
263263
description: Genemark Annotations for {id}
264264
name: GFF3 format file with Genemark
265-
import_suffix: _genemark.gff
265+
import_suffix: _genemark\.gff$
266266
nmdc_suffix: _genemark.gff
267267
input_to: []
268268
output_of: nmdc:MetatranscriptomeAnnotation
@@ -271,7 +271,7 @@ Data Objects:
271271
- data_object_type: Prodigal Annotation GFF
272272
description: Prodigal Annotations {id}
273273
name: GFF3 format file with Prodigal
274-
import_suffix: _prodigal.gff
274+
import_suffix: _prodigal\.gff$
275275
nmdc_suffix: _prodigal.gff
276276
input_to: []
277277
output_of: nmdc:MetatranscriptomeAnnotation
@@ -280,7 +280,7 @@ Data Objects:
280280
- data_object_type: TRNA Annotation GFF
281281
description: TRNA Annotations {id}
282282
name: GFF3 format file with TRNA
283-
import_suffix: _trna.gff
283+
import_suffix: _trna\.gff$
284284
nmdc_suffix: _trna.gff
285285
input_to: []
286286
output_of: nmdc:MetatranscriptomeAnnotation
@@ -289,7 +289,7 @@ Data Objects:
289289
- data_object_type: RFAM Annotation GFF
290290
description: RFAM Annotations for {id}
291291
name: GFF3 format file with RFAM
292-
import_suffix: _rfam.gff
292+
import_suffix: _rfam\.gff$
293293
nmdc_suffix: _rfam.gff
294294
input_to: []
295295
output_of: nmdc:MetatranscriptomeAnnotation
@@ -298,7 +298,7 @@ Data Objects:
298298
- data_object_type: KO_EC Annotation GFF
299299
description: KO_EC Annotations for {id}
300300
name: GFF3 format file with KO_EC
301-
import_suffix: _ko_ec.gff
301+
import_suffix: _ko_ec\.gff$
302302
nmdc_suffix: _ko_ec.gff
303303
input_to: []
304304
output_of: nmdc:MetatranscriptomeAnnotation
@@ -307,7 +307,7 @@ Data Objects:
307307
- data_object_type: Product Names
308308
description: Product names for {id}
309309
name: Product names file
310-
import_suffix: _product_names.tsv
310+
import_suffix: _product_names\.tsv$
311311
nmdc_suffix: _product_names.tsv
312312
input_to: []
313313
output_of: nmdc:MetatranscriptomeAnnotation
@@ -316,7 +316,7 @@ Data Objects:
316316
- data_object_type: Gene Phylogeny tsv
317317
description: Gene Phylogeny for {id}
318318
name: Gene Phylogeny file
319-
import_suffix: _gene_phylogeny.tsv
319+
import_suffix: _gene_phylogeny\.tsv$
320320
nmdc_suffix: _gene_phylogeny.tsv
321321
input_to: []
322322
output_of: nmdc:MetatranscriptomeAnnotation
@@ -325,7 +325,7 @@ Data Objects:
325325
- data_object_type: Crispr Terms
326326
description: Crispr Terms for {id}
327327
name: Crispr Terms
328-
import_suffix: _crt.crisprs
328+
import_suffix: _crt\.crisprs$
329329
nmdc_suffix: _crt.crisprs
330330
input_to: []
331331
output_of: nmdc:MetatranscriptomeAnnotation
@@ -334,7 +334,7 @@ Data Objects:
334334
- data_object_type: Annotation Statistics
335335
description: Annotation Stats for {id}
336336
name: Annotation statistics report
337-
import_suffix: _stats.tsv
337+
import_suffix: _stats\.tsv$
338338
nmdc_suffix: _stats.tsv
339339
input_to: []
340340
output_of: nmdc:MetatranscriptomeAnnotation
@@ -343,23 +343,23 @@ Data Objects:
343343
- data_object_type: Annotation Info File
344344
description: Annotation Info File for {id}
345345
name: File containing annotation info
346-
import_suffix: _imgap.info
346+
import_suffix: _imgap\.info$
347347
nmdc_suffix: _imgap.info
348348
input_to: []
349349
output_of: nmdc:MetatranscriptomeAnnotation
350350
multiple: false
351351
action: rename
352352
- data_object_type: Assembly Contigs
353353
description: Assembly contigs (remapped) for {id}
354-
import_suffix: _contigs.fna
354+
import_suffix: _contigs\.fna$
355355
nmdc_suffix: _renamed_contigs.fna
356356
input_to: []
357357
output_of: nmdc:MetatranscriptomeAnnotation
358358
multiple: false
359359
- data_object_type: Filtered Sequencing Reads
360360
description: Reads QC for {id}
361361
name: Reads QC result fastq (clean data)
362-
import_suffix: filter-MTF.fastq.gz
362+
import_suffix: filter-MTF\.fastq\.gz$
363363
nmdc_suffix: _filtered.fastq.gz
364364
input_to: [nmdc:MetatranscriptomeAssembly]
365365
output_of: nmdc:ReadQcAnalysis
@@ -368,7 +368,7 @@ Data Objects:
368368
- data_object_type: rRNA Filtered Sequencing Reads
369369
description: Reads QC rRNA reads file for {id}
370370
name: Reads QC rRNA reads result fastq (clean data)
371-
import_suffix: .rRNA.fastq.gz
371+
import_suffix: \.rRNA\.fastq\.gz$
372372
nmdc_suffix: _rRNA.fastq.gz
373373
input_to: []
374374
output_of: nmdc:ReadQcAnalysis
@@ -377,7 +377,7 @@ Data Objects:
377377
- data_object_type: QC Statistics
378378
description: Reads QC summary for {id}
379379
name: Reads QC summary statistics
380-
import_suffix: .filtered-report.txt
380+
import_suffix: \.filtered-report\.txt$
381381
nmdc_suffix: _filterStats.txt
382382
input_to: []
383383
output_of: nmdc:ReadQcAnalysis
@@ -386,7 +386,7 @@ Data Objects:
386386
- data_object_type: Read Filtering Info File
387387
description: Read Filtering Info File for {id}
388388
name: File containing read filtering information
389-
import_suffix: .filter_cmd-MTF.sh
389+
import_suffix: \.filter_cmd-MTF\.sh$
390390
nmdc_suffix: _readsQC.info
391391
input_to: []
392392
output_of: nmdc:ReadQcAnalysis
@@ -395,7 +395,7 @@ Data Objects:
395395
- data_object_type: Assembly Contigs
396396
description: Assembly contigs for {id}
397397
name: Final assembly contigs fasta
398-
import_suffix: assembly.contigs.fasta
398+
import_suffix: assembly\.contigs\.fasta$
399399
nmdc_suffix: _contigs.fna
400400
input_to: [nmdc:MetatranscriptomeAnnotation]
401401
output_of: nmdc:MetatranscriptomeAssembly
@@ -404,7 +404,7 @@ Data Objects:
404404
- data_object_type: Assembly Info File
405405
description: Assembly info file for {id}
406406
name: File containing assembly information
407-
import_suffix: README.txt
407+
import_suffix: README\.txt$
408408
nmdc_suffix: _metaAsm.info
409409
input_to: []
410410
output_of: nmdc:MetatranscriptomeAssembly
@@ -414,15 +414,15 @@ Data Objects:
414414
description: Coverage Stats for {id}
415415
name: Assembled contigs coverage information
416-
import_suffix: pairedMapped_sorted.bam.cov
416+
import_suffix: pairedMapped_sorted\.bam\.cov$
417417
nmdc_suffix: _covstats.txt
418418
input_to: []
419419
output_of: nmdc:MetatranscriptomeAssembly
420420
multiple: false
421421
action: rename
422422
- data_object_type: Assembly Coverage BAM
423423
description: Sorted Bam for {id}
424424
name: Sorted bam file of reads mapping back to the final assembly
425-
import_suffix: pairedMapped.bam.gz
425+
import_suffix: pairedMapped\.bam\.gz$
426426
nmdc_suffix: _pairedMapped_sorted.bam.gz
427427
input_to: [nmdc:MetatranscriptomeExpressionAnalysis]
428428
output_of: nmdc:MetatranscriptomeAssembly
@@ -431,7 +431,7 @@ Data Objects:
431431
- data_object_type: BAI File
432432
description: Alignment index file for {id}
433433
name: BAM index file
434-
import_suffix: _pairedMapped_sorted.bam.bai
434+
import_suffix: _pairedMapped_sorted\.bam\.bai$
435435
nmdc_suffix: _pairedMapped_sorted.bam.bai
436436
input_to: []
437437
output_of: nmdc:MetatranscriptomeAssembly
@@ -440,7 +440,7 @@ Data Objects:
440440
- data_object_type: Metatranscriptome Expression
441441
description: Expression counts for {id}
442442
name: Expression counts file
443-
import_suffix: .rnaseq_gea.txt
443+
import_suffix: \.rnaseq_gea\.txt$
444444
nmdc_suffix: _rnaseq_gea.txt
445445
input_to: []
446446
output_of: nmdc:MetatranscriptomeExpressionAnalysis
@@ -449,7 +449,7 @@ Data Objects:
449449
- data_object_type: Metatranscriptome Expression Intergenic
450450
description: Expression intergenic counts for {id}
451451
name: Expression intergenic counts file
452-
import_suffix: .rnaseq_gea.intergenic.txt
452+
import_suffix: \.rnaseq_gea\.intergenic\.txt$
453453
nmdc_suffix: _rnaseq_gea.intergenic.txt
454454
input_to: []
455455
output_of: nmdc:MetatranscriptomeExpressionAnalysis

0 commit comments

Comments
 (0)