diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6835818d..723d4a83 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [#666](https://github.com/nf-core/ampliseq/pull/666) - Added Greengenes2 database, version 2022.10, support for QIIME2 taxonomic classification.
-- [#667](https://github.com/nf-core/ampliseq/pull/667) - Added `--qiime_ref_tax_custom` to permit custom reference database for QIIME2 taxonomic classification
+- [#667](https://github.com/nf-core/ampliseq/pull/667),[#691](https://github.com/nf-core/ampliseq/pull/691) - Added `--qiime_ref_tax_custom` to permit custom reference database for QIIME2 taxonomic classification
 - [#674](https://github.com/nf-core/ampliseq/pull/674) - Add PhytoRef database for DADA2 taxonomy assignment using `--dada_ref_taxonomy phytoref`
 - [#675](https://github.com/nf-core/ampliseq/pull/675) - Add the Zehr lab nifH database for DADA2 taxonomy assignment using `--dada_ref_taxonomy zehr-nifh`
 - [#681](https://github.com/nf-core/ampliseq/pull/681) - For DADA2, with `--dada_addspecies_allowmultiple` multiple exact species matches are reported and with `--dada_taxonomy_rc` reverse-complement matches are also considered in taxonomic classification
@@ -19,7 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Fixed`
 
-- [#672](https://github.com/nf-core/ampliseq/pull/672),[#688](https://github.com/nf-core/ampliseq/pull/688) - Updated documentation
+- [#672](https://github.com/nf-core/ampliseq/pull/672),[#688](https://github.com/nf-core/ampliseq/pull/688),[#691](https://github.com/nf-core/ampliseq/pull/691) - Updated documentation
 - [#676](https://github.com/nf-core/ampliseq/pull/676) - Phyloseq sometimes only produced one of multiple output files
 - [#679](https://github.com/nf-core/ampliseq/pull/679) - Prevent masking low complexity regions by VSEARCH with lower case letters
 - [#680](https://github.com/nf-core/ampliseq/pull/680),[#673](https://github.com/nf-core/ampliseq/pull/673) - Improved pipeline summary report & error messages
diff --git a/conf/test_qiimecustom.config b/conf/test_qiimecustom.config
index 2fc9cb73..dd02eb4e 100644
--- a/conf/test_qiimecustom.config
+++ b/conf/test_qiimecustom.config
@@ -25,7 +25,7 @@ params {
     input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet.tsv"
 
     // Custom reference taxonomy
-    qiime_ref_tax_custom = "https://raw.githubusercontent.com/MatthewJM96/test-datasets/ampliseq/testdata/db/85_greengenes.fna.gz,https://raw.githubusercontent.com/MatthewJM96/test-datasets/ampliseq/testdata/db/85_greengenes.tax.gz"
+    qiime_ref_tax_custom = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/85_greengenes.fna.gz,https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/85_greengenes.tax.gz"
 
     // Skip downstream analysis with QIIME2
     skip_qiime_downstream = true
diff --git a/conf/test_reftaxcustom.config b/conf/test_reftaxcustom.config
index 1afe1c2d..40408bfb 100644
--- a/conf/test_reftaxcustom.config
+++ b/conf/test_reftaxcustom.config
@@ -30,7 +30,7 @@ params {
     dada_assign_taxlevels = "Kingdom,Phylum,Class,Order,Family,Genus"
     kraken2_ref_tax_custom = "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz"
    kraken2_assign_taxlevels = "D,P,C,O"
-    qiime_ref_tax_custom = "https://raw.githubusercontent.com/MatthewJM96/test-datasets/ampliseq/testdata/db/85_greengenes.tar.gz"
+    qiime_ref_tax_custom = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/testdata/85_greengenes.tar.gz"
 
     // Skip downstream analysis with QIIME2
     skip_qiime_downstream = true
diff --git a/modules.json b/modules.json
index 6969dc1d..595c024f 100644
--- a/modules.json
+++ b/modules.json
@@ -81,6 +81,11 @@
                     "git_sha": "4ab13872435962dadc239979554d13709e20bf29",
                     "installed_by": ["modules"]
                 },
+                "pigz/uncompress": {
+                    "branch": "master",
+                    "git_sha": "4ef7becf6a2bbc8df466885d10b4051d1f318a6a",
+                    "installed_by": ["modules"]
+                },
                 "untar": {
                     "branch": "master",
                     "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8",
diff --git a/modules/local/gzip_decompress.nf b/modules/local/gzip_decompress.nf
deleted file mode 100644
index c6ea37a5..00000000
--- a/modules/local/gzip_decompress.nf
+++ /dev/null
@@ -1,32 +0,0 @@
-process GZIP_DECOMPRESS {
-    tag "$file"
-    label 'process_single'
-
-    conda "conda-forge::sed=4.7 conda-forge::gzip=1.13"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
-        'nf-core/ubuntu:20.04' }"
-
-    input:
-    path(file)
-
-    output:
-    path("$outfile"), emit: ungzip
-    path "versions.yml", emit: versions
-
-    when:
-    task.ext.when == null || task.ext.when
-
-    script:
-    def args = task.ext.args ?: ''
-    outfile = task.ext.outfile ?: file.baseName.toString().replaceFirst(/\.gz$/, "")
-
-    """
-    gzip $args -c -d $file > $outfile
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        gzip: \$(echo \$(gzip --version 2>&1) | sed 's/gzip //; s/ Copyright.*\$//')
-    END_VERSIONS
-    """
-}
diff --git a/modules/nf-core/pigz/uncompress/main.nf b/modules/nf-core/pigz/uncompress/main.nf
new file mode 100644
index 00000000..9383c146
--- /dev/null
+++ b/modules/nf-core/pigz/uncompress/main.nf
@@ -0,0 +1,48 @@
+process PIGZ_UNCOMPRESS {
+    label 'process_low'
+    //stageInMode 'copy' // this directive can be set in case the original input should be kept
+
+    conda "conda-forge::pigz"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pigz:2.8':
+        'biocontainers/pigz:2.8' }"
+
+    input:
+    path zip
+
+    output:
+    path "${uncompressed_filename}" , emit: file
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    uncompressed_filename = zip.toString() - '.gz'
+    // calling pigz -f to make it follow symlinks
+    """
+    unpigz \\
+        -p $task.cpus \\
+        -fk \\
+        $args \\
+        ${zip}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pigz: \$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ))
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    uncompressed_filename = zip.toString() - '.gz'
+    """
+    touch ${zip.dropRight(3)}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pigz: \$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\w*//' ))
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/pigz/uncompress/meta.yml b/modules/nf-core/pigz/uncompress/meta.yml
new file mode 100644
index 00000000..574a004b
--- /dev/null
+++ b/modules/nf-core/pigz/uncompress/meta.yml
@@ -0,0 +1,32 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
+name: "pigz_uncompress"
+description: write your description here
+keywords:
+  - uncompress
+  - gzip
+  - parallelized
+tools:
+  - "pigz":
+      description: "Parallel implementation of the gzip algorithm."
+      homepage: "https://zlib.net/pigz/"
+      documentation: "https://zlib.net/pigz/pigz.pdf"
+
+input:
+  - zip:
+      type: file
+      description: Gzipped file
+      pattern: "*.{gzip}"
+
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - file:
+      type: file
+      description: File to compress
+      pattern: "*"
+
+authors:
+  - "@lrauschning"
diff --git a/modules/nf-core/pigz/uncompress/tests/main.nf.test b/modules/nf-core/pigz/uncompress/tests/main.nf.test
new file mode 100644
index 00000000..57955658
--- /dev/null
+++ b/modules/nf-core/pigz/uncompress/tests/main.nf.test
@@ -0,0 +1,33 @@
+nextflow_process {
+
+    name "Test Process PIGZ_UNCOMPRESS"
+    script "modules/nf-core/pigz/uncompress/main.nf"
+    process "PIGZ_UNCOMPRESS"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "pigz"
+    tag "pigz/uncompress"
+
+    test("Should run without failures") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [
+                    file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+
+    }
+
+}
diff --git a/modules/nf-core/pigz/uncompress/tests/main.nf.test.snap b/modules/nf-core/pigz/uncompress/tests/main.nf.test.snap
new file mode 100644
index 00000000..038cf2d7
--- /dev/null
+++ b/modules/nf-core/pigz/uncompress/tests/main.nf.test.snap
@@ -0,0 +1,21 @@
+{
+    "Should run without failures": {
+        "content": [
+            {
+                "0": [
+                    "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec"
+                ],
+                "1": [
+                    "versions.yml:md5,a2d5ce72baa8b303f25afb9cf094f683"
+                ],
+                "file": [
+                    "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec"
+                ],
+                "versions": [
+                    "versions.yml:md5,a2d5ce72baa8b303f25afb9cf094f683"
+                ]
+            }
+        ],
+        "timestamp": "2023-10-18T12:37:21.987858"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/pigz/uncompress/tests/tags.yml b/modules/nf-core/pigz/uncompress/tests/tags.yml
new file mode 100644
index 00000000..6719a90a
--- /dev/null
+++ b/modules/nf-core/pigz/uncompress/tests/tags.yml
@@ -0,0 +1,2 @@
+pigz/uncompress:
+  - modules/nf-core/pigz/uncompress/**
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 29aee21e..938690d4 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -16,7 +16,7 @@
                     "mimetype": "text/tsv",
                     "fa_icon": "fas fa-dna",
                     "description": "Path to tab-separated sample sheet",
-                    "help_text": "Path to sample sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml), that points to compressed fastq files.\n\nThe sample sheet must have two to four tab-separated columns/entries with the following headers: \n- `sampleID` (required): Unique sample IDs, must start with a letter, and can only contain letters, numbers or underscores\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)",
+                    "help_text": "Path to sample sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml), that points to compressed fastq files.\n\nThe sample sheet must have two to four tab-separated columns/entries with the following headers: \n- `sampleID` (required): Unique sample IDs, must start with a letter, and can only contain letters, numbers or underscores\n- `forwardReads` (required): Paths to (forward) reads zipped FastQ files\n- `reverseReads` (optional): Paths to reverse reads zipped FastQ files, required if the data is paired-end\n- `run` (optional): If the data was produced by multiple sequencing runs, any string\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- Choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)",
                     "schema": "assets/schema_input.json"
                 },
                 "input_fasta": {
@@ -24,14 +24,14 @@
                     "mimetype": "text/tsv",
                     "fa_icon": "fas fa-dna",
                     "description": "Path to ASV/OTU fasta file",
-                    "help_text": "Path to fasta format file with sequences that will be taxonomically classified. The fasta file input option can be used to taxonomically classify previously produced ASV/OTU sequences.\n\nThe fasta sequence header line may contain a description, that will be kept as part of the sequence name. However, tabs will be changed into spaces.\n\nRelated parameters are:\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)"
+                    "help_text": "Path to fasta format file with sequences that will be taxonomically classified. The fasta file input option can be used to taxonomically classify previously produced ASV/OTU sequences.\n\nThe fasta sequence header line may contain a description, that will be kept as part of the sequence name. However, tabs will be changed into spaces.\n\nRelated parameters are:\n- Choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)"
                 },
                 "input_folder": {
                     "type": "string",
                     "mimetype": "text/tsv",
                     "fa_icon": "fas fa-dna",
                     "description": "Path to folder containing zipped FastQ files",
-                    "help_text": "Path to folder containing compressed fastq files.\n\nExample for input data organization from one sequencing run with two samples, paired-end data:\n\n```bash\ndata\n \u251c\u2500sample1_1_L001_R1_001.fastq.gz\n \u251c\u2500sample1_1_L001_R2_001.fastq.gz\n \u251c\u2500sample2_1_L001_R1_001.fastq.gz\n \u2514\u2500sample2_1_L001_R2_001.fastq.gz\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The folder must contain gzip compressed demultiplexed fastq files. If the file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`), please check `--extension`.\n3. Sample identifiers are extracted from file names, i.e. the string before the first underscore `_`, these must be unique\n4. If your data is scattered, produce a sample sheet\n5. All sequencing data should originate from one sequencing run, because processing relies on run-specific error models that are unreliable when data from several sequencing runs are mixed. Sequencing data originating from multiple sequencing runs requires additionally the parameter `--multiple_sequencing_runs` and a specific folder structure.\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--multiple_sequencing_runs` if the sequencing data originates from multiple sequencing runs\n- `--extension` if the sequencing file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`)\n- `--dada_ref_taxonomy`, `--qiime_ref_taxonomy`, and/or `--sintax_ref_taxonomy` to choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)"
+                    "help_text": "Path to folder containing compressed fastq files.\n\nExample for input data organization from one sequencing run with two samples, paired-end data:\n\n```bash\ndata\n \u251c\u2500sample1_1_L001_R1_001.fastq.gz\n \u251c\u2500sample1_1_L001_R2_001.fastq.gz\n \u251c\u2500sample2_1_L001_R1_001.fastq.gz\n \u2514\u2500sample2_1_L001_R2_001.fastq.gz\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The folder must contain gzip compressed demultiplexed fastq files. If the file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`), please check `--extension`.\n3. Sample identifiers are extracted from file names, i.e. the string before the first underscore `_`, these must be unique\n4. If your data is scattered, produce a sample sheet\n5. All sequencing data should originate from one sequencing run, because processing relies on run-specific error models that are unreliable when data from several sequencing runs are mixed. Sequencing data originating from multiple sequencing runs requires additionally the parameter `--multiple_sequencing_runs` and a specific folder structure.\n\nRelated parameters are:\n- `--pacbio` and `--iontorrent` if the sequencing data is PacBio data or IonTorrent data (default expected: paired-end Illumina data)\n- `--single_end` if the sequencing data is single-ended Illumina data (default expected: paired-end Illumina data)\n- `--multiple_sequencing_runs` if the sequencing data originates from multiple sequencing runs\n- `--extension` if the sequencing file names do not follow the default (`\"/*_R{1,2}_001.fastq.gz\"`)\n- Choose an appropriate reference taxonomy for the type of amplicon (16S/18S/ITS/CO1) (default: DADA2 assignTaxonomy and 16S rRNA sequence database)"
                 },
                 "FW_primer": {
                     "type": "string",
@@ -372,7 +372,7 @@
                 },
                 "dada_ref_tax_custom": {
                     "type": "string",
-                    "help_text": "Is preferred over `--dada_ref_taxonomy`. Either `--skip_dada_addspecies` (no species annotation) or `--dada_ref_tax_custom_sp` (species annotation) is additionally required. Consider also setting `--dada_assign_taxlevels`.\n\nMust be compatible to DADA2's assignTaxonomy function: 'Can be compressed. This reference fasta file should be formatted so that the id lines correspond to the taxonomy (or classification) of the associated sequence, and each taxonomic level is separated by a semicolon.' See also https://rdrr.io/bioc/dada2/man/assignTaxonomy.html",
+                    "help_text": "Overwrites `--dada_ref_taxonomy`. Either `--skip_dada_addspecies` (no species annotation) or `--dada_ref_tax_custom_sp` (species annotation) is additionally required. Consider also setting `--dada_assign_taxlevels`.\n\nMust be compatible to DADA2's assignTaxonomy function: 'Can be compressed. This reference fasta file should be formatted so that the id lines correspond to the taxonomy (or classification) of the associated sequence, and each taxonomic level is separated by a semicolon.' See also https://rdrr.io/bioc/dada2/man/assignTaxonomy.html",
                     "description": "Path to a custom DADA2 reference taxonomy database"
                 },
                 "dada_ref_tax_custom_sp": {
@@ -449,8 +449,8 @@
                 },
                 "qiime_ref_tax_custom": {
                     "type": "string",
-                    "help_text": "Is preferred over `--qiime_ref_taxonomy`. A comma separated pair of (possibly gzipped) filepaths (sequence, taxonomy).",
-                    "description": "Path to files of a custom QIIME2 reference taxonomy database (files may be gzipped)"
+                    "help_text": "Overwrites `--qiime_ref_taxonomy`. Either path to tarball (`*.tar.gz` or `*.tgz`) that contains sequence (`*.fna`) and taxonomy (`*.tax`) data, or alternatively a comma separated pair of filepaths to sequence (`*.fna`) and taxonomy (`*.tax`) data (possibly gzipped `*.gz`).",
+                    "description": "Path to files of a custom QIIME2 reference taxonomy database (tarball, or two comma-separated files)"
                 },
                 "classifier": {
                     "type": "string",
@@ -475,7 +475,7 @@
                 },
                 "kraken2_ref_tax_custom": {
                     "type": "string",
-                    "help_text": "Is preferred over `--kraken2_ref_taxonomy`. Consider also setting `--kraken2_assign_taxlevels`. Can be compressed tar archive (.tar.gz|.tgz) or folder containing the database. See also https://benlangmead.github.io/aws-indexes/k2.",
+                    "help_text": "Overwrites `--kraken2_ref_taxonomy`. Consider also setting `--kraken2_assign_taxlevels`. Can be compressed tar archive (.tar.gz|.tgz) or folder containing the database. See also https://benlangmead.github.io/aws-indexes/k2.",
                     "description": "Path to a custom Kraken2 reference taxonomy database (*.tar.gz|*.tgz archive or folder)"
                 },
                 "kraken2_assign_taxlevels": {
diff --git a/subworkflows/local/qiime2_preptax.nf b/subworkflows/local/qiime2_preptax.nf
index dfa28725..ce7bac78 100644
--- a/subworkflows/local/qiime2_preptax.nf
+++ b/subworkflows/local/qiime2_preptax.nf
@@ -3,7 +3,7 @@
  */
 
 include { UNTAR } from '../../modules/nf-core/untar/main'
-include { GZIP_DECOMPRESS } from '../../modules/local/gzip_decompress.nf'
+include { PIGZ_UNCOMPRESS } from '../../modules/nf-core/pigz/uncompress/main'
 include { FORMAT_TAXONOMY_QIIME } from '../../modules/local/format_taxonomy_qiime'
 include { QIIME2_EXTRACT } from '../../modules/local/qiime2_extract'
 include { QIIME2_TRAIN } from '../../modules/local/qiime2_train'
@@ -29,10 +29,10 @@ workflow QIIME2_PREPTAX {
             }.set { ch_qiime_ref_tax_branched }
         ch_qiime_ref_tax_branched.failed.subscribe { error "$it is neither a compressed (ends with `.gz`) or decompressed sequence (ends with `.fna`) or taxonomy file (ends with `.tax`). Please review input." }
 
-        GZIP_DECOMPRESS(ch_qiime_ref_tax_branched.compressed)
-        ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(GZIP_DECOMPRESS.out.versions)
+        PIGZ_UNCOMPRESS(ch_qiime_ref_tax_branched.compressed)
+        ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(PIGZ_UNCOMPRESS.out.versions)
 
-        ch_qiime_db_files = GZIP_DECOMPRESS.out.ungzip
+        ch_qiime_db_files = PIGZ_UNCOMPRESS.out.file
         ch_qiime_db_files = ch_qiime_db_files.mix(ch_qiime_ref_tax_branched.decompressed)
 
         ch_ref_database_fna = ch_qiime_db_files.filter {