diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bd42a8d0..45389f8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,6 +57,7 @@ jobs: - "test_pacbio_its" - "test_sintax" - "test_pplace" + - "test_multiregion" profile: - "docker" diff --git a/CHANGELOG.md b/CHANGELOG.md index 485d14a7..5aac746f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - [#700](https://github.com/nf-core/ampliseq/pull/700) - Optional `--save_intermediates` to publish QIIME2 data objects (.qza) and visualisation objects (.qzv) +- [#702](https://github.com/nf-core/ampliseq/pull/702) - Add multiple region analysis (including 5R / SMURF / q2-sidle) ### `Changed` diff --git a/CITATIONS.md b/CITATIONS.md index 73e92bc0..54dd5667 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -103,6 +103,24 @@ > Czech, Lucas, Pierre Barbera, and Alexandros Stamatakis. “Genesis and Gappa: Processing, Analyzing and Visualizing Phylogenetic (Placement) Data.” Bioinformatics 36, no. 10 (May 1, 2020): 3263–65. https://doi.org/10.1093/bioinformatics/btaa070. +### Multi-region analysis (also including Greengenes 13_8 or SILVA 128) + +- [q2-sidle](https://doi.org/10.1101/2021.03.23.436606) + + > Debelius, J.W.; Robeson, M.; Hugerth, L.W.; Boulund, F.; Ye, W.; Engstrand, L. "A comparison of approaches to scaffolding multiple regions along the 16S rRNA gene for improved resolution." Preprint at bioRxiv. doi: 10.1101/2021.03.23.436606 + +- [SMURF](https://doi.org/10.1186/s40168-017-0396-x) + + > Fuks, G.; Elgart, M.; Amir, A.; Zeisel, A.; Turnbaugh, P.J.; Soen, Y.; and Shental, N. (2018). "Combining 16S rRNA gene variable regions enables high-resolution microbial community profiling." Microbiome. 6: 17. doi: 10.1186/s40168-017-0396-x + +- [RESCRIPt](https://doi.org/10.1371/journal.pcbi.1009581) + + > Robeson MS 2nd, O'Rourke DR, Kaehler BD, Ziemski M, Dillon MR, Foster JT, Bokulich NA. RESCRIPt: Reproducible sequence taxonomy reference database management. PLoS Comput Biol. 2021 Nov 8;17(11):e1009581. doi: 10.1371/journal.pcbi.1009581. PMID: 34748542; PMCID: PMC8601625. + +- [SEPP](https://doi.org/10.1128/msystems.00021-18) + + > Janssen S, McDonald D, Gonzalez A, Navas-Molina JA, Jiang L, Xu ZZ, Winker K, Kado DM, Orwoll E, Manary M, Mirarab S, Knight R. Phylogenetic Placement of Exact Amplicon Sequences Improves Associations with Clinical Information. mSystems. 2018 Apr 17;3(3):e00021-18. doi: 10.1128/mSystems.00021-18. PMID: 29719869; PMCID: PMC5904434. + ### Downstream analysis - [QIIME2](https://pubmed.ncbi.nlm.nih.gov/31341288/) diff --git a/README.md b/README.md index 8319fe0b..85c3c687 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ## Introduction -**nf-core/ampliseq** is a bioinformatics analysis pipeline used for amplicon sequencing, supporting denoising of any amplicon and supports a variety of taxonomic databases for taxonomic assignment including 16S, ITS, CO1 and 18S. Phylogenetic placement is also possible. Supported is paired-end Illumina or single-end Illumina, PacBio and IonTorrent data. Default is the analysis of 16S rRNA gene amplicons sequenced paired-end with Illumina. +**nf-core/ampliseq** is a bioinformatics analysis pipeline used for amplicon sequencing, supporting denoising of any amplicon and supports a variety of taxonomic databases for taxonomic assignment including 16S, ITS, CO1 and 18S. Phylogenetic placement is also possible.
Multiple region analysis such as 5R is implemented. Supported is paired-end Illumina or single-end Illumina, PacBio and IonTorrent data. Default is the analysis of 16S rRNA gene amplicons sequenced paired-end with Illumina. A video about relevance, usage and output of the pipeline (version 2.1.0; 26th Oct. 2021) can also be found in [YouTube](https://youtu.be/a0VOEeAvETs) and [billibilli](https://www.bilibili.com/video/BV1B44y1e7MM), the slides are deposited at [figshare](https://doi.org/10.6084/m9.figshare.16871008.v1). diff --git a/assets/schema_input.json b/assets/schema_input.json index 8a016da6..89e9740b 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -12,7 +12,7 @@ "pattern": "^[a-zA-Z][a-zA-Z0-9_]+$", "unique": true, "errorMessage": "Unique sample ID must be provided: Must start with a letter, and can only contain letters, numbers or underscores; Regex: '^[a-zA-Z][a-zA-Z0-9_]+$'", - "meta": ["id"] + "meta": ["sample"] }, "forwardReads": { "type": "string", diff --git a/assets/schema_multiregion.json b/assets/schema_multiregion.json new file mode 100644 index 00000000..a0ef525f --- /dev/null +++ b/assets/schema_multiregion.json @@ -0,0 +1,37 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/ampliseq/master/assets/schema_multiregion.json", + "title": "nf-core/ampliseq pipeline - params.multiregion schema", + "description": "Schema for the file provided with params.multiregion", + "type": "array", + "items": { + "type": "object", + "properties": { + "region": { + "type": "string", + "pattern": "^\\S+$", + "unique": true, + "errorMessage": "Region name is mandatory, cannot contain spaces, and must be unique", + "meta": ["region"] + }, + "region_length": { + "type": "integer", + "errorMessage": "Length of region must be an integer", + "meta": ["region_length"] + }, + "FW_primer": { + "type": "string", + "pattern": "^[ATUGCYRSWKMBDHVN]*$", + "errorMessage": "FW_primer must be provided and may contain only uppercase nucleotide IUPAC code [ATUGCYRSWKMBDHVN]", + "meta": ["fw_primer"] + }, + "RV_primer": { + "type": "string", + "pattern": "^[ATUGCYRSWKMBDHVN]*$", + "errorMessage": "RV_primer must be provided and may contain only uppercase nucleotide IUPAC code [ATUGCYRSWKMBDHVN]", + "meta": ["rv_primer"] + } + }, + "required": ["region", "region_length", "FW_primer", "RV_primer"] + } +} diff --git a/bin/taxref_reformat_sidle.sh b/bin/taxref_reformat_sidle.sh new file mode 100755 index 00000000..0253ad7d --- /dev/null +++ b/bin/taxref_reformat_sidle.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +derep="$1" + +# Untar any tar file in the working directory +tar xzf database.tar.gz + +# Greengenes 13_8 +if [ -d "gg_13_8_otus" ]; then + mv gg_13_8_otus/rep_set/${derep}_otus.fasta gg_13_8_otus_rep_set_${derep}_otus.seq.fasta + mv gg_13_8_otus/rep_set_aligned/${derep}_otus.fasta gg_13_8_otus_rep_set_aligned_${derep}_otus.alnseq.fasta + mv gg_13_8_otus/taxonomy/${derep}_otu_taxonomy.txt gg_13_8_otus_taxonomy_${derep}_otu_taxonomy.tax.txt + # remove uncompressed folder + rm -r gg_13_8_otus +elif [ -d "SILVA_128_QIIME_release" ]; then + mv SILVA_128_QIIME_release/rep_set/rep_set_all/${derep}/${derep}_otus.fasta SILVA_128_QIIME_release_rep_set_all_${derep}_otus.seq.fasta + gunzip -c SILVA_128_QIIME_release/rep_set_aligned/${derep}/${derep}_otus_aligned.fasta.gz > SILVA_128_QIIME_release_rep_set_aligned_${derep}_otus_aligned.alnseq.fasta + mv SILVA_128_QIIME_release/taxonomy/taxonomy_all/${derep}/consensus_taxonomy_7_levels.txt 
SILVA_128_QIIME_release_taxonomy_all_${derep}_consensus_taxonomy_7_levels.tax.txt + # remove uncompressed folder + rm -r SILVA_128_QIIME_release +else + echo "No expected directory detected" +fi diff --git a/conf/modules.config b/conf/modules.config index b159d8d0..d93b5bb0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,17 +10,6 @@ ---------------------------------------------------------------------------------------- */ -//prepare reverse complement primers to remove those in PacBio, IonTorrent and Illumina read-through cutadapt steps -// Get the complement of a DNA sequence -// Complement table taken from http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html -def make_complement(String seq) { - def complements = [ A:'T', T:'A', U:'A', G:'C', C:'G', Y:'R', R:'Y', S:'S', W:'W', K:'M', M:'K', B:'V', D:'H', H:'D', V:'B', N:'N' ] - comp = seq.toUpperCase().collect { base -> complements[ base ] ?: 'X' }.join() - return comp -} -FW_primer_RevComp = make_complement ( "${params.FW_primer}".reverse() ) -RV_primer_RevComp = make_complement ( "${params.RV_primer}".reverse() ) - process { publishDir = [ @@ -49,16 +38,16 @@ process { } withName: CUTADAPT_BASIC { - ext.args = [ + ext.args = { [ "--minimum-length 1", "-O ${params.cutadapt_min_overlap}", "-e ${params.cutadapt_max_error_rate}", - params.pacbio ? "--rc -g ${params.FW_primer}...${RV_primer_RevComp}" : - params.iontorrent ? "--rc -g ${params.FW_primer}...${RV_primer_RevComp}" : - params.single_end ? "-g ${params.FW_primer}" : - "-g ${params.FW_primer} -G ${params.RV_primer}", + params.pacbio ? "--rc -g ${meta.fw_primer}...${meta.rv_primer_revcomp}" : + params.iontorrent ? "--rc -g ${meta.fw_primer}...${meta.rv_primer_revcomp}" : + params.single_end ? "-g ${meta.fw_primer}" : + "-g ${meta.fw_primer} -G ${meta.rv_primer}", params.retain_untrimmed ? '' : "--discard-untrimmed" - ].join(' ').trim() + ].join(' ').trim() } ext.prefix = { "${meta.id}.trimmed" } publishDir = [ path: { "${params.outdir}/cutadapt" }, @@ -68,12 +57,12 @@ process { } withName: CUTADAPT_READTHROUGH { - ext.args = [ + ext.args = { [ "--minimum-length 1", "-O ${params.cutadapt_min_overlap}", "-e ${params.cutadapt_max_error_rate}", - "-a ${RV_primer_RevComp} -A ${FW_primer_RevComp}" - ].join(' ').trim() + "-a ${meta.rv_primer_revcomp} -A ${meta.fw_primer_revcomp}" + ].join(' ').trim() } ext.prefix = { "${meta.id}.read-through" } publishDir = [ path: { "${params.outdir}/cutadapt" }, @@ -83,15 +72,15 @@ process { } withName: CUTADAPT_DOUBLEPRIMER { - ext.args = [ + ext.args = { [ "--discard-trimmed --minimum-length 1", "-O ${params.cutadapt_min_overlap}", "-e ${params.cutadapt_max_error_rate}", - params.pacbio ? "--rc -g ${params.FW_primer}...${RV_primer_RevComp}" : - params.iontorrent ? "--rc -g ${params.FW_primer}...${RV_primer_RevComp}" : - params.single_end ? "-g ${params.FW_primer}" : - "-g ${params.FW_primer} -G ${params.RV_primer}" - ].join(' ').trim() + params.pacbio ? "--rc -g ${meta.fw_primer}...${meta.rv_primer_revcomp}" : + params.iontorrent ? "--rc -g ${meta.fw_primer}...${meta.rv_primer_revcomp}" : + params.single_end ? 
"-g ${meta.fw_primer}" : + "-g ${meta.fw_primer} -G ${meta.rv_primer}" + ].join(' ').trim() } ext.prefix = { "${meta.id}.double-primer" } publishDir = [ path: { "${params.outdir}/cutadapt" }, @@ -101,7 +90,7 @@ process { } withName: CUTADAPT_TAXONOMY { - ext.args = "--discard-untrimmed --minimum-length 1 -g ${params.FW_primer}...${RV_primer_RevComp}" + ext.args = { "--discard-untrimmed --minimum-length 1 -g ${meta.fw_primer}...${meta.rv_primer_revcomp}" } publishDir = [ path: { "${params.outdir}/cutadapt" }, mode: params.publish_dir_mode, @@ -188,6 +177,7 @@ process { withName: DADA2_ERR { ext.seed = "${params.seed}" + ext.prefix = { meta.region ? "region-${meta.region}_run-${meta.run}" : "${meta.run}" } ext.args = [ 'nbases = 1e8, nreads = NULL, randomize = TRUE, MAX_CONSIST = 10, OMEGA_C = 0, qualityType = "Auto"', params.pacbio ? "errorEstimationFunction = PacBioErrfun" : "errorEstimationFunction = loessErrfun" @@ -232,6 +222,7 @@ process { } withName: DADA2_DENOISING { + ext.prefix = { meta.region ? "region-${meta.region}_run-${meta.run}" : "${meta.run}" } // standard setting can be inspected with getDadaOpt(option = NULL) ext.args = [ 'selfConsist = FALSE, priors = character(0), DETECT_SINGLETONS = FALSE, GAPLESS = TRUE, GAP_PENALTY = -8, GREEDY = TRUE, KDIST_CUTOFF = 0.42, MATCH = 5, MAX_CLUST = 0, MAX_CONSIST = 10, MIN_ABUNDANCE = 1, MIN_FOLD = 1, MIN_HAMMING = 1, MISMATCH = -4, OMEGA_A = 1e-40, OMEGA_C = 1e-40, OMEGA_P = 1e-4, PSEUDO_ABUNDANCE = Inf, PSEUDO_PREVALENCE = 2, SSE = 2, USE_KMERS = TRUE, USE_QUALS = TRUE, VECTORIZED_ALIGNMENT = TRUE', @@ -259,6 +250,7 @@ process { } withName: DADA2_RMCHIMERA { + ext.prefix = { meta.region ? "region-${meta.region}_run-${meta.run}" : "${meta.run}" } ext.args = 'method="consensus", minSampleFraction = 0.9, ignoreNNegatives = 1, minFoldParentOverAbundance = 2, minParentAbundance = 8, allowOneOff = FALSE, minOneOffParentDistance = 4, maxShift = 16' publishDir = [ path: { "${params.outdir}/dada2/args" }, @@ -267,6 +259,10 @@ process { ] } + withName: DADA2_STATS { + ext.prefix = { meta.region ? "region-${meta.region}_run-${meta.run}" : "${meta.run}" } + } + withName: DADA2_MERGE { publishDir = [ path: { "${params.outdir}/dada2" }, @@ -275,6 +271,151 @@ process { ] } + withName: DADA2_SPLITREGIONS { + publishDir = [ + path: { "${params.outdir}/sidle/per_region" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SIDLE_DBFILT { + ext.args = { params.sidle_ref_taxonomy.startsWith("greengenes") ? '--p-num-degenerates 3' : '--p-num-degenerates 5' } // 3 for greengenes, 5 for SILVA 128 + ext.args2 = { params.sidle_ref_taxonomy.startsWith("greengenes") ? 
'--p-exclude "p__;,k__;,mitochondria,chloroplast" --p-mode contains' : '--p-exclude "mitochondria,chloroplast" --p-mode contains' } // "p__;,k__;" for greengenes + publishDir = [ + path: { "${params.outdir}/sidle/DB/1_prefiltering" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ] + } + + withName: SIDLE_DBEXTRACT { + ext.args = '--p-identity 2' + publishDir = [ + path: { "${params.outdir}/sidle/DB/2_primer_extraction" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ] + } + + withName: SIDLE_TRIM { + publishDir = [ + path: { "${params.outdir}/sidle/ASV/1_trim" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ] + } + + withName: SIDLE_ALIGN { + ext.args = '' + publishDir = [ + path: { "${params.outdir}/sidle/ASV/2_align_db" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ] + } + + withName: SIDLE_DBRECON { + ext.args = '' + publishDir = [ + [ + path: { "${params.outdir}/sidle/DB/3_reconstructed" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ], + [ + path: { "${params.outdir}/sidle/DB/3_reconstructed" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('.qza') || filename.equals('versions.yml') ? null : filename } + ] + ] + } + + withName: SIDLE_TABLERECON { + ext.args = "--p-min-counts 0" + publishDir = [ + [ + path: { "${params.outdir}/sidle/reconstructed/qza" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ], + [ + path: { "${params.outdir}/sidle/reconstructed" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('.qza') || filename.equals('versions.yml') ? null : filename } + ] + ] + } + + withName: SIDLE_TAXRECON { + ext.args = { + params.sidle_ref_taxonomy.startsWith("greengenes") ? '--p-database "greengenes"' : + params.sidle_ref_taxonomy.startsWith("silva") ? '--p-database "silva"' : '--p-database "none"' + } + publishDir = [ + [ + path: { "${params.outdir}/sidle/reconstructed/qza" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ], + [ + path: { "${params.outdir}/sidle/reconstructed" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('.qza') || filename.equals('versions.yml') ? null : filename }, + enabled: params.save_intermediates + ] + ] + } + + withName: SIDLE_FILTTAX { + publishDir = [ + path: { "${params.outdir}/sidle/reconstructed" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SIDLE_SEQRECON { + publishDir = [ + [ + path: { "${params.outdir}/sidle/reconstructed/qza" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ], + [ + path: { "${params.outdir}/sidle/reconstructed" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.endsWith('.qza') || filename.equals('versions.yml') ? 
null : filename }, + enabled: params.save_intermediates + ] + ] + } + + withName: SIDLE_TREERECON { + publishDir = [ + [ + path: { "${params.outdir}/sidle/reconstructed/qza" }, + mode: params.publish_dir_mode, + pattern: "*.qza", + enabled: params.save_intermediates + ], + [ + path: { "${params.outdir}/sidle/reconstructed" }, + mode: params.publish_dir_mode, + pattern: "*.nwk" + ] + ] + } + + withName: BARRNAP { ext.kingdom = "bac,arc,mito,euk" ext.args = "--quiet --reject 0.1" @@ -671,7 +812,7 @@ process { ] } - withName: QIIME2_FILTERTAXA { + withName: 'QIIME2_TABLEFILTERTAXA|QIIME2_SEQFILTERTABLE' { publishDir = [ [ path: { "${params.outdir}/qiime2/abundance_tables" }, diff --git a/conf/ref_databases.config b/conf/ref_databases.config index e89df338..c2d12c4f 100644 --- a/conf/ref_databases.config +++ b/conf/ref_databases.config @@ -488,4 +488,49 @@ params { taxlevels = "D,P,C,O,F,G,S" } } + // Sidle reference databases + sidle_ref_databases { + 'silva' { + title = "SILVA - Version 128" + file = [ "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_128_release.tgz" ] + tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-silva-128.qza" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "taxref_reformat_sidle.sh" + taxlevels = "D,P,C,O,F,G" + } + 'silva=128' { + title = "SILVA - Version 128" + file = [ "https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_128_release.tgz" ] + tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-silva-128.qza" ] + citation = "https://www.arb-silva.de/; Bokulich, N.A., Robeson, M., Dillon, M.R. bokulich-lab/RESCRIPt. Zenodo. http://doi.org/10.5281/zenodo.3891931" + license = "https://www.arb-silva.de/silva-license-information/" + fmtscript = "taxref_reformat_sidle.sh" + taxlevels = "D,P,C,O,F,G" + } + 'greengenes' { + title = "Greengenes - Version 13_8" + file = [ "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" ] + tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-gg-13-8.qza" ] + citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). https://doi.org/10.1038/ismej.2011.139" + fmtscript = "taxref_reformat_sidle.sh" + taxlevels = "D,P,C,O,F,G,S" + } + 'greengenes=13_8' { + title = "Greengenes - Version 13_8" + file = [ "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" ] + tree_qza = [ "https://data.qiime2.org/2021.4/common/sepp-refs-gg-13-8.qza" ] + citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). https://doi.org/10.1038/ismej.2011.139" + fmtscript = "taxref_reformat_sidle.sh" + taxlevels = "D,P,C,O,F,G,S" + } + 'greengenes88' { + title = "Greengenes - Version 13_8 - clustered at 88% similarity - for testing purposes only" + file = [ "ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz" ] + citation = "McDonald, D., Price, M., Goodrich, J. et al. An improved Greengenes taxonomy with explicit ranks for ecological and evolutionary analyses of bacteria and archaea. ISME J 6, 610–618 (2012). 
https://doi.org/10.1038/ismej.2011.139" + fmtscript = "taxref_reformat_sidle.sh" + taxlevels = "D,P,C,O,F,G,S" + derep = "88" + } + } } diff --git a/conf/test_multiregion.config b/conf/test_multiregion.config new file mode 100644 index 00000000..71518374 --- /dev/null +++ b/conf/test_multiregion.config @@ -0,0 +1,34 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/ampliseq -profile test_multiregion,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- */ + +params { + config_profile_name = 'Test profile for multi-region analysis' + config_profile_description = 'Test dataset to check multi-region pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/samplesheet_multiregion.tsv" + metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/metadata_multiregion.tsv" + multiregion = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/regions_multiregion.tsv" + sidle_ref_taxonomy = "greengenes88" + + // Prevent default taxonomic classification + skip_dada_taxonomy = true + + // Reduce runtimes + skip_alpha_rarefaction = true + tax_agglom_max = 3 +} diff --git a/docs/output.md b/docs/output.md index c2b93537..5c0cd495 100644 --- a/docs/output.md +++ b/docs/output.md @@ -36,7 +36,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Kraken2](#kraken2) - Taxonomic classification with Kraken2 - [QIIME2](#qiime2) - Taxonomic classification with QIIME2 - [Phylogenetic placement and taxonomic classification](#phylogenetic-placement-and-taxonomic-classification) - Placing ASVs into a phylogenetic tree -- [QIIME2](#qiime2) - Secondary analysis +- [Multiple region analysis with Sidle](#multiple-region-analysis-with-sidle) - Scaffolding multiple regions along a reference +- [Secondary analysis with QIIME2](#secondary-analysis-with-qiime2) - Visualisations, diversity and differential abundance analysis with QIIME2 - [Abundance tables](#abundance-tables) - Exported abundance tables - [Relative abundance tables](#relative-abundance-tables) - Exported relative abundance tables - [Barplot](#barplot) - Interactive barplot @@ -374,7 +375,35 @@ Phylogenetic placement grafts sequences onto a phylogenetic reference tree and o -### QIIME2 +### Multiple region analysis with Sidle + +Scaffolding multiple short amplicons along a reference can improve taxonomic resolution over a single region. This method applies [Sidle (SMURF Implementation Done to acceLerate Efficiency)](https://doi.org/10.1101/2021.03.23.436606) within [QIIME2](https://pubmed.ncbi.nlm.nih.gov/31341288/). + +Sidle reconstructs taxonomy profiles and abundances of several regions using a taxonomic database; the previous sections about taxonomic classification therefore do not apply. + +Additional output includes an abundance table and taxonomic information reconstructed from multiple regions, as well as a phylogenetic tree that can be further analysed.
Apart from region-specific sequences, no useful de-novo sequences are generated. + +
+ +<details markdown="1"> +<summary>Output files</summary> + +- `sidle/per_region/` + - `ASV_seqs_region<region>_<FW_primer>_<RV_primer>.fasta`: ASV sequences per region + - `ASV_table_region<region>_<FW_primer>_<RV_primer>.tsv`: ASV abundances per region + - `DADA2_table_region<region>_<FW_primer>_<RV_primer>.tsv`: ASV abundances and sequences per region +- `sidle/DB/3_reconstructed/reconstruction_summary/index.html`: Information about the reconstructed reference taxonomy database +- `sidle/reconstructed/` + - `reconstructed_feature-table.biom`: Unified abundance table in biom format + - `reconstructed_feature-table.tsv`: Tab-separated unified abundance table + - `reconstructed_taxonomy.tsv`: Tab-separated unified taxonomy table + - `reconstructed_merged.tsv`: Tab-separated unified table with merged abundance and taxonomy information + - `reconstructed_tree.nwk`: Phylogenetic tree + - `reconstruction_table/index.html`: Information about the unified abundance table + +More intermediate output is published to the results folder when using `--save_intermediates`. + +</details>
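+
+For further downstream work, the two tab-separated tables can be loaded directly; the following is a minimal R sketch that mirrors how the pipeline's `SIDLE_FILTTAX` step reads and merges these files (`skip = 1` accounts for the comment line of the biom export):
+
+```r
+# unified abundance table: the biom export starts with a comment line,
+# and the header line itself begins with '#', so comment parsing is disabled
+tab <- read.table("sidle/reconstructed/reconstructed_feature-table.tsv",
+    header = TRUE, sep = "\t", skip = 1, comment.char = "")
+colnames(tab)[1] <- "ID"
+
+# unified taxonomy table
+tax <- read.table("sidle/reconstructed/reconstructed_taxonomy.tsv",
+    header = TRUE, sep = "\t")
+colnames(tax)[1] <- "ID"
+
+# keep all features of the abundance table, as in reconstructed_merged.tsv
+merged <- merge(tax, tab, by = "ID", all.x = FALSE, all.y = TRUE)
+head(merged)
+```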
+ +### Secondary analysis with QIIME2 **Quantitative Insights Into Microbial Ecology 2** ([QIIME2](https://qiime2.org/)) is a next-generation microbiome bioinformatics platform and the successor of the widely used [QIIME1](https://www.nature.com/articles/nmeth.f.303). diff --git a/docs/usage.md b/docs/usage.md index acf62a37..6b0cade9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,6 +14,7 @@ - [ASV/OTU fasta input](#asvotu-fasta-input) - [Direct FASTQ input](#direct-fastq-input) - [Taxonomic classification](#taxonomic-classification) + - [Multiple region analysis with Sidle](#multiple-region-analysis-with-sidle) - [Metadata](#metadata) - [Updating the pipeline](#updating-the-pipeline) - [Reproducibility](#reproducibility) @@ -246,6 +247,37 @@ Special features of taxonomic classification tools: Parameter guidance is given in [nf-core/ampliseq website parameter documentation](https://nf-co.re/ampliseq/parameters/#taxonomic-database). Citations are listed in [`CITATIONS.md`](CITATIONS.md). +### Multiple region analysis with Sidle + +Scaffolding multiple short amplicons along a reference can improve taxonomic resolution over a single region. This method applies [Sidle (SMURF Implementation Done to acceLerate Efficiency)](https://github.com/jwdebelius/q2-sidle) within [QIIME2](https://qiime2.org/), using the [Silva](https://www.arb-silva.de/) (see [licence](https://www.arb-silva.de/silva-license-information/)) or [Greengenes](http://greengenes.microbio.me/greengenes_release/) database. + +A typical use case is a study in which multiple variable regions of the 16S rRNA gene were sequenced with different primer sets and need to be unified. This yields one unified abundance and taxonomy profile across all variable regions. However, ASV sequences are only available per region; reconstructing complete de-novo sequences is not feasible. + +The analysis requires information about the sequencing data via [`--input`](#samplesheet-input), region primer and length information via [`--multiregion`](https://nf-co.re/ampliseq/parameters#multiregion), and a taxonomic database via [`--sidle_ref_taxonomy`](https://nf-co.re/ampliseq/parameters#sidle_ref_taxonomy) or [`--sidle_ref_tax_custom`](https://nf-co.re/ampliseq/parameters#sidle_ref_tax_custom), for example:
+ +```bash +--input "samplesheet_multiregion.tsv" --multiregion "regions_multiregion.tsv" --sidle_ref_taxonomy "silva=128" +``` + +The region information file can be tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml) and must contain four columns/entries with the following headers: + +| Column | Description | | ------------- | ------------------------------------------------------------------------- | | region | Unique region identifier | | region_length | Minimum region length; sequences are trimmed to this length and shorter sequences are omitted | | FW_primer | Forward primer sequence | | RV_primer | Reverse primer sequence | + +For example, the tab-separated `regions_multiregion.tsv` may contain: + +| region | FW_primer | RV_primer | region_length | | ------- | --------------------- | -------------------- | ------------- | | region1 | TGGCGAACGGGTGAGTAA | CCGTGTCTCAGTCCCARTG | 145 | | region2 | ACTCCTACGGGAGGCAGC | GTATTACCGCGGCTGCTG | 135 | | region3 | GTGTAGCGGTGRAATGCG | CCCGTCAATTCMTTTGAGTT | 200 | | region4 | GGAGCATGTGGWTTAATTCGA | CGTTGCGGGACTTAACCC | 115 | | region5 | GGAGGAAGGTGGGGATGAC | AAGGCCCGGGAACGTATT | 150 | + ### Metadata Metadata is optional, but for performing downstream analysis such as barplots, diversity indices or differential abundance testing, a metadata file is essential. diff --git a/lib/WorkflowAmpliseq.groovy b/lib/WorkflowAmpliseq.groovy index 25db3ed6..4676fd4e 100755 --- a/lib/WorkflowAmpliseq.groovy +++ b/lib/WorkflowAmpliseq.groovy @@ -15,7 +15,7 @@ class WorkflowAmpliseq { Nextflow.error("Missing input declaration: One of `--input`, `--input_fasta`, `--input_folder` is required.") } - if ( !params.input_fasta && (!params.FW_primer || !params.RV_primer) && !params.skip_cutadapt ) { + if ( !params.multiregion && !params.input_fasta && (!params.FW_primer || !params.RV_primer) && !params.skip_cutadapt ) { Nextflow.error("Incompatible parameters: `--FW_primer` and `--RV_primer` are required for primer trimming.
If primer trimming is not needed, use `--skip_cutadapt`.") } @@ -131,6 +131,31 @@ class WorkflowAmpliseq { if ( params.orf_end && ( ( ( params.orf_end + 1 ) - params.orf_start ) % 3 != 0 ) ) { Nextflow.error("Incompatible parameters: The difference of `--orf_end` and `--orf_start` must be a multiple of 3.") } + + // When multi-region analysis is used, some parameter combinations are required or not allowed: + if ( params.multiregion ) { + if ( !params.sidle_ref_taxonomy && !params.sidle_ref_tree_custom ) { + log.warn "Missing parameter: Either use `--sidle_ref_taxonomy` or `--sidle_ref_tree_custom` to get (unified) taxonomic classifications" + } + if ( (params.dada_ref_tax_custom || params.dada_ref_taxonomy) && !params.skip_dada_taxonomy ) { + Nextflow.error("Incompatible parameters: Multiple region analysis with `--multiregion` does not work with `--dada_ref_tax_custom`, `--dada_ref_taxonomy`") + } + if ( params.cut_its != "none" ) { + Nextflow.error("Incompatible parameters: Multiple region analysis with `--multiregion` does not work with `--cut_its`") + } + if ( params.vsearch_cluster || params.filter_ssu || params.min_len_asv || params.max_len_asv || params.filter_codons ) { + log.warn "Incompatible parameters: Multiple region analysis with `--multiregion` ignores any of `--vsearch_cluster`, `--filter_ssu`, `--min_len_asv`, `--max_len_asv`, `--filter_codons`, `--cut_its`" + } + } + } + + // + // Prepare complement sequence - ultimately to make reverse complement primers + // Complement table taken from http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html + public static String makeComplement(seq) { + def complements = [ A:'T', T:'A', U:'A', G:'C', C:'G', Y:'R', R:'Y', S:'S', W:'W', K:'M', M:'K', B:'V', D:'H', H:'D', V:'B', N:'N' ] + def comp = seq.toUpperCase().collect { base -> complements[ base ] ?: 'X' }.join() + return comp } // diff --git a/modules/local/dada2_denoising.nf b/modules/local/dada2_denoising.nf index ea07eadf..3a0cb1a2 100644 --- a/modules/local/dada2_denoising.nf +++ b/modules/local/dada2_denoising.nf @@ -23,6 +23,7 @@ process DADA2_DENOISING { task.ext.when == null || task.ext.when script: + def prefix = task.ext.prefix ?: "prefix" def args = task.ext.args ?: '' def args2 = task.ext.args2 ?: '' if (!meta.single_end) { @@ -37,18 +38,18 @@ process DADA2_DENOISING { filtRs <- sort(list.files("./filtered/", pattern = "_2.filt.fastq.gz", full.names = TRUE)) #denoising - sink(file = "${meta.run}.dada.log") + sink(file = "${prefix}.dada.log") dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus) - saveRDS(dadaFs, "${meta.run}_1.dada.rds") + saveRDS(dadaFs, "${prefix}_1.dada.rds") dadaRs <- dada(filtRs, err = errR, $args, multithread = $task.cpus) - saveRDS(dadaRs, "${meta.run}_2.dada.rds") + saveRDS(dadaRs, "${prefix}_2.dada.rds") sink(file = NULL) #make table mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, $args2, verbose=TRUE) - saveRDS(mergers, "${meta.run}.mergers.rds") + saveRDS(mergers, "${prefix}.mergers.rds") seqtab <- makeSequenceTable(mergers) - saveRDS(seqtab, "${meta.run}.seqtab.rds") + saveRDS(seqtab, "${prefix}.seqtab.rds") write.table('dada\t$args', file = "dada.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE, na = '') write.table('mergePairs\t$args2', file = "mergePairs.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE, na = '') @@ -64,17 +65,17 @@ process DADA2_DENOISING { filtFs <- sort(list.files("./filtered/", pattern = ".fastq.gz", full.names = TRUE)) #denoising - sink(file = 
"${meta.run}.dada.log") + sink(file = "${prefix}.dada.log") dadaFs <- dada(filtFs, err = errF, $args, multithread = $task.cpus) - saveRDS(dadaFs, "${meta.run}.dada.rds") + saveRDS(dadaFs, "${prefix}.dada.rds") sink(file = NULL) #make table seqtab <- makeSequenceTable(dadaFs) - saveRDS(seqtab, "${meta.run}.seqtab.rds") + saveRDS(seqtab, "${prefix}.seqtab.rds") #dummy file to fulfill output rules - saveRDS("dummy", "dummy_${meta.run}.mergers.rds") + saveRDS("dummy", "dummy_${prefix}.mergers.rds") write.table('dada\t$args', file = "dada.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE, na = '') writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" dada2: ", packageVersion("dada2")) ), "versions.yml") diff --git a/modules/local/dada2_err.nf b/modules/local/dada2_err.nf index cecca72f..4c0aa62e 100644 --- a/modules/local/dada2_err.nf +++ b/modules/local/dada2_err.nf @@ -23,6 +23,7 @@ process DADA2_ERR { task.ext.when == null || task.ext.when script: + def prefix = task.ext.prefix ?: "prefix" def args = task.ext.args ?: '' def seed = task.ext.seed ?: '100' if (!meta.single_end) { @@ -34,32 +35,32 @@ process DADA2_ERR { fnFs <- sort(list.files(".", pattern = "_1.filt.fastq.gz", full.names = TRUE)) fnRs <- sort(list.files(".", pattern = "_2.filt.fastq.gz", full.names = TRUE)) - sink(file = "${meta.run}.err.log") + sink(file = "${prefix}.err.log") errF <- learnErrors(fnFs, $args, multithread = $task.cpus, verbose = TRUE) - saveRDS(errF, "${meta.run}_1.err.rds") + saveRDS(errF, "${prefix}_1.err.rds") errR <- learnErrors(fnRs, $args, multithread = $task.cpus, verbose = TRUE) - saveRDS(errR, "${meta.run}_2.err.rds") + saveRDS(errR, "${prefix}_2.err.rds") sink(file = NULL) - pdf("${meta.run}_1.err.pdf") + pdf("${prefix}_1.err.pdf") plotErrors(errF, nominalQ = TRUE) dev.off() - svg("${meta.run}_1.err.svg") + svg("${prefix}_1.err.svg") plotErrors(errF, nominalQ = TRUE) dev.off() - pdf("${meta.run}_2.err.pdf") + pdf("${prefix}_2.err.pdf") plotErrors(errR, nominalQ = TRUE) dev.off() - svg("${meta.run}_2.err.svg") + svg("${prefix}_2.err.svg") plotErrors(errR, nominalQ = TRUE) dev.off() - sink(file = "${meta.run}_1.err.convergence.txt") + sink(file = "${prefix}_1.err.convergence.txt") dada2:::checkConvergence(errF) sink(file = NULL) - sink(file = "${meta.run}_2.err.convergence.txt") + sink(file = "${prefix}_2.err.convergence.txt") dada2:::checkConvergence(errR) sink(file = NULL) @@ -74,19 +75,19 @@ process DADA2_ERR { fnFs <- sort(list.files(".", pattern = ".filt.fastq.gz", full.names = TRUE)) - sink(file = "${meta.run}.err.log") + sink(file = "${prefix}.err.log") errF <- learnErrors(fnFs, $args, multithread = $task.cpus, verbose = TRUE) - saveRDS(errF, "${meta.run}.err.rds") + saveRDS(errF, "${prefix}.err.rds") sink(file = NULL) - pdf("${meta.run}.err.pdf") + pdf("${prefix}.err.pdf") plotErrors(errF, nominalQ = TRUE) dev.off() - svg("${meta.run}.err.svg") + svg("${prefix}.err.svg") plotErrors(errF, nominalQ = TRUE) dev.off() - sink(file = "${meta.run}.err.convergence.txt") + sink(file = "${prefix}.err.convergence.txt") dada2:::checkConvergence(errF) sink(file = NULL) diff --git a/modules/local/dada2_rmchimera.nf b/modules/local/dada2_rmchimera.nf index 0f25444f..4e14ce1c 100644 --- a/modules/local/dada2_rmchimera.nf +++ b/modules/local/dada2_rmchimera.nf @@ -19,6 +19,7 @@ process DADA2_RMCHIMERA { task.ext.when == null || task.ext.when script: + def prefix = task.ext.prefix ?: "prefix" def args = task.ext.args ?: '' def 
no_samples = meta.id.size() def first_sample = meta.id.first() @@ -31,7 +32,7 @@ #remove chimera seqtab.nochim <- removeBimeraDenovo(seqtab, $args, multithread=$task.cpus, verbose=TRUE) if ( ${no_samples} == 1 ) { rownames(seqtab.nochim) <- "${first_sample}" } - saveRDS(seqtab.nochim,"${meta.run}.ASVtable.rds") + saveRDS(seqtab.nochim,"${prefix}.ASVtable.rds") write.table('removeBimeraDenovo\t$args', file = "removeBimeraDenovo.args.txt", row.names = FALSE, col.names = FALSE, quote = FALSE, na = '') writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" dada2: ", packageVersion("dada2")) ), "versions.yml") diff --git a/modules/local/dada2_splitregions.nf b/modules/local/dada2_splitregions.nf new file mode 100644 index 00000000..55ff3a06 --- /dev/null +++ b/modules/local/dada2_splitregions.nf @@ -0,0 +1,63 @@ +process DADA2_SPLITREGIONS { + label 'process_low' + + conda "conda-forge::r-base=4.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-base:4.2.1' : + 'biocontainers/r-base:4.2.1' }" + + input: + tuple val(meta), val(mapping) + path(table) + + output: + tuple val(meta), path( "DADA2_table_*.tsv" ) , emit: dada2asv + tuple val(meta), path( "ASV_table_*.tsv" ), path( "ASV_seqs_*.fasta" ), emit: for_sidle + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Convert Groovy map to an R list; requirement: values may not be false, true, or null + def mapping_r_list = mapping + .toString() + .replaceAll("':","=") + .replaceAll(", '",",") + .replaceAll("\\['","list(") + .replaceAll("\\[","list(") + .replaceAll("\\]",")") + def suffix = "region" + meta.region + "_" + meta.fw_primer + "_" + meta.rv_primer + """ + #!/usr/bin/env Rscript + + nested_list <- $mapping_r_list + mapping <- as.data.frame(do.call(rbind, nested_list)) + + df <- read.csv("$table", header=TRUE, sep="\\t") + + # extract samples of this region + keep <- intersect( colnames(df), c("ASV_ID", unlist(mapping\$id), "sequence" ) ) + df <- subset( df, select = keep ) + + # modify sample names from .id to .sample + for (row in 1:nrow(mapping)) { + colnames(df) <- gsub( mapping[row,]\$id, mapping[row,]\$sample, colnames(df) ) + } + + # filter rows with only 0, occurring because many samples were removed + df <- df[as.logical(rowSums(df[,2:(ncol(df)-1)] != 0)), ] + + # Write file with ASV abundances and sequences to file + write.table(df, file = "DADA2_table_${suffix}.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '') + + # Write fasta file with ASV sequences to file + write.table(data.frame(s = sprintf(">%s\n%s", df\$ASV_ID, df\$sequence)), 'ASV_seqs_${suffix}.fasta', col.names = FALSE, row.names = FALSE, quote = FALSE, na = '') + + # Write table with ASV abundances to file + df\$sequence <- NULL + write.table(df, file = "ASV_table_${suffix}.tsv", sep="\\t", row.names = FALSE, quote = FALSE, na = '') + + writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = "."))), "versions.yml") + """ +} diff --git a/modules/local/dada2_stats.nf b/modules/local/dada2_stats.nf index 2fe792e5..f4112046 100644 --- a/modules/local/dada2_stats.nf +++ b/modules/local/dada2_stats.nf @@ -18,6 +18,7 @@ script: + def prefix = task.ext.prefix ?: "prefix" if (!meta.single_end) { """ #!/usr/bin/env
Rscript @@ -34,7 +35,7 @@ process DADA2_STATS { } rownames(filter_and_trim) <- filter_and_trim\$ID filter_and_trim["ID"] <- NULL - #write.table( filter_and_trim, file = "${meta.run}.filter_and_trim.tsv", sep = "\\t", row.names = TRUE, quote = FALSE, na = '') + #write.table( filter_and_trim, file = "${prefix}.filter_and_trim.tsv", sep = "\\t", row.names = TRUE, quote = FALSE, na = '') #read data dadaFs = readRDS("${denoised[0]}") @@ -52,7 +53,7 @@ process DADA2_STATS { colnames(track) <- c("DADA2_input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") rownames(track) <- sub(pattern = "_1.fastq.gz\$", replacement = "", rownames(track)) #this is when cutadapt is skipped! track <- cbind(sample = sub(pattern = "(.*?)\\\\..*\$", replacement = "\\\\1", rownames(track)), track) - write.table( track, file = "${meta.run}.stats.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '') + write.table( track, file = "${prefix}.stats.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '') writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" dada2: ", packageVersion("dada2")) ), "versions.yml") """ @@ -72,7 +73,7 @@ process DADA2_STATS { } rownames(filter_and_trim) <- filter_and_trim\$ID filter_and_trim["ID"] <- NULL - #write.table( filter_and_trim, file = "${meta.run}.filter_and_trim.tsv", sep = "\\t", row.names = TRUE, quote = FALSE, na = '') + #write.table( filter_and_trim, file = "${prefix}.filter_and_trim.tsv", sep = "\\t", row.names = TRUE, quote = FALSE, na = '') #read data dadaFs = readRDS("${denoised[0]}") @@ -87,7 +88,7 @@ process DADA2_STATS { } colnames(track) <- c("DADA2_input", "filtered", "denoised", "nonchim") track <- cbind(sample = sub(pattern = "(.*?)\\\\..*\$", replacement = "\\\\1", rownames(track)), track) - write.table( track, file = "${meta.run}.stats.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '') + write.table( track, file = "${prefix}.stats.tsv", sep = "\\t", row.names = FALSE, quote = FALSE, na = '') writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),paste0(" dada2: ", packageVersion("dada2")) ), "versions.yml") """ diff --git a/modules/local/format_taxonomy_sidle.nf b/modules/local/format_taxonomy_sidle.nf new file mode 100644 index 00000000..72e55cd7 --- /dev/null +++ b/modules/local/format_taxonomy_sidle.nf @@ -0,0 +1,39 @@ +process FORMAT_TAXONOMY_SIDLE { + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://containers.biocontainers.pro/s3/SingImgsRepo/biocontainers/v1.2.0_cv1/biocontainers_v1.2.0_cv1.img' : + 'docker.io/biocontainers/biocontainers:v1.2.0_cv1' }" + + input: + path('database.tar.gz') + val(suffix) + + output: + path( "*.seq.fasta" ) , emit: seq + path( "*.alnseq.fasta") , emit: alnseq + path( "*.tax.txt") , emit: tax + path( "ref_taxonomy.*.txt") , emit: ref_tax_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def derep = params.sidle_ref_databases[params.sidle_ref_taxonomy]["derep"] ?: "99" + """ + ${params.sidle_ref_databases[params.sidle_ref_taxonomy]["fmtscript"]} ${derep} + + #Giving out information + echo -e "--sidle_ref_taxonomy: ${params.sidle_ref_taxonomy}\n" >ref_taxonomy.${suffix}.txt + echo -e "Title: ${params.sidle_ref_databases[params.sidle_ref_taxonomy]["title"]}\n" >>ref_taxonomy.${suffix}.txt + echo -e "Citation: ${params.sidle_ref_databases[params.sidle_ref_taxonomy]["citation"]}\n" >>ref_taxonomy.${suffix}.txt + echo "All entries: ${params.sidle_ref_databases[params.sidle_ref_taxonomy]}" >>ref_taxonomy.${suffix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(bash --version | sed -n 1p | sed 's/GNU bash, version //g') + END_VERSIONS + """ +} diff --git a/modules/local/qiime2_seqfiltertable.nf b/modules/local/qiime2_seqfiltertable.nf new file mode 100644 index 00000000..d0310fb6 --- /dev/null +++ b/modules/local/qiime2_seqfiltertable.nf @@ -0,0 +1,38 @@ +process QIIME2_SEQFILTERTABLE { + tag "${repseq} filter by ${table}" + label 'process_low' + + container "qiime2/core:2023.7" + + input: + path(table) + path(repseq) + + output: + path("filtered-sequences.qza"), emit: qza + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + qiime feature-table filter-seqs \\ + --i-data $repseq \\ + --i-table $table \\ + --o-filtered-data filtered-sequences.qza + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + END_VERSIONS + """ +} diff --git a/modules/local/qiime2_filtertaxa.nf b/modules/local/qiime2_tablefiltertaxa.nf similarity index 72% rename from modules/local/qiime2_filtertaxa.nf rename to modules/local/qiime2_tablefiltertaxa.nf index 404552dd..edfbdb41 100644 --- a/modules/local/qiime2_filtertaxa.nf +++ b/modules/local/qiime2_tablefiltertaxa.nf @@ -1,4 +1,4 @@ -process QIIME2_FILTERTAXA { +process QIIME2_TABLEFILTERTAXA { tag "taxa:${exclude_taxa};min-freq:${min_frequency};min-samples:${min_samples}" label 'process_low' @@ -6,16 +6,14 @@ process QIIME2_FILTERTAXA { input: path(table) - path(repseq) path(taxonomy) val(min_frequency) val(min_samples) val(exclude_taxa) output: - path("filtered-table.qza"), emit: asv + path("filtered-table.qza"), emit: qza path("filtered-table.tsv"), emit: tsv - path("filtered-sequences.qza"), emit: seq path "versions.yml" , emit: versions when: @@ -32,36 +30,23 @@ process QIIME2_FILTERTAXA { export NUMBA_CACHE_DIR="./numbacache" if ! 
[ \"${exclude_taxa}\" = \"none\" ]; then - #filter sequences - qiime taxa filter-seqs \\ - --i-sequences ${repseq} \\ - --i-taxonomy ${taxonomy} \\ - --p-exclude ${exclude_taxa} --p-mode contains \\ - --o-filtered-sequences tax_filtered-sequences.qza - #filter abundance table qiime taxa filter-table \\ --i-table ${table} \\ - --i-taxonomy ${taxonomy} \ - --p-exclude ${exclude_taxa} \\ + --i-taxonomy ${taxonomy} \\ + --p-exclude "${exclude_taxa}" \\ --p-mode contains \\ --o-filtered-table tax_filtered-table.qza filtered_table="tax_filtered-table.qza" - filtered_sequences="tax_filtered-sequences.qza" else filtered_table=${table} - filtered_sequences=${repseq} fi + qiime feature-table filter-features \\ --i-table \$filtered_table \\ --p-min-frequency ${min_frequency} \\ --p-min-samples ${min_samples} \\ --o-filtered-table filtered-table.qza - qiime feature-table filter-seqs \\ - --i-data \$filtered_sequences \\ - --i-table filtered-table.qza \\ - --o-filtered-data filtered-sequences.qza - #produce raw count table in biom format "table/feature-table.biom" qiime tools export \\ --input-path filtered-table.qza \\ diff --git a/modules/local/sidle_align.nf b/modules/local/sidle_align.nf new file mode 100644 index 00000000..15a3467b --- /dev/null +++ b/modules/local/sidle_align.nf @@ -0,0 +1,47 @@ +process SIDLE_ALIGN { + tag "$meta.region" + label 'process_medium' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + tuple val(meta), path(kmers), path(seq) + + output: + tuple val(meta), path("*rep-seqs_align-map.qza"), emit: aligned_map + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.region}" + def primerfw = "${meta.fw_primer}" + def primerrv = "${meta.rv_primer}" + """ + # https://q2-sidle.readthedocs.io/en/latest/reconstruction.html#regional-alignment + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + qiime sidle align-regional-kmers \\ + --p-n-workers $task.cpus \\ + --i-kmers ${kmers} \\ + --i-rep-seq ${seq} \\ + --p-region ${meta.region} \\ + $args \\ + --o-regional-alignment ${prefix}_rep-seqs_align-map.qza + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' ) + q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_dbextract.nf b/modules/local/sidle_dbextract.nf new file mode 100644 index 00000000..91f3635e --- /dev/null +++ b/modules/local/sidle_dbextract.nf @@ -0,0 +1,62 @@ + +process SIDLE_DBEXTRACT { + tag "$meta.region,$meta.region_length" + label 'process_medium' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + tuple val(meta), path(table), path(seq), path(db_seq), path(db_tax) + + output: + tuple val(meta), path("db_*_kmers.qza"), emit: kmers + tuple val(meta), path("db_*_map.qza") , emit: map + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." 
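+            // note: q2-sidle is provided only through the pipesidle container above,
+            // so conda/mamba profiles cannot run the multi-region modules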
+ } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.region}" + def primerfw = "${meta.fw_primer}" + def primerrv = "${meta.rv_primer}" + def length = "${meta.region_length}" + """ + # https://q2-sidle.readthedocs.io/en/latest/database_preparation.html#prepare-a-regional-database-for-each-primer-set + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + #extract sequences + qiime feature-classifier extract-reads \\ + --p-n-jobs $task.cpus \\ + --i-sequences $db_seq \\ + $args \\ + --p-f-primer $primerfw \\ + --p-r-primer $primerrv \\ + --o-reads db_${prefix}.qza + + #prepare to be used in alignment + qiime sidle prepare-extracted-region \\ + --p-n-workers $task.cpus \\ + --i-sequences db_${prefix}.qza \\ + --p-region "${prefix}" \\ + --p-fwd-primer $primerfw \\ + --p-rev-primer $primerrv \\ + --p-trim-length $length \\ + --o-collapsed-kmers db_${prefix}_${length}_kmers.qza \\ + --o-kmer-map db_${prefix}_${length}_map.qza + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' ) + q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_dbfilt.nf b/modules/local/sidle_dbfilt.nf new file mode 100644 index 00000000..60b4b30f --- /dev/null +++ b/modules/local/sidle_dbfilt.nf @@ -0,0 +1,54 @@ +process SIDLE_DBFILT { + label 'process_low' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + path(seq) + path(tax) + + output: + path("db_filtered_sequences.qza") , emit: seq + path("db_filtered_sequences_tax.qza") , emit: tax + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + """ + # https://q2-sidle.readthedocs.io/en/latest/database_preparation.html#filtering-the-database + #pre-filtering should be very permissive! + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + # authors of SMURF recommend "--p-num-degenerates 3" for greengenes 13_8 database at 99% + # the RESCRIPt formatted Silva 128 database is filtered to exclude sequences with more than 5 degenerates [3], [4] + qiime rescript cull-seqs \\ + --p-n-jobs $task.cpus \\ + --i-sequences $seq \\ + $args \\ + --o-clean-sequences db_filtered_sequences.qza + + #filtering a greengenes database for features missing a phylum (p__;) or kingdom(k__;) designation. 
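+    # For reference, conf/modules.config (SIDLE_DBFILT) fills the placeholders as follows:
+    #   Greengenes: args  = '--p-num-degenerates 3'
+    #               args2 = '--p-exclude "p__;,k__;,mitochondria,chloroplast" --p-mode contains'
+    #   SILVA 128:  args  = '--p-num-degenerates 5'
+    #               args2 = '--p-exclude "mitochondria,chloroplast" --p-mode contains'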
+ #CPU=1 + qiime taxa filter-seqs \\ + --i-sequences db_filtered_sequences.qza \\ + --i-taxonomy $tax \\ + $args2 \\ + --o-filtered-sequences db_filtered_sequences_tax.qza + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 rescript: \$( qiime rescript --version | sed 's/ (.*//' | sed 's/.*version //' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_dbrecon.nf b/modules/local/sidle_dbrecon.nf new file mode 100644 index 00000000..18a819b3 --- /dev/null +++ b/modules/local/sidle_dbrecon.nf @@ -0,0 +1,60 @@ +process SIDLE_DBRECON { + label 'process_medium' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + val(metaid) + path(map) + path(aligned_map) + + output: + path("reconstruction_map.qza") , emit: reconstruction_map + path("reconstruction_summary.qza"), emit: reconstruction_summary + path("reconstruction_summary/*") , emit: visualisation + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def db_input = "" + // sort the input so that the regions are sorted by sequence + def df = [metaid, map, aligned_map].transpose().sort{ it[0] } + for (i in df) { + db_input += " --p-region "+i[0]+" --i-kmer-map "+i[1]+" --i-regional-alignment "+i[2] + } + """ + #https://q2-sidle.readthedocs.io/en/latest/reconstruction.html#database-reconstruction + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + qiime sidle reconstruct-database \\ + --p-n-workers $task.cpus \\ + $db_input \\ + $args \\ + --o-database-map reconstruction_map.qza \\ + --o-database-summary reconstruction_summary.qza + + #database summary can be used to evaluate the quality of the reconstruction; see Fuks, C; Elgart, M; Amir, A; et al (2018) “Combining 16S rRNA gene variable regions enables high-resolution microbial community profiling.” Microbiome. 6:17. 
doi: 10.1186/s40168-017-0396-x + qiime metadata tabulate \\ + --m-input-file reconstruction_summary.qza \\ + --o-visualization reconstruction_summary.qzv + qiime tools export \\ + --input-path reconstruction_summary.qzv \\ + --output-path "reconstruction_summary" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' ) + q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_filttax.nf b/modules/local/sidle_filttax.nf new file mode 100644 index 00000000..52804b8a --- /dev/null +++ b/modules/local/sidle_filttax.nf @@ -0,0 +1,41 @@ + +process SIDLE_FILTTAX { + label 'process_single' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + path(table_tofilter) + path(table_ref) + + output: + path("reconstructed_taxonomy.tsv"), emit: filtered + path("reconstructed_merged.tsv") , emit: merged + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + #!/usr/bin/env Rscript + + df_tofilter <- read.table("$table_tofilter", header = TRUE, sep = "\t", stringsAsFactors = FALSE) + colnames(df_tofilter)[1] <- "ID" + + df_ref <- read.table("$table_ref", header = TRUE, sep = "\t", stringsAsFactors = FALSE, skip = 1, comment.char = "") + colnames(df_ref)[1] <- "ID" + + df_merged <- merge(df_tofilter, df_ref, by="ID", all.x=FALSE, all.y=TRUE) + write.table(df_merged, file = "reconstructed_merged.tsv", row.names=FALSE, sep="\t") + + df_filtered <- subset(df_tofilter, df_tofilter\$ID %in% df_ref\$ID) + write.table(df_filtered, file = "reconstructed_taxonomy.tsv", row.names=FALSE, sep="\t") + + writeLines(c("\\"${task.process}\\":", paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")) ), "versions.yml") + """ +} diff --git a/modules/local/sidle_in.nf b/modules/local/sidle_in.nf new file mode 100644 index 00000000..4eddef41 --- /dev/null +++ b/modules/local/sidle_in.nf @@ -0,0 +1,48 @@ +process SIDLE_IN { + tag "$meta.region" + label 'process_single' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + tuple val(meta), path(table), path(seq) + + output: + tuple val(meta), path("*_table.qza"), path("*_rep-seqs.qza"), emit: table_seq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.region}" + """ + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + # seq + qiime tools import \\ + --input-path "$seq" \\ + --type 'FeatureData[Sequence]' \\ + --output-path ${prefix}_rep-seqs.qza + + # table + biom convert -i "$table" -o table.biom --table-type="OTU table" --to-hdf5 + qiime tools import \\ + --input-path table.biom \\ + --type 'FeatureTable[Frequency]' \\ + --input-format BIOMV210Format \\ + --output-path ${prefix}_table.qza + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_indb.nf b/modules/local/sidle_indb.nf new file mode 100644 index 00000000..f91a21ac --- /dev/null +++ b/modules/local/sidle_indb.nf @@ -0,0 +1,46 @@ +process SIDLE_INDB { + label 'process_single' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + path(seq) + path(tax) + + output: + path("db_sequences.qza"), emit: seq + path("db_taxonomy.qza") , emit: tax + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + # db_seq + qiime tools import \\ + --input-path $seq \\ + --output-path db_sequences.qza \\ + --type 'FeatureData[Sequence]' + + # db_tax + qiime tools import \\ + --input-path $tax \\ + --output-path db_taxonomy.qza \\ + --type 'FeatureData[Taxonomy]' \\ + --input-format HeaderlessTSVTaxonomyFormat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_indbaligned.nf b/modules/local/sidle_indbaligned.nf new file mode 100644 index 00000000..4029d697 --- /dev/null +++ b/modules/local/sidle_indbaligned.nf @@ -0,0 +1,38 @@ +process SIDLE_INDBALIGNED { + label 'process_single' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + path(seq) + + output: + path("db_alignedsequences.qza"), emit: seq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + def args = task.ext.args ?: '' + """ + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + # db_seq + qiime tools import \\ + --input-path $seq \\ + --output-path db_alignedsequences.qza \\ + --type 'FeatureData[AlignedSequence]' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_seqrecon.nf b/modules/local/sidle_seqrecon.nf new file mode 100644 index 00000000..38fa742d --- /dev/null +++ b/modules/local/sidle_seqrecon.nf @@ -0,0 +1,61 @@ +process SIDLE_SEQRECON { + label 'process_medium' + label 'single_cpu' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + path(reconstruction_map) + path(reconstruction_summary) + path(db_aligned_sequences) + + output: + path("reconstruction_fragments.qza") , emit: qza + path("reconstruction_fragments/*") , emit: visualisation + path("reconstructed_fragments.fasta"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + #https://q2-sidle.readthedocs.io/en/latest/reconstruction.html#reconstructing-the-phylogenetic-tree + #https://forum.qiime2.org/t/sidle-tutorial-missing-aligned-sequence-file/20604/4 for db_aligned_sequences + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + #CPU=1 + qiime sidle reconstruct-fragment-rep-seqs \\ + --i-reconstruction-map ${reconstruction_map} \\ + --i-reconstruction-summary ${reconstruction_summary} \\ + --i-aligned-sequences ${db_aligned_sequences} \\ + --o-representative-fragments reconstruction_fragments.qza + + #export visualisation + qiime metadata tabulate \\ + --m-input-file reconstruction_fragments.qza \\ + --o-visualization reconstruction_fragments.qzv + qiime tools export \\ + --input-path reconstruction_fragments.qzv \\ + --output-path "reconstruction_fragments" + + #export fasta file + qiime tools export \\ + --input-path reconstruction_fragments.qza \\ + --output-path exported + cp exported/dna-sequences.fasta reconstructed_fragments.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' ) + q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_tablerecon.nf b/modules/local/sidle_tablerecon.nf new file mode 100644 index 00000000..fc6ba9ee --- /dev/null +++ b/modules/local/sidle_tablerecon.nf @@ -0,0 +1,74 @@ +process SIDLE_TABLERECON { + label 'process_medium' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + val(metaid) + path(table) + path(aligned_map) + path(reconstruction_map) + path(reconstruction_summary) + + output: + path("reconstruction_table.qza") , emit: qza + path("reconstruction_table/*") , emit: exported + path("reconstructed_feature-table.biom"), emit: biom + path("reconstructed_feature-table.tsv") , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile 
conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead."
+    }
+    def args = task.ext.args ?: ''
+    def region_input = ""
+    // sort the input so that regions are paired in a consistent order across channels (sorted by region)
+    def df = [metaid, aligned_map, table].transpose().sort{ it[0] }
+    for (i in df) {
+        region_input += " --p-region "+i[0]+" --i-regional-alignment "+i[1]+" --i-regional-table "+i[2]
+    }
+    """
+    #https://q2-sidle.readthedocs.io/en/latest/reconstruction.html#table-reconstruction
+    export XDG_CONFIG_HOME="./xdgconfig"
+    export MPLCONFIGDIR="./mplconfigdir"
+    export NUMBA_CACHE_DIR="./numbacache"
+
+    qiime sidle reconstruct-counts \\
+        --p-n-workers $task.cpus \\
+        $region_input \\
+        --i-database-map $reconstruction_map \\
+        --i-database-summary $reconstruction_summary \\
+        $args \\
+        --o-reconstructed-table reconstruction_table.qza
+
+    #export visualisation
+    qiime feature-table summarize \\
+        --i-table reconstruction_table.qza \\
+        --o-visualization reconstruction_table.qzv
+    qiime tools export \\
+        --input-path reconstruction_table.qzv \\
+        --output-path "reconstruction_table"
+
+    #export feature table in biom and tsv format
+    qiime tools export \\
+        --input-path reconstruction_table.qza \\
+        --output-path exported
+    biom convert \\
+        -i exported/feature-table.biom \\
+        -o reconstructed_feature-table.tsv \\
+        --to-tsv
+    cp exported/feature-table.biom reconstructed_feature-table.biom
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        qiime2: \$( qiime --version | sed '1!d;s/.* //' )
+        qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' )
+        q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/sidle_taxrecon.nf b/modules/local/sidle_taxrecon.nf
new file mode 100644
index 00000000..5503cf65
--- /dev/null
+++ b/modules/local/sidle_taxrecon.nf
@@ -0,0 +1,60 @@
+process SIDLE_TAXRECON {
+    label 'process_single'
+
+    container 'docker.io/d4straub/pipesidle:0.1.0-beta'
+
+    input:
+    path(reconstruction_map)
+    path(tax)
+
+    output:
+    path("reconstruction_taxonomy.qza"), emit: qza
+    path("reconstruction_taxonomy/*")  , emit: visualisation
+    path("reconstruction_taxonomy.tsv"), emit: tsv
+    path "versions.yml"                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead."
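+        // reconstruct-taxonomy (below) combines the per-region database hits recorded in the
+        // reconstruction map with the full database taxonomy imported by SIDLE_INDB, yielding
+        // one taxonomy string per reconstructed feature.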
+ } + def args = task.ext.args ?: '' + """ + #https://q2-sidle.readthedocs.io/en/latest/reconstruction.html#taxonomic-reconstruction + #https://forum.qiime2.org/t/sidle-reconstruct-database/25439 + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + #CPU=1 + qiime sidle reconstruct-taxonomy \\ + --i-reconstruction-map ${reconstruction_map} \\ + --i-taxonomy ${tax} \\ + $args \\ + --o-reconstructed-taxonomy reconstruction_taxonomy.qza + + #export visualisation + qiime metadata tabulate \\ + --m-input-file reconstruction_taxonomy.qza \\ + --o-visualization reconstruction_taxonomy.qzv + qiime tools export \\ + --input-path reconstruction_taxonomy.qzv \\ + --output-path "reconstruction_taxonomy" + + #export taxonomic tsv + qiime tools export \\ + --input-path reconstruction_taxonomy.qza \\ + --output-path exported + cp exported/taxonomy.tsv reconstruction_taxonomy.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' ) + q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' ) + END_VERSIONS + """ +} diff --git a/modules/local/sidle_treerecon.nf b/modules/local/sidle_treerecon.nf new file mode 100644 index 00000000..91838e3e --- /dev/null +++ b/modules/local/sidle_treerecon.nf @@ -0,0 +1,51 @@ +process SIDLE_TREERECON { + label 'process_medium' + + container 'docker.io/d4straub/pipesidle:0.1.0-beta' + + input: + path(reconstruction_fragments) + path(ref_db_tree) + + output: + path("reconstructed_tree.qza") , emit: qza + path("reconstruction_placements.qza"), emit: qza_placements + path("reconstructed_tree.nwk") , emit: nwk + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead." 
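+        // The tree is built below by inserting the reconstructed fragments into a reference tree
+        // via SEPP (qiime fragment-insertion sepp); ref_db_tree must be the matching SEPP reference .qza.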
+    }
+    """
+    # https://q2-sidle.readthedocs.io/en/latest/reconstruction.html#reconstructing-the-phylogenetic-tree
+    # required: SEPP file https://forum.qiime2.org/t/sidle-tutorial-missing-aligned-sequence-file/20604/8
+    # SEPP file only available for Greengenes 13_8 or SILVA 128 (not 138!): https://forum.qiime2.org/t/error-in-reconstructing-the-phylogenetic-tree/23757/8
+    export XDG_CONFIG_HOME="./xdgconfig"
+    export MPLCONFIGDIR="./mplconfigdir"
+    export NUMBA_CACHE_DIR="./numbacache"
+
+    qiime fragment-insertion sepp \\
+        --p-threads $task.cpus \\
+        --i-representative-sequences $reconstruction_fragments \\
+        --i-reference-database $ref_db_tree \\
+        --o-tree reconstructed_tree.qza \\
+        --o-placements reconstruction_placements.qza
+
+    #export tree file
+    qiime tools export \\
+        --input-path reconstructed_tree.qza \\
+        --output-path exported
+    cp exported/tree.nwk reconstructed_tree.nwk
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        qiime2: \$( qiime --version | sed '1!d;s/.* //' )
+        q2-fragment-insertion: \$( qiime fragment-insertion --version | sed 's/.*version //' | sed 's/)//' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/sidle_trim.nf b/modules/local/sidle_trim.nf
new file mode 100644
index 00000000..7727f837
--- /dev/null
+++ b/modules/local/sidle_trim.nf
@@ -0,0 +1,49 @@
+process SIDLE_TRIM {
+    tag "$meta.region,$meta.region_length"
+    label 'process_single'
+
+    container 'docker.io/d4straub/pipesidle:0.1.0-beta'
+
+    input:
+    tuple val(meta), path(table), path(seq)
+
+    output:
+    tuple val(meta), path("*_table.qza")   , emit: table
+    tuple val(meta), path("*_rep-seqs.qza"), emit: seq
+    path "versions.yml"                    , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Exit if running this module with -profile conda / -profile mamba
+    if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
+        error "Sidle in QIIME2 does not support Conda. Please use Docker / Singularity / Podman instead."
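+        // trim-dada2-posthoc (below) truncates every ASV of this region to meta.region_length so
+        // that all sequences of a region share one length; e.g. an illustrative region 'region1'
+        // with length 145 would yield region1_145_table.qza and region1_145_rep-seqs.qza.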
+ } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.region}" + def primerfw = "${meta.fw_primer}" + def primerrv = "${meta.rv_primer}" + def length = "${meta.region_length}" + """ + # https://q2-sidle.readthedocs.io/en/latest/read_preparation.html#dada2 + export XDG_CONFIG_HOME="./xdgconfig" + export MPLCONFIGDIR="./mplconfigdir" + export NUMBA_CACHE_DIR="./numbacache" + + #CPU=1 + qiime sidle trim-dada2-posthoc \\ + --i-table ${table} \\ + --i-representative-sequences ${seq} \\ + --p-trim-length $length \\ + --o-trimmed-table ${prefix}_${length}_table.qza \\ + --o-trimmed-representative-sequences ${prefix}_${length}_rep-seqs.qza + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + qiime2: \$( qiime --version | sed '1!d;s/.* //' ) + qiime2 plugin sidle: \$( qiime sidle --version | sed 's/ (.*//' | sed 's/.*version //' ) + q2-sidle: \$( qiime sidle --version | sed 's/.*version //' | sed 's/)//' ) + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index f9a659c0..66ceda10 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,7 @@ params { input = null input_fasta = null input_folder = null + multiregion = null extension = "/*_R{1,2}_001.fastq.gz" pacbio = false iontorrent = false @@ -117,6 +118,9 @@ params { kraken2_assign_taxlevels = null kraken2_ref_tax_custom = null kraken2_confidence = 0.0 + sidle_ref_taxonomy = null + sidle_ref_tax_custom = null + sidle_ref_tree_custom = null // MultiQC options multiqc_config = null @@ -154,7 +158,7 @@ params { // Schema validation default options validationFailUnrecognisedParams = false validationLenientMode = false - validationSchemaIgnoreParams = 'dada_ref_databases,qiime_ref_databases,sintax_ref_databases,kraken2_ref_databases,genomes,igenomes_base' + validationSchemaIgnoreParams = 'dada_ref_databases,qiime_ref_databases,sintax_ref_databases,kraken2_ref_databases,sidle_ref_databases,genomes,igenomes_base' validationShowHiddenParams = false validate_params = true @@ -283,6 +287,7 @@ profiles { test_novaseq { includeConfig 'conf/test_novaseq.config' } test_pplace { includeConfig 'conf/test_pplace.config' } test_sintax { includeConfig 'conf/test_sintax.config' } + test_multiregion { includeConfig 'conf/test_multiregion.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index efa73e92..c598d233 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -51,6 +51,14 @@ "help_text": "This is optional, but for performing downstream analysis such as barplots, diversity indices or differential abundance testing, a metadata file is essential.\n\nRelated parameter:\n- `--metadata_category` (optional) to choose columns that are used for testing significance\n\nFor example:\n\n```bash\n--metadata \"path/to/metadata.tsv\"\n```\n\nPlease note the following requirements:\n\n1. The path must be enclosed in quotes\n2. The metadata file has to follow the QIIME2 specifications (https://docs.qiime2.org/2021.2/tutorials/metadata/)\n\nThe first column in the tab-separated metadata file is the sample identifier column (required header: `ID`) and defines the sample or feature IDs associated with your study. 
In addition to the sample identifier column, the metadata file is required to have at least one column with multiple different non-numeric values but not all unique.\n**NB**: without additional columns there might be no groupings for the downstream analyses.\n\nSample identifiers should be 36 characters long or less, and also contain only ASCII alphanumeric characters (i.e. in the range of [a-z], [A-Z], or [0-9]), or the dash (-) character. For downstream analysis, by default all numeric columns, blanks or NA are removed, and only columns with multiple different values but not all unique are selected.\n\nThe columns which are to be assessed can be specified by `--metadata_category`. If `--metadata_category` isn't specified than all columns that fit the specification are automatically chosen.",
            "fa_icon": "fas fa-file-csv"
        },
+        "multiregion": {
+            "type": "string",
+            "mimetype": "text/tsv",
+            "fa_icon": "fas fa-dna",
+            "description": "Path to tab-separated multi-region sheet",
+            "help_text": "Path to multi-region sheet, either tab-separated (.tsv), comma-separated (.csv), or in YAML format (.yml/.yaml).\n\nChoose a reference taxonomy with `--sidle_ref_taxonomy` or use `--sidle_ref_tax_custom` and `--sidle_ref_tree_custom`.",
+            "schema": "assets/schema_multiregion.json"
+        },
         "outdir": {
             "type": "string",
             "format": "directory-path",
@@ -332,7 +340,7 @@
         "taxonomic_database": {
             "title": "Taxonomic database",
             "type": "object",
-            "default": "",
+            "description": "Choose a method and database for taxonomic assignments to single-region amplicons",
             "properties": {
                 "dada_ref_taxonomy": {
                     "type": "string",
@@ -533,6 +541,29 @@
             },
             "fa_icon": "fas fa-database"
         },
+        "multiregion_taxonomic_database": {
+            "title": "Multi-region taxonomic database",
+            "type": "object",
+            "description": "Choose database for taxonomic assignments with multi-region amplicons using SIDLE",
+            "properties": {
+                "sidle_ref_taxonomy": {
+                    "type": "string",
+                    "help_text": "",
+                    "description": "Name of supported database, and optionally also version number",
+                    "enum": ["silva", "silva=128", "greengenes", "greengenes=13_8", "greengenes88"]
+                },
+                "sidle_ref_tax_custom": {
+                    "type": "string",
+                    "help_text": "Consider also setting `--sidle_ref_tree_custom`. Example usage: `--sidle_ref_tax_custom 'rep_set_99.fasta,rep_set_aligned_99.fasta,taxonomy_99_taxonomy.txt'`",
+                    "description": "Comma-separated paths to three files: reference taxonomy sequences (*.fasta), aligned reference taxonomy sequences (*.fasta), reference taxonomy strings (*.txt)"
+                },
+                "sidle_ref_tree_custom": {
+                    "type": "string",
+                    "help_text": "Overwrites tree chosen by `--sidle_ref_taxonomy`",
+                    "description": "Path to SIDLE reference taxonomy tree (*.qza)"
+                }
+            }
+        },
         "asv_filtering": {
             "title": "ASV filtering",
             "type": "object",
diff --git a/subworkflows/local/dada2_preprocessing.nf b/subworkflows/local/dada2_preprocessing.nf
index b79b7035..0715a621 100644
--- a/subworkflows/local/dada2_preprocessing.nf
+++ b/subworkflows/local/dada2_preprocessing.nf
@@ -136,41 +136,31 @@ workflow DADA2_PREPROCESSING {
         ch_DADA2_QUALITY2_SVG = DADA2_QUALITY2.out.svg
     }
 
-    //group reads by sequencing run
+    // group reads by sequencing run and region
     // 'groupTuple', 'size' or 'groupKey' should be used but to produce it we need to know how many elements to group but some can be lost here, so no way knowing before
     ch_dada2_filtntrim_reads_passed
         .map {
             info, reads ->
-                def meta = [:]
-                meta.run = info.run
-                meta.single_end = info.single_end
-                [ meta, reads, info.id ] }
+                def meta = info.subMap( info.keySet() - 'id' - 'sample' )
+                [ meta, reads, info.id, info.sample ] }
         .groupTuple(by: 0 )
         .map {
-            info, reads, ids ->
-                def meta = [:]
-                meta.run = info.run
-                meta.single_end = info.single_end
-                meta.id = ids.flatten().sort()
+            info, reads, ids, samples ->
+                def meta = info + [id: ids.flatten().sort(), sample: samples.flatten().sort()]
                 [ meta, reads.flatten().sort() ] }
         .set { ch_filt_reads }
 
-    //group logs by sequencing run
+    //group logs by sequencing run and region
     //for 'groupTuple', 'size' or 'groupKey' should be used but to produce it we need to know how many elements to group but some can be lost here, so no way knowing before
     ch_dada2_filtntrim_logs_passed
         .map {
             info, reads ->
-                def meta = [:]
-                meta.run = info.run
-                meta.single_end = info.single_end
-                [ meta, reads, info.id ] }
+                def meta = info.subMap( info.keySet() - 'id' - 'sample' )
+                [ meta, reads, info.id, info.sample ] }
        .groupTuple(by: 0 )
        .map {
-            info, reads, ids ->
-                def meta = [:]
-                meta.run = info.run
-                meta.single_end = info.single_end
-                meta.id = ids.flatten().sort()
+            info, reads, ids, samples ->
+                def meta = info + [id: ids.flatten().sort(), sample: samples.flatten().sort()]
                 [ meta, reads.flatten().sort() ] }
        .set { ch_filt_logs }
diff --git a/subworkflows/local/dada2_taxonomy_wf.nf b/subworkflows/local/dada2_taxonomy_wf.nf
index 9aa64145..52c1760d 100644
--- a/subworkflows/local/dada2_taxonomy_wf.nf
+++ b/subworkflows/local/dada2_taxonomy_wf.nf
@@ -38,6 +38,8 @@ workflow DADA2_TAXONOMY_WF {
                 def meta = [:]
                 meta.single_end = true
                 meta.id = "assignTaxonomy"
+                meta.fw_primer = params.FW_primer
+                meta.rv_primer_revcomp = WorkflowAmpliseq.makeComplement ( "${params.RV_primer}".reverse() )
                 [ meta, db ] }
             .set { ch_assigntax }
         CUTADAPT_TAXONOMY ( ch_assigntax ).reads
diff --git a/subworkflows/local/parse_input.nf b/subworkflows/local/parse_input.nf
index ae134ae9..56658a53 100644
--- a/subworkflows/local/parse_input.nf
+++ b/subworkflows/local/parse_input.nf
@@ -23,7 +23,7 @@ workflow PARSE_INPUT {
                 .ifEmpty { error("${error_message}") }
                 .map { read ->
                     def meta = [:]
-                    meta.id = read.baseName.toString().indexOf("_") != -1 ? read.baseName.toString().take(read.baseName.toString().indexOf("_")) : read.baseName
+                    meta.sample = read.baseName.toString().indexOf("_") != -1 ?
read.baseName.toString().take(read.baseName.toString().indexOf("_")) : read.baseName meta.single_end = single_end.toBoolean() meta.run = multiple_sequencing_runs ? read.take(read.findLastIndexOf{"/"})[-1] : "1" [ meta, read ] } @@ -35,7 +35,7 @@ workflow PARSE_INPUT { .ifEmpty { error("${error_message}") } .map { name, reads -> def meta = [:] - meta.id = name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name + meta.sample = name.toString().indexOf("_") != -1 ? name.toString().take(name.toString().indexOf("_")) : name meta.single_end = single_end.toBoolean() meta.run = multiple_sequencing_runs ? reads[0].take(reads[0].findLastIndexOf{"/"})[-1] : "1" [ meta, reads ] } @@ -59,9 +59,9 @@ workflow PARSE_INPUT { .subscribe { if ( it == 1 ) error("Found only one folder with read data but \"--multiple_sequencing_runs\" was specified. Please review data input.") } } - //Check whether all sampleID = meta.id are unique + //Check whether all sampleID = meta.sample are unique ch_reads - .map { meta, reads -> [ meta.id ] } + .map { meta, reads -> [ meta.sample ] } .toList() .subscribe { if( it.size() != it.unique().size() ) { @@ -72,12 +72,12 @@ workflow PARSE_INPUT { //Check that no dots "." are in sampleID ch_reads - .map { meta, reads -> meta.id } + .map { meta, reads -> meta.sample } .subscribe { if ( "$it".contains(".") ) error("Please review data input, sampleIDs may not contain dots, but \"$it\" does.") } //Check that sampleIDs do not start with a number when using metadata (sampleID gets X prepended by R and metadata wont match any more!) ch_reads - .map { meta, reads -> meta.id } + .map { meta, reads -> meta.sample } .subscribe { if ( params.metadata && "$it"[0].isNumber() ) error("Please review data input, sampleIDs may not start with a number, but \"$it\" does. 
The pipeline unintentionally modifies such strings and the metadata will not match any more.") }
 
     emit:
diff --git a/subworkflows/local/sidle_wf.nf b/subworkflows/local/sidle_wf.nf
new file mode 100644
index 00000000..9b091b3e
--- /dev/null
+++ b/subworkflows/local/sidle_wf.nf
@@ -0,0 +1,132 @@
+/*
+ * Multi-region analysis with SIDLE (q2-sidle) in QIIME2
+ */
+
+include { FORMAT_TAXONOMY_SIDLE } from '../../modules/local/format_taxonomy_sidle'
+include { SIDLE_INDB            } from '../../modules/local/sidle_indb'
+include { SIDLE_INDBALIGNED     } from '../../modules/local/sidle_indbaligned'
+include { SIDLE_DBFILT          } from '../../modules/local/sidle_dbfilt'
+include { SIDLE_IN              } from '../../modules/local/sidle_in'
+include { SIDLE_TRIM            } from '../../modules/local/sidle_trim'
+include { SIDLE_DBEXTRACT       } from '../../modules/local/sidle_dbextract'
+include { SIDLE_ALIGN           } from '../../modules/local/sidle_align'
+include { SIDLE_DBRECON         } from '../../modules/local/sidle_dbrecon'
+include { SIDLE_TABLERECON      } from '../../modules/local/sidle_tablerecon'
+include { SIDLE_TAXRECON        } from '../../modules/local/sidle_taxrecon'
+include { SIDLE_FILTTAX         } from '../../modules/local/sidle_filttax'
+include { SIDLE_SEQRECON        } from '../../modules/local/sidle_seqrecon'
+include { SIDLE_TREERECON       } from '../../modules/local/sidle_treerecon'
+
+workflow SIDLE_WF {
+    take:
+    ch_asv_tables_sequences
+    ch_sidle_ref_taxonomy
+    val_sidle_ref_taxonomy
+    ch_db_tree
+
+    main:
+    ch_sidle_versions = Channel.empty()
+
+    // DB
+    if (!params.sidle_ref_tax_custom) {
+        //standard ref taxonomy input from conf/ref_databases.config, one tar.gz / tgz with all files
+        FORMAT_TAXONOMY_SIDLE ( ch_sidle_ref_taxonomy, val_sidle_ref_taxonomy )
+        ch_db_sequences = FORMAT_TAXONOMY_SIDLE.out.seq
+        ch_db_alignedsequences = FORMAT_TAXONOMY_SIDLE.out.alnseq
+        ch_db_taxonomy = FORMAT_TAXONOMY_SIDLE.out.tax
+    } else {
+        //input from params.sidle_ref_tax_custom: it[0] = fasta = ch_db_sequences, it[1] = aligned fasta = ch_db_alignedsequences, it[2] = taxonomy txt = ch_db_taxonomy
+        ch_db_sequences = ch_sidle_ref_taxonomy.map{ it[0] }
+        ch_db_alignedsequences = ch_sidle_ref_taxonomy.map{ it[1] }
+        ch_db_taxonomy = ch_sidle_ref_taxonomy.map{ it[2] }
+    }
+    SIDLE_INDB ( ch_db_sequences, ch_db_taxonomy )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_INDB.out.versions)
+    SIDLE_INDBALIGNED ( ch_db_alignedsequences )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_INDBALIGNED.out.versions)
+    SIDLE_DBFILT ( SIDLE_INDB.out.seq, SIDLE_INDB.out.tax )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_DBFILT.out.versions)
+
+    // ASV
+    SIDLE_IN ( ch_asv_tables_sequences )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_IN.out.versions)
+    SIDLE_TRIM ( SIDLE_IN.out.table_seq )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_TRIM.out.versions)
+
+    // Combine & reconstruct
+    SIDLE_DBEXTRACT (
+        SIDLE_IN.out.table_seq
+            .combine( SIDLE_DBFILT.out.seq )
+            .combine( SIDLE_DBFILT.out.tax ) )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_DBEXTRACT.out.versions)
+
+    SIDLE_ALIGN ( SIDLE_DBEXTRACT.out.kmers.join(SIDLE_TRIM.out.seq).dump(tag: 'into_SIDLE_ALIGN') )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_ALIGN.out.versions)
+
+    SIDLE_DBEXTRACT.out.map
+        .join(SIDLE_ALIGN.out.aligned_map)
+        .multiMap { meta, map, aligned_map ->
+            sampleid: meta.id
+            map: map
+            aligned_map: aligned_map
+        }
+        .set { ch_db_reconstruction }
+
+    SIDLE_DBRECON (
+        ch_db_reconstruction.sampleid.collect(),
+        ch_db_reconstruction.map.collect(),
+        ch_db_reconstruction.aligned_map.collect() )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_DBRECON.out.versions)
+
+    SIDLE_TRIM.out.table
+        .join(SIDLE_ALIGN.out.aligned_map)
+        .multiMap { meta, table, aligned_map ->
+            sampleid: meta.id
+            table: table
+            aligned_map: aligned_map
+        }
+        .set { ch_table_reconstruction }
+
+    // Abundance table
+    SIDLE_TABLERECON (
+        ch_table_reconstruction.sampleid.collect(),
+        ch_table_reconstruction.table.collect(),
+        ch_table_reconstruction.aligned_map.collect(),
+        SIDLE_DBRECON.out.reconstruction_map,
+        SIDLE_DBRECON.out.reconstruction_summary )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_TABLERECON.out.versions)
+
+    // Taxonomic classification
+    SIDLE_TAXRECON (
+        SIDLE_DBRECON.out.reconstruction_map,
+        SIDLE_INDB.out.tax )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_TAXRECON.out.versions)
+    SIDLE_FILTTAX ( SIDLE_TAXRECON.out.tsv, SIDLE_TABLERECON.out.tsv )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_FILTTAX.out.versions)
+
+    // Reconstruct sequences/fragments
+    // required: aligned sequences file: https://forum.qiime2.org/t/finding-alignment-files-for-sidle/23773/2
+    SIDLE_SEQRECON (
+        SIDLE_DBRECON.out.reconstruction_map,
+        SIDLE_DBRECON.out.reconstruction_summary,
+        SIDLE_INDBALIGNED.out.seq )
+    // "The output of reconstruct-fragment-rep-seqs provides consensus sequences only if a reference sequence can't be resolved (ids that have a | symbol in them.) It's designed specifically to integrate with the fragment insertion and makes some downstream assumptions, including that you have the same database and insertion tree version.", see https://forum.qiime2.org/t/how-to-merge-q2-sidle-output-with-other-results/22823/2
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_SEQRECON.out.versions)
+
+    // Reconstruct phylogenetic tree
+    SIDLE_TREERECON (
+        SIDLE_SEQRECON.out.qza,
+        ch_db_tree )
+    ch_sidle_versions = ch_sidle_versions.mix(SIDLE_TREERECON.out.versions)
+
+    emit:
+    tax_qza        = SIDLE_TAXRECON.out.qza
+    tax_tsv        = SIDLE_FILTTAX.out.filtered
+    tax_tsv_merged = SIDLE_FILTTAX.out.merged
+    table_biom     = SIDLE_TABLERECON.out.biom
+    table_qza      = SIDLE_TABLERECON.out.qza
+    table_tsv      = SIDLE_TABLERECON.out.tsv
+    tree_nwk       = SIDLE_TREERECON.out.nwk
+    tree_qza       = SIDLE_TREERECON.out.qza
+    versions       = ch_sidle_versions
+}
diff --git a/tests/pipeline/multiregion.nf.test b/tests/pipeline/multiregion.nf.test
new file mode 100644
index 00000000..2a75cf2d
--- /dev/null
+++ b/tests/pipeline/multiregion.nf.test
@@ -0,0 +1,48 @@
+nextflow_pipeline {
+
+    name "Test Workflow main.nf"
+    script "main.nf"
+    tag "test_multiregion"
+    tag "pipeline"
+
+    test("Multiregion") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(path("$outputDir/input/samplesheet_multiregion.tsv"),
+                                  path("$outputDir/input/metadata_multiregion.tsv"),
+                                  path("$outputDir/input/regions_multiregion.tsv")).match("input") },
+                { assert snapshot(path("$outputDir/overall_summary.tsv")).match("overall_summary_tsv") },
+                { assert snapshot(path("$outputDir/cutadapt/cutadapt_summary.tsv")).match("cutadapt") },
+                { assert snapshot(path("$outputDir/dada2/ASV_seqs.fasta"),
+                                  path("$outputDir/dada2/ASV_table.tsv"),
+                                  path("$outputDir/dada2/DADA2_stats.tsv"),
+                                  path("$outputDir/dada2/DADA2_table.rds"),
+                                  path("$outputDir/dada2/DADA2_table.tsv")).match("dada2") },
+                { assert new File("$outputDir/sidle/per_region/ASV_seqs_region1_TGGCGAACGGGTGAGTAA_CCGTGTCTCAGTCCCARTG.fasta").exists() },
+                { assert new File("$outputDir/sidle/per_region/ASV_table_region1_TGGCGAACGGGTGAGTAA_CCGTGTCTCAGTCCCARTG.tsv").exists() },
+                { assert new File("$outputDir/sidle/reconstructed/reconstructed_feature-table.tsv").exists() },
+                { assert new File("$outputDir/sidle/reconstructed/reconstructed_merged.tsv").exists() },
+                { assert new File("$outputDir/sidle/reconstructed/reconstructed_taxonomy.tsv").exists() },
+                { assert new File("$outputDir/qiime2/input/table.qza").exists() },
+                { assert new File("$outputDir/qiime2/rel_abundance_tables/rel-table-2.tsv").exists() },
+                { assert new File("$outputDir/qiime2/rel_abundance_tables/rel-table-3.tsv").exists() },
+                { assert new File("$outputDir/qiime2/rel_abundance_tables/rel-table-ASV.tsv").exists() },
+                { assert new File("$outputDir/qiime2/abundance_tables/count_table_filter_stats.tsv").exists() },
+                { assert new File("$outputDir/qiime2/abundance_tables/filtered-table.qza").exists() },
+                { assert new File("$outputDir/qiime2/barplot/index.html").exists() },
+                { assert new File("$outputDir/qiime2/ancom/Category-treatment-ASV/index.html").exists() },
+                { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
+                                  path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
+                                  path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }
+            )
+        }
+    }
+}
diff --git a/tests/pipeline/multiregion.nf.test.snap b/tests/pipeline/multiregion.nf.test.snap
new file mode 100644
index 00000000..69133adb
--- /dev/null
+++ b/tests/pipeline/multiregion.nf.test.snap
@@ -0,0 +1,40 @@
+{
+    "input": {
+        "content": [
+            "samplesheet_multiregion.tsv:md5,b5b34bd6bb19f5a130b2328811e03bb8",
+            "metadata_multiregion.tsv:md5,e2671821f0cf64f15a50d3ddac8f20d1",
+            "regions_multiregion.tsv:md5,d0ca51b297f896e7b397509497e8264c"
+        ],
+        "timestamp": "2024-02-23T00:00:00+0000"
+    },
+    "cutadapt": {
+        "content": [
+            "cutadapt_summary.tsv:md5,1ad83d6d36ac9177d89af4e5caa7dedc"
+        ],
+        "timestamp": "2024-02-23T00:00:00+0000"
+    },
+    "overall_summary_tsv": {
+        "content": [
+            "overall_summary.tsv:md5,38ffc548e8d7c0da5ed09c290131f0b5"
+        ],
+        "timestamp": "2024-02-23T00:00:00+0000"
+    },
+    "dada2": {
+        "content": [
+            "ASV_seqs.fasta:md5,485d6c6f8c82c17fbd679b2a9032a541",
+            "ASV_table.tsv:md5,336bd44a6ff10c638ab95f7488030baa",
+            "DADA2_stats.tsv:md5,e56236955a435abbfdc92d514c9c5443",
+            "DADA2_table.rds:md5,6358376bb6f42ed782cda2be1564d3d8",
+            "DADA2_table.tsv:md5,f14f4e62f9898e6a0755e2d1621bcc3b"
+        ],
+        "timestamp": "2024-02-23T00:00:00+0000"
+    },
+    "multiqc": {
+        "content": [
+            "multiqc_fastqc.txt:md5,9468ae91af1a841c5e1369f11f704604",
+            "multiqc_general_stats.txt:md5,92c968d218577d47c2364849c07f7ab6",
+            "multiqc_cutadapt.txt:md5,d0a09fad9260bc83f85b43a0453d9208"
+        ],
+        "timestamp": "2024-02-23T00:00:00+0000"
+    }
+}
diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf
index 10039eda..2a8fae5b 100644
--- a/workflows/ampliseq.nf
+++ b/workflows/ampliseq.nf
@@ -42,6 +42,29 @@
 if (params.classifier) {
     ch_qiime_classifier = Channel.fromPath("${params.classifier}", checkIfExists: true)
 } else { ch_qiime_classifier = Channel.empty() }
 
+if (params.sidle_ref_tax_custom) {
+    if ("${params.sidle_ref_tax_custom}".contains(",")) {
+        sidle_ref_paths = "${params.sidle_ref_tax_custom}".split(",")
+        if (sidle_ref_paths.length != 3) {
+            error "--sidle_ref_tax_custom requires exactly three filepaths separated by a comma (fasta, aligned fasta, taxonomy). Please review input."
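+            // expected form (illustrative file names):
+            //   --sidle_ref_tax_custom 'rep_set.fasta,rep_set_aligned.fasta,taxonomy.txt'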
+        }
+        ch_sidle_ref_taxonomy = Channel.fromPath( Arrays.asList(sidle_ref_paths), checkIfExists: true )
+    } else {
+        error "--sidle_ref_tax_custom accepts exactly three filepaths separated by a comma. Please review input."
+    }
+    val_sidle_ref_taxonomy = "user"
+    ch_sidle_ref_taxonomy_tree = params.sidle_ref_tree_custom ? Channel.fromPath("${params.sidle_ref_tree_custom}", checkIfExists: true) : Channel.empty()
+} else if (params.sidle_ref_taxonomy) {
+    ch_sidle_ref_taxonomy = Channel.fromList( params.sidle_ref_databases[params.sidle_ref_taxonomy]["file"] ).map { file(it) }
+    ch_sidle_ref_taxonomy_tree = params.sidle_ref_tree_custom ? Channel.fromPath("${params.sidle_ref_tree_custom}", checkIfExists: true) :
+        params.sidle_ref_databases[params.sidle_ref_taxonomy]["tree_qza"] ? Channel.fromList( params.sidle_ref_databases[params.sidle_ref_taxonomy]["tree_qza"] ).map { file(it) } : Channel.empty()
+    val_sidle_ref_taxonomy = params.sidle_ref_taxonomy.replace('=','_').replace('.','_')
+} else {
+    ch_sidle_ref_taxonomy = Channel.empty()
+    ch_sidle_ref_taxonomy_tree = Channel.empty()
+    val_sidle_ref_taxonomy = "none"
+}
+
 if (params.dada_ref_tax_custom) {
     //custom ref taxonomy input from params.dada_ref_tax_custom & params.dada_ref_tax_custom_sp
     ch_assigntax = Channel.fromPath("${params.dada_ref_tax_custom}", checkIfExists: true)
@@ -154,7 +177,7 @@
 }
 
 //only run QIIME2 downstream analysis when taxonomy is actually calculated and all required data is available
-if ( !(workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) && !params.skip_taxonomy && !params.skip_qiime && !params.skip_qiime_downstream && (!params.skip_dada_taxonomy || params.sintax_ref_taxonomy || params.qiime_ref_taxonomy || params.qiime_ref_tax_custom || params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom) ) {
+if ( !(workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) && !params.skip_taxonomy && !params.skip_qiime && !params.skip_qiime_downstream && (!params.skip_dada_taxonomy || params.sintax_ref_taxonomy || params.qiime_ref_taxonomy || params.qiime_ref_tax_custom || params.kraken2_ref_taxonomy || params.kraken2_ref_tax_custom || params.multiregion) ) {
     run_qiime2 = true
 } else {
     run_qiime2 = false
@@ -178,6 +201,8 @@ include { DADA2_DENOISING               } from '../modules/local/dada2_denoising'
 include { DADA2_RMCHIMERA               } from '../modules/local/dada2_rmchimera'
 include { DADA2_STATS                   } from '../modules/local/dada2_stats'
 include { DADA2_MERGE                   } from '../modules/local/dada2_merge'
+include { DADA2_SPLITREGIONS            } from '../modules/local/dada2_splitregions'
+include { SIDLE_WF                      } from '../subworkflows/local/sidle_wf'
 include { BARRNAP                       } from '../modules/local/barrnap'
 include { BARRNAPSUMMARY                } from '../modules/local/barrnapsummary'
 include { FILTER_SSU                    } from '../modules/local/filter_ssu'
@@ -191,7 +216,8 @@ include { FORMAT_TAXONOMY               } from '../modules/local/format_taxonomy'
 include { ITSX_CUTASV                   } from '../modules/local/itsx_cutasv'
 include { MERGE_STATS as MERGE_STATS_STD} from '../modules/local/merge_stats'
 include { QIIME2_INSEQ                  } from '../modules/local/qiime2_inseq'
-include { QIIME2_FILTERTAXA             } from '../modules/local/qiime2_filtertaxa'
+include { QIIME2_TABLEFILTERTAXA        } from '../modules/local/qiime2_tablefiltertaxa'
+include { QIIME2_SEQFILTERTABLE         } from '../modules/local/qiime2_seqfiltertable'
 include { QIIME2_INASV                  } from '../modules/local/qiime2_inasv'
 include { QIIME2_INTREE                 } from
'../modules/local/qiime2_intree' include { FORMAT_PPLACETAX } from '../modules/local/format_pplacetax' @@ -281,6 +307,37 @@ workflow AMPLISEQ { error("One of `--input`, `--input_fasta`, `--input_folder` must be provided!") } + // + // Add primer info to sequencing files + // + if ( params.multiregion ) { + // is multiple region analysis + ch_input_reads + .combine( Channel.fromSamplesheet("multiregion") ) + .map{ info, reads, multi -> + def meta = info + multi + return [ meta, reads ] } + .map{ info, reads -> + def meta = info + + [id: info.sample+"_"+info.fw_primer+"_"+info.rv_primer] + + [fw_primer_revcomp: WorkflowAmpliseq.makeComplement(info.fw_primer.reverse())] + + [rv_primer_revcomp: WorkflowAmpliseq.makeComplement(info.rv_primer.reverse())] + return [ meta, reads ] } + .set { ch_input_reads } + } else { + // is single region + ch_input_reads + .map{ info, reads -> + def meta = info + + [region: null, region_length: null] + + [fw_primer: params.FW_primer, rv_primer: params.RV_primer] + + [id: info.sample] + + [fw_primer_revcomp: params.FW_primer ? WorkflowAmpliseq.makeComplement(params.FW_primer.reverse()) : null] + + [rv_primer_revcomp: params.RV_primer ? WorkflowAmpliseq.makeComplement(params.RV_primer.reverse()) : null] + return [ meta, reads ] } + .set { ch_input_reads } + } + //Filter empty files ch_input_reads.dump(tag:'ch_input_reads') .branch { @@ -386,11 +443,52 @@ workflow AMPLISEQ { ch_stats = DADA2_MERGE.out.dada2stats } + // + // SUBWORKFLOW / MODULES : Taxonomic classification with DADA2, SINTAX and/or QIIME2 + // + if ( params.multiregion ) { + // separate sequences and abundances when several regions + DADA2_SPLITREGIONS ( + //DADA2_DENOISING per run & region -> per run + ch_reads + .map { + info, reads -> + def meta = info.subMap( info.keySet() - 'id' - 'sample' - 'run' ) // All of 'id', 'sample', 'run' must be removed to merge by region + def inf2 = info.subMap( 'id', 'sample' )// May not contain false,true,null; only 'id', 'sample' required + [ meta, inf2 ] } + .groupTuple(by: 0 ).dump(tag:'DADA2_SPLITREGIONS:meta'), + DADA2_MERGE.out.dada2asv ) + ch_versions = ch_versions.mix(DADA2_SPLITREGIONS.out.versions) + + // run q2-sidle + SIDLE_WF ( + DADA2_SPLITREGIONS.out.for_sidle, + ch_sidle_ref_taxonomy.collect(), + val_sidle_ref_taxonomy, + ch_sidle_ref_taxonomy_tree + ) + ch_versions = ch_versions.mix(SIDLE_WF.out.versions) + + // forward results to downstream analysis if multi region + ch_dada2_asv = SIDLE_WF.out.table_tsv + ch_dada2_fasta = Channel.empty() + // Any ASV post-clustering param is not allowed: + // - solved by '!params.multiregion' for vsearch_cluster, filter_ssu, min_len_asv, max_len_asv, filter_codons + // - solved in 'lib/WorkflowAmpliseq.groovy': cut_its + // Must have params: + // - solved by '!params.multiregion' for skip_report + // - solved in 'lib/WorkflowAmpliseq.groovy': skip_dada_taxonomy + } else { + // forward results to downstream analysis if single region + ch_dada2_fasta = DADA2_MERGE.out.fasta + ch_dada2_asv = DADA2_MERGE.out.asv + } + // // MODULE : ASV post-clustering with VSEARCH // - if (params.vsearch_cluster) { - ch_fasta_for_clustering = DADA2_MERGE.out.fasta + if (params.vsearch_cluster && !params.multiregion) { + ch_fasta_for_clustering = ch_dada2_fasta .map { fasta -> def meta = [:] @@ -398,13 +496,10 @@ workflow AMPLISEQ { [ meta, fasta ] } VSEARCH_CLUSTER ( ch_fasta_for_clustering ) ch_versions = ch_versions.mix(VSEARCH_CLUSTER.out.versions.ifEmpty(null)) - FILTER_CLUSTERS ( VSEARCH_CLUSTER.out.clusters, 
DADA2_MERGE.out.asv ) + FILTER_CLUSTERS ( VSEARCH_CLUSTER.out.clusters, ch_dada2_asv ) ch_versions = ch_versions.mix(FILTER_CLUSTERS.out.versions.ifEmpty(null)) ch_dada2_fasta = FILTER_CLUSTERS.out.fasta ch_dada2_asv = FILTER_CLUSTERS.out.asv - } else { - ch_dada2_fasta = DADA2_MERGE.out.fasta - ch_dada2_asv = DADA2_MERGE.out.asv } // @@ -420,7 +515,7 @@ workflow AMPLISEQ { // // Modules : Filter rRNA // - if (!params.skip_barrnap && params.filter_ssu) { + if ( !params.skip_barrnap && params.filter_ssu && !params.multiregion ) { BARRNAP ( ch_unfiltered_fasta ) BARRNAPSUMMARY ( BARRNAP.out.gff.collect() ) BARRNAPSUMMARY.out.warning.subscribe { @@ -435,7 +530,7 @@ workflow AMPLISEQ { ch_stats = MERGE_STATS_FILTERSSU.out.tsv ch_dada2_fasta = FILTER_SSU.out.fasta ch_dada2_asv = FILTER_SSU.out.asv - } else if (!params.skip_barrnap && !params.filter_ssu) { + } else if ( !params.skip_barrnap && !params.filter_ssu && !params.multiregion ) { BARRNAP ( ch_unfiltered_fasta ) BARRNAPSUMMARY ( BARRNAP.out.gff.collect() ) BARRNAPSUMMARY.out.warning.subscribe { if ( it.baseName.toString().startsWith("WARNING") ) log.warn "Barrnap could not identify any rRNA in the ASV sequences. We recommended to use the --skip_barrnap option for these sequences." } @@ -450,7 +545,7 @@ workflow AMPLISEQ { // // Modules : amplicon length filtering // - if (params.min_len_asv || params.max_len_asv) { + if ( (params.min_len_asv || params.max_len_asv) && !params.multiregion ) { FILTER_LEN_ASV ( ch_dada2_fasta, ch_dada2_asv.ifEmpty( [] ) ) ch_versions = ch_versions.mix(FILTER_LEN_ASV.out.versions.ifEmpty(null)) MERGE_STATS_FILTERLENASV ( ch_stats, FILTER_LEN_ASV.out.stats ) @@ -464,7 +559,7 @@ workflow AMPLISEQ { // // Modules : Filtering based on codons in an open reading frame // - if (params.filter_codons ) { + if ( params.filter_codons && !params.multiregion ) { FILTER_CODONS ( ch_dada2_fasta, ch_dada2_asv.ifEmpty( [] ) ) ch_versions = ch_versions.mix(FILTER_CODONS.out.versions.ifEmpty(null)) MERGE_STATS_CODONS( ch_stats, FILTER_CODONS.out.stats ) @@ -606,6 +701,8 @@ workflow AMPLISEQ { // Import phylogenetic tree into QIIME2 if ( params.pplace_tree ) { ch_tree = QIIME2_INTREE ( FASTA_NEWICK_EPANG_GAPPA.out.grafted_phylogeny ).qza + } else if (params.multiregion) { + ch_tree = SIDLE_WF.out.tree_qza } else { ch_tree = [] } // Import taxonomic classification into QIIME2, if available @@ -615,6 +712,10 @@ workflow AMPLISEQ { ch_tax = Channel.empty() tax_agglom_min = 1 tax_agglom_max = 2 + } else if ( params.multiregion ) { + log.info "Use multi-region SIDLE taxonomy classification" + val_used_taxonomy = "SIDLE" + ch_tax = SIDLE_WF.out.tax_qza } else if ( params.pplace_tree && params.pplace_taxonomy) { log.info "Use EPA-NG / GAPPA taxonomy classification" val_used_taxonomy = "phylogenetic placement" @@ -645,20 +746,20 @@ workflow AMPLISEQ { // Filtering ASVs by taxonomy & prevalence & counts if (params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1) { - QIIME2_FILTERTAXA ( + QIIME2_TABLEFILTERTAXA ( QIIME2_INASV.out.qza, - QIIME2_INSEQ.out.qza, ch_tax, params.min_frequency, params.min_samples, params.exclude_taxa ) - FILTER_STATS ( ch_dada2_asv, QIIME2_FILTERTAXA.out.tsv ) + QIIME2_SEQFILTERTABLE ( QIIME2_TABLEFILTERTAXA.out.qza, QIIME2_INSEQ.out.qza ) + FILTER_STATS ( ch_dada2_asv, QIIME2_TABLEFILTERTAXA.out.tsv ) ch_versions = ch_versions.mix( FILTER_STATS.out.versions.ifEmpty(null) ) MERGE_STATS_FILTERTAXA (ch_stats, FILTER_STATS.out.tsv) - ch_asv = QIIME2_FILTERTAXA.out.asv - ch_seq = 
QIIME2_FILTERTAXA.out.seq - ch_tsv = QIIME2_FILTERTAXA.out.tsv + ch_asv = QIIME2_TABLEFILTERTAXA.out.qza + ch_seq = QIIME2_SEQFILTERTABLE.out.qza + ch_tsv = QIIME2_TABLEFILTERTAXA.out.tsv } else { ch_asv = QIIME2_INASV.out.qza ch_seq = QIIME2_INSEQ.out.qza @@ -815,7 +916,7 @@ workflow AMPLISEQ { // // MODULE: Summary Report // - if (!params.skip_report) { + if (!params.skip_report && !params.multiregion) { SUMMARY_REPORT ( ch_report_template, ch_report_css, @@ -866,7 +967,7 @@ workflow AMPLISEQ { !params.skip_taxonomy && ( params.qiime_ref_taxonomy || params.qiime_ref_tax_custom || params.classifier ) && run_qiime2_taxonomy ? QIIME2_TAXONOMY.out.tsv.ifEmpty( [] ) : [], run_qiime2, run_qiime2 ? val_used_taxonomy : "", - run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? ch_dada2_asv.countLines()+","+QIIME2_FILTERTAXA.out.tsv.countLines() : "", + run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? ch_dada2_asv.countLines()+","+QIIME2_TABLEFILTERTAXA.out.tsv.countLines() : "", run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ? FILTER_STATS.out.tsv.ifEmpty( [] ) : [], run_qiime2 && !params.skip_barplot ? QIIME2_BARPLOT.out.folder.ifEmpty( [] ) : [], run_qiime2 && !params.skip_abundance_tables ? QIIME2_EXPORT.out.abs_tsv.ifEmpty( [] ) : [], @@ -892,6 +993,10 @@ workflow AMPLISEQ { file("${params.outdir}/input").mkdir() file("${params.input_fasta}").copyTo("${params.outdir}/input") } + if ( params.multiregion ) { + file("${params.outdir}/input").mkdir() + file("${params.multiregion}").copyTo("${params.outdir}/input") + } //Save metadata in results folder if ( params.metadata ) { file("${params.outdir}/input").mkdir()