Require a minimum sequence length of 50bp after ITSx

nf-core · Mar 25, 2024 · f15c9dd · f15c9dd
1 parent 152331d
commit f15c9dd
Show file tree

Hide file tree

Showing 6 changed files with 42 additions and 24 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases. (This did not affect `sbdi-gtdb` databases, only `gtdb`.)
 - [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters
 - [#717](https://github.com/nf-core/ampliseq/pull/717) - Fix edge case for sorting file names by using radix method
+- [#718](https://github.com/nf-core/ampliseq/pull/718) - Require a minimum sequence length of 50bp for taxonomic classification after using ITSx
 
 ### `Dependencies`
 

diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd
@@ -827,7 +827,8 @@ cat("# Taxonomic Classification\n")
 cat(paste0("
 ## ITS regions
 
-The ",params$cut_its," region was extracted from each ASV sequence using [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073).
+The ",params$cut_its," region was extracted from each ASV sequence using [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073)
+with a minimal sequence length of 50bp.
 Taxonomic classification should have improved performance based on extracted ITS sequence. ITSx results can be found in folder [itsx](../itsx).
 
 Taxonomies per extracted region was then transferred back to the full ASV sequence. No filtering was done based on whether the region was found or not.

diff --git a/conf/modules.config b/conf/modules.config
@@ -442,6 +442,8 @@ process {
     }
 
     withName: FILTER_LEN_ASV {
+        ext.min_len_asv = { params.min_len_asv }
+        ext.max_len_asv = { params.max_len_asv }
         publishDir = [
             path: { "${params.outdir}/asv_length_filter" },
             mode: params.publish_dir_mode,
@@ -483,6 +485,15 @@ process {
         ]
     }
 
+    withName: FILTER_LEN_ITSX {
+        ext.min_len_asv = 50
+        publishDir = [
+            path: { "${params.outdir}/itsx" },
+            mode: params.publish_dir_mode,
+            pattern: "ASV_*"
+        ]
+    }
+
     withName: 'FORMAT_TAXONOMY|FORMAT_TAXRESULTS' {
         publishDir = [
             path: { "${params.outdir}/dada2" },

diff --git a/docs/output.md b/docs/output.md
@@ -27,8 +27,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [VSEARCH cluster](#vsearch-cluster) - Centroid fasta file, filtered asv table, and stats
   - [Barrnap](#barrnap) - Predict ribosomal RNA sequences and optional filtering
   - [Length filter](#length-filter) - Optionally, ASV can be filtered by length thresholds
-  - [ITSx](#itsx) - Optionally, the ITS region can be extracted
   - [Codons](#codons) - Optionally the ASVs can be filtered by presence of stop codons.
+  - [ITSx](#itsx) - Optionally, the ITS region can be extracted
 - [Taxonomic classification](#taxonomic-classification) - Taxonomic classification of (filtered) ASVs
   - [DADA2](#dada2) - Taxonomic classification with DADA2
   - [assignSH](#assignsh) - Optionally, a UNITE species hypothesis (SH) can be added to the DADA2 taxonomy
@@ -216,23 +216,6 @@ The minimum ASV length threshold can be set by `--min_len_asv` and the maximum l
 
 </details>
 
-#### ITSx
-
-Optionally, the ITS region can be extracted from each ASV sequence using ITSx, and taxonomic classification is performed based on the ITS sequence.
-
-<details markdown="1">
-<summary>Output files</summary>
-
-- `itsx/`
-  - `ASV_ITS_seqs.full.fasta`: Fasta file with full ITS region from each ASV sequence.
-  - `ASV_ITS_seqs.ITS1.fasta` or `ASV_ITS_seqs.ITS2.fasta`: If using --cut_its "its1" or --cut_its "its2"; fasta file with ITS1 or ITS2 region from each ASV sequence.
-  - `ASV_ITS_seqs.full_and_partial.fasta`: If using --its_partial; fasta file with full and partial ITS regions from each ASV sequence.
-  - `ASV_ITS_seqs.ITS1.full_and_partial.fasta` or `ASV_ITS_seqs.ITS2.full_and_partial.fasta`: If using --cut_its "its1" or --cut_its "its2" and --its_partial; fasta file with complete and partial ITS1 or ITS2 regions from each ASV sequence.
-  - `ASV_ITS_seqs.summary.txt`: Summary information from ITSx.
-  - `ITSx.args.txt`: File with parameters passed to ITSx.
-
-</details>
-
 #### Codons
 
 Optionally, the ASVs can be filtered against the presence of stop codons in the specified open reading frame of the ASV. The filtering step can also filter out ASVs that are not multiple of 3 in length.
@@ -250,6 +233,26 @@ Codon filtering can be activated by `--filter_codons`. By default, the codons ar
 
 </details>
 
+#### ITSx
+
+Optionally, the ITS region can be extracted from each ASV sequence using ITSx, and taxonomic classification is performed based on the ITS sequence. Only sequences with a length of at least 50bp are retained.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `itsx/`
+  - `ASV_ITS_seqs.full.fasta`: Fasta file with full ITS region from each ASV sequence.
+  - `ASV_ITS_seqs.ITS1.fasta` or `ASV_ITS_seqs.ITS2.fasta`: If using --cut_its "its1" or --cut_its "its2"; fasta file with ITS1 or ITS2 region from each ASV sequence.
+  - `ASV_ITS_seqs.full_and_partial.fasta`: If using --its_partial; fasta file with full and partial ITS regions from each ASV sequence.
+  - `ASV_ITS_seqs.ITS1.full_and_partial.fasta` or `ASV_ITS_seqs.ITS2.full_and_partial.fasta`: If using --cut_its "its1" or --cut_its "its2" and --its_partial; fasta file with complete and partial ITS1 or ITS2 regions from each ASV sequence.
+  - `ASV_ITS_seqs.summary.txt`: Summary information from ITSx.
+  - `ITSx.args.txt`: File with parameters passed to ITSx.
+  - `ASV_seqs.len.fasta`: Fasta file with filtered ASV sequences.
+  - `ASV_len_orig.tsv`: ASV length distribution before filtering.
+  - `ASV_len_filt.tsv`: ASV length distribution after filtering.
+
+</details>
+
 ### Taxonomic classification
 
 Taxonomic classification of ASVs can be performed with a choice of DADA2, SINTAX, Kraken2 or QIIME2 using supplied databases or user supplied databases (see parameter documentation). By default, DADA2 is used for the classification. The taxonomic classification will be done based on filtered ASV sequences (see above).

diff --git a/modules/local/filter_len_asv.nf → modules/local/filter_len.nf b/modules/local/filter_len_asv.nf → modules/local/filter_len.nf
@@ -1,4 +1,4 @@
-process FILTER_LEN_ASV {
+process FILTER_LEN {
     tag "${fasta}"
     label 'process_low'
 
@@ -23,8 +23,8 @@ process FILTER_LEN_ASV {
     task.ext.when == null || task.ext.when
 
     script:
-    def min_len_asv = params.min_len_asv ?: '1'
-    def max_len_asv = params.max_len_asv ?: '1000000'
+    def min_len_asv = task.ext.min_len_asv ?: '1'
+    def max_len_asv = task.ext.max_len_asv ?: '1000000'
 
     def read_table  = table ? "table <- read.table(file = '$table', sep = '\t', comment.char = '', header=TRUE)" : "table <- data.frame(matrix(ncol = 1, nrow = 0))"
     def asv_table_filtered  = table ? "ASV_table.len.tsv" : "empty_ASV_table.len.tsv"

diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf
@@ -191,7 +191,8 @@ include { SIDLE_WF                      } from '../subworkflows/local/sidle_wf'
 include { BARRNAP                       } from '../modules/local/barrnap'
 include { BARRNAPSUMMARY                } from '../modules/local/barrnapsummary'
 include { FILTER_SSU                    } from '../modules/local/filter_ssu'
-include { FILTER_LEN_ASV                } from '../modules/local/filter_len_asv'
+include { FILTER_LEN as FILTER_LEN_ASV  } from '../modules/local/filter_len'
+include { FILTER_LEN as FILTER_LEN_ITSX } from '../modules/local/filter_len'
 include { MERGE_STATS as MERGE_STATS_FILTERSSU    } from '../modules/local/merge_stats'
 include { MERGE_STATS as MERGE_STATS_FILTERLENASV } from '../modules/local/merge_stats'
 include { MERGE_STATS as MERGE_STATS_CODONS       } from '../modules/local/merge_stats'
@@ -567,7 +568,8 @@ workflow AMPLISEQ {
         }
         ITSX_CUTASV ( ch_full_fasta, outfile )
         ch_versions = ch_versions.mix(ITSX_CUTASV.out.versions.ifEmpty(null))
-        ch_fasta = ITSX_CUTASV.out.fasta
+        FILTER_LEN_ITSX ( ITSX_CUTASV.out.fasta, [] )
+        ch_fasta = FILTER_LEN_ITSX.out.fasta
     }
 
     //