From f15c9dd7d77fcd95748b6d9f4697ba6f8953559c Mon Sep 17 00:00:00 2001
From: d4straub <daniel.straub@uni-tuebingen.de>
Date: Mon, 25 Mar 2024 16:21:03 +0100
Subject: [PATCH] Require a minimum sequence length of 50bp after ITSx

---
 CHANGELOG.md                                  |  1 +
 assets/report_template.Rmd                    |  3 +-
 conf/modules.config                           | 11 ++++++
 docs/output.md                                | 39 ++++++++++---------
 .../{filter_len_asv.nf => filter_len.nf}      |  6 +--
 workflows/ampliseq.nf                         |  6 ++-
 6 files changed, 42 insertions(+), 24 deletions(-)
 rename modules/local/{filter_len_asv.nf => filter_len.nf} (96%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 900786ed..673f5823 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#711](https://github.com/nf-core/ampliseq/pull/711) - From r207 and onwards Archaea sequences were omitted when parsing GTDB databases. (This did not affect `sbdi-gtdb` databases, only `gtdb`.)
 - [#715](https://github.com/nf-core/ampliseq/pull/715) - Fix filtering vsearch clusters for high number of clusters
 - [#717](https://github.com/nf-core/ampliseq/pull/717) - Fix edge case for sorting file names by using radix method
+- [#718](https://github.com/nf-core/ampliseq/pull/718) - Require a minimum sequence length of 50bp for taxonomic classification after using ITSx
 
 ### `Dependencies`
 
diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd
index 07235f73..7241f69a 100644
--- a/assets/report_template.Rmd
+++ b/assets/report_template.Rmd
@@ -827,7 +827,8 @@ cat("# Taxonomic Classification\n")
 cat(paste0("
 ## ITS regions
 
-The ",params$cut_its," region was extracted from each ASV sequence using [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073).
+The ",params$cut_its," region was extracted from each ASV sequence using [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073)
+and only extracted sequences with a length of at least 50bp were retained.
 Taxonomic classification should have improved performance based on extracted ITS sequence. ITSx results can be found in folder [itsx](../itsx).
 
 Taxonomies per extracted region was then transferred back to the full ASV sequence. No filtering was done based on whether the region was found or not.
diff --git a/conf/modules.config b/conf/modules.config
index 389bb1a5..de12098b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -442,6 +442,8 @@ process {
     }
 
     withName: FILTER_LEN_ASV {
+        ext.min_len_asv = { params.min_len_asv }
+        ext.max_len_asv = { params.max_len_asv }
         publishDir = [
             path: { "${params.outdir}/asv_length_filter" },
             mode: params.publish_dir_mode,
@@ -483,6 +485,15 @@ process {
         ]
     }
 
+    withName: FILTER_LEN_ITSX {
+        ext.min_len_asv = 50
+        publishDir = [
+            path: { "${params.outdir}/itsx" },
+            mode: params.publish_dir_mode,
+            pattern: "ASV_*"
+        ]
+    }
+
     withName: 'FORMAT_TAXONOMY|FORMAT_TAXRESULTS' {
         publishDir = [
             path: { "${params.outdir}/dada2" },
diff --git a/docs/output.md b/docs/output.md
index 5c0cd495..12e4e603 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -27,8 +27,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [VSEARCH cluster](#vsearch-cluster) - Centroid fasta file, filtered asv table, and stats
   - [Barrnap](#barrnap) - Predict ribosomal RNA sequences and optional filtering
   - [Length filter](#length-filter) - Optionally, ASV can be filtered by length thresholds
-  - [ITSx](#itsx) - Optionally, the ITS region can be extracted
   - [Codons](#codons) - Optionally the ASVs can be filtered by presence of stop codons.
+  - [ITSx](#itsx) - Optionally, the ITS region can be extracted
 - [Taxonomic classification](#taxonomic-classification) - Taxonomic classification of (filtered) ASVs
   - [DADA2](#dada2) - Taxonomic classification with DADA2
   - [assignSH](#assignsh) - Optionally, a UNITE species hypothesis (SH) can be added to the DADA2 taxonomy
@@ -216,23 +216,6 @@ The minimum ASV length threshold can be set by `--min_len_asv` and the maximum l
 
 </details>
 
-#### ITSx
-
-Optionally, the ITS region can be extracted from each ASV sequence using ITSx, and taxonomic classification is performed based on the ITS sequence.
-
-<details markdown="1">
-<summary>Output files</summary>
-
-- `itsx/`
-  - `ASV_ITS_seqs.full.fasta`: Fasta file with full ITS region from each ASV sequence.
-  - `ASV_ITS_seqs.ITS1.fasta` or `ASV_ITS_seqs.ITS2.fasta`: If using --cut_its "its1" or --cut_its "its2"; fasta file with ITS1 or ITS2 region from each ASV sequence.
-  - `ASV_ITS_seqs.full_and_partial.fasta`: If using --its_partial; fasta file with full and partial ITS regions from each ASV sequence.
-  - `ASV_ITS_seqs.ITS1.full_and_partial.fasta` or `ASV_ITS_seqs.ITS2.full_and_partial.fasta`: If using --cut_its "its1" or --cut_its "its2" and --its_partial; fasta file with complete and partial ITS1 or ITS2 regions from each ASV sequence.
-  - `ASV_ITS_seqs.summary.txt`: Summary information from ITSx.
-  - `ITSx.args.txt`: File with parameters passed to ITSx.
-
-</details>
-
 #### Codons
 
 Optionally, the ASVs can be filtered against the presence of stop codons in the specified open reading frame of the ASV. The filtering step can also filter out ASVs that are not multiple of 3 in length.
@@ -250,6 +233,26 @@ Codon filtering can be activated by `--filter_codons`. By default, the codons ar
 
 </details>
 
+#### ITSx
+
+Optionally, the ITS region can be extracted from each ASV sequence using ITSx, and taxonomic classification is performed based on the ITS sequence. Only extracted sequences with a length of at least 50bp are retained.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `itsx/`
+  - `ASV_ITS_seqs.full.fasta`: Fasta file with full ITS region from each ASV sequence.
+  - `ASV_ITS_seqs.ITS1.fasta` or `ASV_ITS_seqs.ITS2.fasta`: If using --cut_its "its1" or --cut_its "its2"; fasta file with ITS1 or ITS2 region from each ASV sequence.
+  - `ASV_ITS_seqs.full_and_partial.fasta`: If using --its_partial; fasta file with full and partial ITS regions from each ASV sequence.
+  - `ASV_ITS_seqs.ITS1.full_and_partial.fasta` or `ASV_ITS_seqs.ITS2.full_and_partial.fasta`: If using --cut_its "its1" or --cut_its "its2" and --its_partial; fasta file with complete and partial ITS1 or ITS2 regions from each ASV sequence.
+  - `ASV_ITS_seqs.summary.txt`: Summary information from ITSx.
+  - `ITSx.args.txt`: File with parameters passed to ITSx.
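+  - `ASV_seqs.len.fasta`: Fasta file with the extracted ITS sequences that passed the minimum length filter (at least 50bp).
+  - `ASV_len_orig.tsv`: Length distribution of the extracted ITS sequences before length filtering.
+  - `ASV_len_filt.tsv`: Length distribution of the extracted ITS sequences after length filtering.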
+
+</details>
+
 ### Taxonomic classification
 
 Taxonomic classification of ASVs can be performed with a choice of DADA2, SINTAX, Kraken2 or QIIME2 using supplied databases or user supplied databases (see parameter documentation). By default, DADA2 is used for the classification. The taxonomic classification will be done based on filtered ASV sequences (see above).
diff --git a/modules/local/filter_len_asv.nf b/modules/local/filter_len.nf
similarity index 96%
rename from modules/local/filter_len_asv.nf
rename to modules/local/filter_len.nf
index 3cf7d59d..4964793d 100644
--- a/modules/local/filter_len_asv.nf
+++ b/modules/local/filter_len.nf
@@ -1,4 +1,4 @@
-process FILTER_LEN_ASV {
+process FILTER_LEN {
     tag "${fasta}"
     label 'process_low'
 
@@ -23,8 +23,8 @@ process FILTER_LEN_ASV {
     task.ext.when == null || task.ext.when
 
     script:
-    def min_len_asv = params.min_len_asv ?: '1'
-    def max_len_asv = params.max_len_asv ?: '1000000'
+    def min_len_asv = task.ext.min_len_asv ?: '1'
+    def max_len_asv = task.ext.max_len_asv ?: '1000000'
 
     def read_table  = table ? "table <- read.table(file = '$table', sep = '\t', comment.char = '', header=TRUE)" : "table <- data.frame(matrix(ncol = 1, nrow = 0))"
     def asv_table_filtered  = table ? "ASV_table.len.tsv" : "empty_ASV_table.len.tsv"
diff --git a/workflows/ampliseq.nf b/workflows/ampliseq.nf
index 0aaa7926..380e6bf6 100644
--- a/workflows/ampliseq.nf
+++ b/workflows/ampliseq.nf
@@ -191,7 +191,8 @@ include { SIDLE_WF                      } from '../subworkflows/local/sidle_wf'
 include { BARRNAP                       } from '../modules/local/barrnap'
 include { BARRNAPSUMMARY                } from '../modules/local/barrnapsummary'
 include { FILTER_SSU                    } from '../modules/local/filter_ssu'
-include { FILTER_LEN_ASV                } from '../modules/local/filter_len_asv'
+include { FILTER_LEN as FILTER_LEN_ASV  } from '../modules/local/filter_len'
+include { FILTER_LEN as FILTER_LEN_ITSX } from '../modules/local/filter_len'
 include { MERGE_STATS as MERGE_STATS_FILTERSSU    } from '../modules/local/merge_stats'
 include { MERGE_STATS as MERGE_STATS_FILTERLENASV } from '../modules/local/merge_stats'
 include { MERGE_STATS as MERGE_STATS_CODONS       } from '../modules/local/merge_stats'
@@ -567,7 +568,8 @@ workflow AMPLISEQ {
         }
         ITSX_CUTASV ( ch_full_fasta, outfile )
         ch_versions = ch_versions.mix(ITSX_CUTASV.out.versions.ifEmpty(null))
-        ch_fasta = ITSX_CUTASV.out.fasta
+        FILTER_LEN_ITSX ( ITSX_CUTASV.out.fasta, [] )
+        ch_fasta = FILTER_LEN_ITSX.out.fasta
     }
 
     //