Merge pull request #690 from nf-core/dev

Release 2.8.0
nf-core · Jan 15, 2024 · f3c97e1 · f3c97e1
2 parents 113e90b + 2c67a45
commit f3c97e1
Show file tree

Hide file tree

Showing 74 changed files with 890 additions and 204 deletions.
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -27,6 +27,9 @@ If you're not used to this workflow with git, you can start with some [docs from
 
 ## Tests
 
+You can optionally test your changes by running the pipeline locally. When doing so, we recommend using the `debug` profile,
+which emits warnings about process selectors and other debug information. Example: `nextflow run . -profile debug,test,docker --outdir <OUTDIR>`.
+
 When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests.
 Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then.
 

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -19,6 +19,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/ampl
 - [ ] If necessary, also make a PR on the nf-core/ampliseq _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
 - [ ] Make sure your code lints (`nf-core lint`).
 - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`).
+- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`).
 - [ ] Usage Documentation in `docs/usage.md` is updated.
 - [ ] Output Documentation in `docs/output.md` is updated.
 - [ ] `CHANGELOG.md` is updated.

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -50,6 +50,7 @@ jobs:
           - "test_failed"
           - "test_multi"
           - "test_reftaxcustom"
+          - "test_qiimecustom"
           - "test_doubleprimers"
           - "test_iontorrent"
           - "test_novaseq"
@@ -61,7 +62,7 @@ jobs:
 
     steps:
       - name: Check out pipeline code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Check out test data
         uses: actions/checkout@v3

diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       # Use the @nf-core-bot token to check out so we can push later
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           token: ${{ secrets.nf_core_bot_auth_token }}
 
@@ -24,7 +24,7 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }}
 
-      - uses: actions/setup-node@v3
+      - uses: actions/setup-node@v4
 
       - name: Install Prettier
         run: npm install -g prettier @prettier/plugin-php

diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -14,9 +14,9 @@ jobs:
   EditorConfig:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
-      - uses: actions/setup-node@v3
+      - uses: actions/setup-node@v4
 
       - name: Install editorconfig-checker
         run: npm install -g editorconfig-checker
@@ -27,9 +27,9 @@ jobs:
   Prettier:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
-      - uses: actions/setup-node@v3
+      - uses: actions/setup-node@v4
 
       - name: Install Prettier
         run: npm install -g prettier
@@ -40,7 +40,7 @@ jobs:
   PythonBlack:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Check code lints with Black
         uses: psf/black@stable
@@ -71,7 +71,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out pipeline code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1

diff --git a/.github/workflows/release-announcments.yml → .github/workflows/release-announcements.yml b/.github/workflows/release-announcments.yml → .github/workflows/release-announcements.yml
diff --git a/.gitpod.yml b/.gitpod.yml
@@ -4,7 +4,9 @@ tasks:
     command: |
       pre-commit install --install-hooks
       nextflow self-update
-
+  - name: unset JAVA_TOOL_OPTIONS
+    command: |
+      unset JAVA_TOOL_OPTIONS
 vscode:
   extensions: # based on nf-core.nf-core-extensionpack
     - codezombiech.gitignore # Language support for .gitignore files

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,33 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## nf-core/ampliseq version 2.8.0 - 2024-01-16
+
+### `Added`
+
+- [#666](https://github.com/nf-core/ampliseq/pull/666) - Added Greengenes2 database, version 2022.10, support for QIIME2 taxonomic classification.
+- [#667](https://github.com/nf-core/ampliseq/pull/667),[#691](https://github.com/nf-core/ampliseq/pull/691) - Added `--qiime_ref_tax_custom` to permit custom reference database for QIIME2 taxonomic classification
+- [#674](https://github.com/nf-core/ampliseq/pull/674) - Add PhytoRef database for DADA2 taxonomy assignment using `--dada_ref_taxonomy phytoref`
+- [#675](https://github.com/nf-core/ampliseq/pull/675) - Add the Zehr lab nifH database for DADA2 taxonomy assignment using `--dada_ref_taxonomy zehr-nifh`
+- [#681](https://github.com/nf-core/ampliseq/pull/681) - For DADA2, with `--dada_addspecies_allowmultiple` multiple exact species matches are reported and with `--dada_taxonomy_rc` reverse-complement matches are also considered in taxonomic classification
+
+### `Changed`
+
+- [#677](https://github.com/nf-core/ampliseq/pull/677) - Added cut_its information to SBDI export
+
+### `Fixed`
+
+- [#672](https://github.com/nf-core/ampliseq/pull/672),[#688](https://github.com/nf-core/ampliseq/pull/688),[#691](https://github.com/nf-core/ampliseq/pull/691) - Updated documentation
+- [#676](https://github.com/nf-core/ampliseq/pull/676) - Phyloseq sometimes only produced one of multiple output files
+- [#679](https://github.com/nf-core/ampliseq/pull/679) - Prevent masking low complexity regions by VSEARCH with lower case letters
+- [#680](https://github.com/nf-core/ampliseq/pull/680),[#673](https://github.com/nf-core/ampliseq/pull/673) - Improved pipeline summary report & error messages
+- [#683](https://github.com/nf-core/ampliseq/pull/683) - Template update for nf-core/tools version 2.11
+- [#687](https://github.com/nf-core/ampliseq/pull/687) - Correct conda package for ASV SSU filtering
+
+### `Dependencies`
+
+### `Removed`
+
 ## nf-core/ampliseq version 2.7.1 - 2023-11-14
 
 ### `Added`

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -41,6 +41,10 @@
 
   > Quast C, Pruesse E, Yilmaz P, Gerken J, Schweer T, Yarza P, Peplies J, Glöckner FO. The SILVA ribosomal RNA gene database project: improved data processing and web-based tools. Nucleic Acids Res. 2013 Jan;41(Database issue):D590-6. doi: 10.1093/nar/gks1219. Epub 2012 Nov 28. PMID: 23193283; PMCID: PMC3531112.
 
+- [Greengenes2](https://doi.org/10.1038/s41587-023-01845-1)
+
+  > McDonald, D., Jiang, Y., Balaban, M. et al. Greengenes2 unifies microbial data in a single reference tree. Nat Biotechnol (2023). https://doi.org/10.1038/s41587-023-01845-1
+
 - [PR2 - Protist Reference Ribosomal Database](https://pubmed.ncbi.nlm.nih.gov/23193267/)
 
   > Guillou L, Bachar D, Audic S, Bass D, Berney C, Bittner L, Boutte C, Burgaud G, de Vargas C, Decelle J, Del Campo J, Dolan JR, Dunthorn M, Edvardsen B, Holzmann M, Kooistra WH, Lara E, Le Bescot N, Logares R, Mahé F, Massana R, Montresor M, Morard R, Not F, Pawlowski J, Probert I, Sauvadet AL, Siano R, Stoeck T, Vaulot D, Zimmermann P, Christen R. The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote small sub-unit rRNA sequences with curated taxonomy. Nucleic Acids Res. 2013 Jan;41(Database issue):D597-604. doi: 10.1093/nar/gks1160. Epub 2012 Nov 27. PMID: 23193267; PMCID: PMC3531120.
@@ -61,13 +65,21 @@
 
   > Kõljalg U, Larsson KH, Abarenkov K, Nilsson RH, Alexander IJ, Eberhardt U, Erland S, Høiland K, Kjøller R, Larsson E, Pennanen T, Sen R, Taylor AF, Tedersoo L, Vrålstad T, Ursing BM. UNITE: a database providing web-based methods for the molecular identification of ectomycorrhizal fungi. New Phytol. 2005 Jun;166(3):1063-8. doi: 10.1111/j.1469-8137.2005.01376.x. PMID: 15869663.
 
-  - [MIDORI2 - a collection of reference databases](https://doi.org/10.1002/edn3.303/)
+- [MIDORI2 - a collection of reference databases](https://doi.org/10.1002/edn3.303/)
+
+  > Leray, M., Knowlton, N., & Machida, R. J. (2022). MIDORI2: A collection of quality controlled, preformatted, and regularly updated reference databases for taxonomic assignment of eukaryotic mitochondrial sequences. Environmental DNA, 4, 894–907. doi: https://doi.org/10.1002/edn3.303.
+
+- [COIDB - CO1 Taxonomy Database](https://doi.org/10.17044/scilifelab.20514192.v2)
+
+  > Sundh J, Manoharan L, Iwaszkiewicz-Eggebrecht E, Miraldo A, Andersson A, Ronquist F. COI reference sequences from BOLD DB. doi: https://doi.org/10.17044/scilifelab.20514192.v2.
+
+- [PhytoRef plastid 16S rRNA database for photosynthetic eukaryotes](https://pubmed.ncbi.nlm.nih.gov/25740460/)
 
-    > Leray, M., Knowlton, N., & Machida, R. J. (2022). MIDORI2: A collection of quality controlled, preformatted, and regularly updated reference databases for taxonomic assignment of eukaryotic mitochondrial sequences. Environmental DNA, 4, 894– 907. https://doi.org/10.1002/edn3.303.
+  > Decelle J, Romac S, Stern RF, Bendif el M, Zingone A, Audic S, Guiry MD, Guillou L, Tessier D, Le Gall F, Gourvil P, Dos Santos AL, Probert I, Vaulot D, de Vargas C, Christen R. PhytoREF: a reference database of the plastidial 16S rRNA gene of photosynthetic eukaryotes with curated taxonomy. Mol Ecol Resour. 2015 Nov;15(6):1435-45. doi: 10.1111/1755-0998.12401. Epub 2015 Apr 6. PMID: 25740460.
 
-  - [COIDB - CO1 Taxonomy Database](https://doi.org/10.17044/scilifelab.20514192.v2)
+- [Zehr lab nifH database](http://doi.org/10.5281/zenodo.7996213)
 
-    > Sundh J, Manoharan L, Iwaszkiewicz-Eggebrecht E, Miraldo A, Andersson A, Ronquist F. COI reference sequences from BOLD DB. doi: https://doi.org/10.17044/scilifelab.20514192.v2.
+  > M. A. Moynihan & C. Furbo Reeder 2023. nifHdada2 GitHub repository, v2.0.5. Zenodo. doi: http://doi.org/10.5281/zenodo.7996213
 
 ### Phylogenetic placement
 

diff --git a/README.md b/README.md
@@ -47,11 +47,8 @@ By default, the pipeline currently performs the following:
 
 ## Usage
 
-:::note
-If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how
-to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline)
-with `-profile test` before running the workflow on actual data.
-:::
+> [!NOTE]
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
 
 First, you need to know whether the sequencing files at hand are expected to contain primer sequences (usually yes) and if yes, what primer sequences. In the example below, the paired end sequencing data was produced with 515f (GTGYCAGCMGCCGCGGTAA) and 806r (GGACTACNVGGGTWTCTAAT) primers of the V4 region of the 16S rRNA gene. Please note, that those sequences should not contain any sequencing adapter sequences, only the sequence that matches the biological amplicon.
 
@@ -68,19 +65,15 @@ nextflow run nf-core/ampliseq \
    --outdir <OUTDIR>
 ```
 
-:::note
-Adding metadata will considerably increase the output, see [metadata documentation](https://nf-co.re/ampliseq/usage#metadata).
-:::
+> [!NOTE]
+> Adding metadata will considerably increase the output, see [metadata documentation](https://nf-co.re/ampliseq/usage#metadata).
 
-:::note
-By default the taxonomic assignment will be performed with DADA2 on SILVA database, but there are various tools and databases readily available, see [taxonomic classification documentation](https://nf-co.re/ampliseq/usage#taxonomic-classification).
-:::
+> [!TIP]
+> By default the taxonomic assignment will be performed with DADA2 on SILVA database, but there are various tools and databases readily available, see [taxonomic classification documentation](https://nf-co.re/ampliseq/usage#taxonomic-classification).
 
-:::warning
-Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those
-provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
-see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
-:::
+> [!WARNING]
+> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;
+> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files).
 
 For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/ampliseq/usage) and the [parameter documentation](https://nf-co.re/ampliseq/parameters).
 

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/ampliseq/releases/tag/2.7.1" target="_blank">nf-core/ampliseq</a>
+  This report has been generated by the <a href="https://github.com/nf-core/ampliseq/releases/tag/2.8.0" target="_blank">nf-core/ampliseq</a>
   analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/ampliseq/2.7.1/docs/output" target="_blank">documentation</a>.
+  <a href="https://nf-co.re/ampliseq/2.8.0/docs/output" target="_blank">documentation</a>.
 report_section_order:
   "nf-core-ampliseq-methods-description":
     order: -1000

diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd
@@ -181,17 +181,13 @@ supporting denoising of any amplicon and supports a variety of taxonomic databas
 
 ```{r, results='asis'}
 if ( !isFALSE(params$metadata) ) {
-    cat(paste0("
-# Data input and Metadata
-
-Pipeline input was saved to the [input](../input) directory.
-    "))
+    cat("# Data input and Metadata\n\n")
 } else {
-    cat(paste0("
-# Data input
+    cat("# Data input\n\n")
+}
 
-Pipeline input was saved in folder [input](../input).
-    "))
+if ( !isFALSE(params$metadata) || !isFALSE(params$input_samplesheet) ) {
+    cat("Pipeline input was saved in folder [input](../input).\n\n")
 }
 
 if ( !isFALSE(params$input_samplesheet) ) {
@@ -262,8 +258,7 @@ the denoising tool or sequences might be lost due to being labelled as PCR chime
 # import tsv
 cutadapt_summary <- read.table(file = params$cutadapt_summary, header = TRUE, sep = "\t")
 
-cutadapt_passed_col <- as.numeric(substr(
-        cutadapt_summary$cutadapt_passing_filters_percent, 1, 4))
+cutadapt_passed_col <- as.numeric( gsub("%","",cutadapt_summary$cutadapt_passing_filters_percent) )
 
 cutadapt_max_discarded <- round( 100 - min(cutadapt_passed_col), 1 )
 cutadapt_avg_passed <- round(mean(cutadapt_passed_col),1)
@@ -980,9 +975,15 @@ cat("\n\nDADA2 taxonomy assignments can be found in folder [dada2](../dada2) in
 # Header
 cat("## QIIME2\n")
 
-cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9)
-    using the database: `", params$qiime2_ref_tax_title, "`.
-    More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
+# indicate reference taxonomy
+if ( !isFALSE(params$qiime2_ref_tax_title) ) {
+    cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9)
+        using the database: `", params$qiime2_ref_tax_title, "`.
+        More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
+} else {
+    cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) using a custom database ",
+            "provided by the user.\n\n", sep = "")
+}
 
 # Read file and prepare table
 asv_tax <- read.table(params$qiime2_taxonomy, header = TRUE, sep = "\t")

diff --git a/assets/slackreport.json b/assets/slackreport.json
@@ -3,7 +3,7 @@
         {
             "fallback": "Plain-text summary of the attachment.",
             "color": "<% if (success) { %>good<% } else { %>danger<%} %>",
-            "author_name": "nf-core/ampliseq v${version} - ${runName}",
+            "author_name": "nf-core/ampliseq ${version} - ${runName}",
             "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico",
             "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>",
             "fields": [

diff --git a/bin/sbdiexportreannotate.R b/bin/sbdiexportreannotate.R
@@ -18,7 +18,10 @@ dbversion       <- args[1]
 taxfile         <- args[2]
 taxmethod       <- args[3]
 wfversion       <- args[4]
-predfile        <- args[5]
+cut_its         <- args[5]
+predfile        <- args[6]
+
+cut_its = ifelse(cut_its == 'none', '', paste(' cut_its:', cut_its, sep=''))
 
 # Read taxonomy table
 taxonomy <- read.delim(taxfile, sep = '\t', stringsAsFactors = FALSE)
@@ -108,10 +111,10 @@ taxtable  <- taxonomy %>%
         date_identified = as.character(lubridate::today()),
         reference_db = dbversion,
         annotation_algorithm = case_when(
-            (taxmethod == 'sintax')                         ~ paste('Ampliseq',wfversion,'(https://nf-co.re/ampliseq) VSEARCH:sintax', sep=' '),
-            (!(is.na(otu) | otu == ''))                     ~ paste('Ampliseq',wfversion,'(https://nf-co.re/ampliseq) addsh', sep=' '),
-            (!(is.na(species_exact) | species_exact == '')) ~ paste('Ampliseq',wfversion,'(https://nf-co.re/ampliseq) DADA2:assignTaxonomy:addSpecies', sep=' '),
-            TRUE                                            ~ paste('Ampliseq',wfversion,'(https://nf-co.re/ampliseq) DADA2:assignTaxonomy', sep=' ')
+            (taxmethod == 'sintax')                         ~ paste('Ampliseq ',wfversion,' (https://nf-co.re/ampliseq) VSEARCH:sintax',cut_its, sep=''),
+            (!(is.na(otu) | otu == ''))                     ~ paste('Ampliseq ',wfversion,' (https://nf-co.re/ampliseq) addsh',cut_its, sep=''),
+            (!(is.na(species_exact) | species_exact == '')) ~ paste('Ampliseq ',wfversion,' (https://nf-co.re/ampliseq) DADA2:assignTaxonomy:addSpecies',cut_its, sep=''),
+            TRUE                                            ~ paste('Ampliseq ',wfversion,' (https://nf-co.re/ampliseq) DADA2:assignTaxonomy',cut_its, sep='')
         ),
         identification_references = 'https://docs.biodiversitydata.se/analyse-data/molecular-tools/#taxonomy-annotation',
         taxon_remarks = ifelse(!(is.na(domain) | domain == ''), paste('Domain = \'',domain,'\'',sep=''),''),

diff --git a/bin/taxref_reformat_phytoref.sh b/bin/taxref_reformat_phytoref.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# assignTaxonomy.fna (input for assignTaxonomy()): on header lines, drop everything up to the first '|' and turn the remaining '|' into ';'
+sed -e '/>/s/>[^|]*|/>/' -e '/>/s/|/;/g' PhytoRef_with_taxonomy.fasta > assignTaxonomy.fna
+
+# addSpecies.fna (input for addSpecies()): header lines keep only the first '|'-separated field and the text after the last '|', joined by a space
+sed '/^>/s/>\([^|]\+\)|.*|\([^|]\+\)/>\1 \2/' PhytoRef_with_taxonomy.fasta > addSpecies.fna
diff --git a/bin/taxref_reformat_qiime_greengenes2022.sh b/bin/taxref_reformat_qiime_greengenes2022.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# Decompress the gzipped sequences (*.seqs.fna.gz) and taxonomy table (*.taxonomy.md5.tsv.gz) to greengenes2.fna and greengenes2.tax; if a glob matches several files, their contents are concatenated.
+gzip -c -d *.seqs.fna.gz > greengenes2.fna
+gzip -c -d *.taxonomy.md5.tsv.gz > greengenes2.tax
diff --git a/bin/taxref_reformat_zehr-nifh.sh b/bin/taxref_reformat_zehr-nifh.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Write the assignTaxonomy() fasta file (assignTaxonomy.fna): the *.fasta file in the working directory is copied unchanged (cp fails if the glob matches more than one file).
+cp *.fasta assignTaxonomy.fna
+
+# Write the addSpecies() fasta file (addSpecies.fna): take CSV columns 2, 6 and 7, drop rows starting with 'sequence,' (presumably the header row), then emit ">col7 col6-without-its-leading-number" followed by col2 on the next line. NOTE(review): relies on sed treating '\n' in the replacement as a newline (GNU sed does) - confirm for the runtime image.
+cut -d, -f 2,6,7 *.csv  | grep -v '^sequence,' | sed 's/\(.*\),[0-9]* \(.*\),\(.*\)/>\3 \2\n\1/' > addSpecies.fna
diff --git a/conf/base.config b/conf/base.config
@@ -63,4 +63,9 @@ process {
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }
+    withName:QIIME2_EXTRACT {
+        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
+        time   = { check_max( 24.h  * task.attempt, 'time'    ) }
+    }
 }