Merge pull request #615 from a4000/dev

Dev
nf-core · Aug 11, 2023 · ab5add3 · ab5add3
2 parents a70ff48 + 4baba81
commit ab5add3
Show file tree

Hide file tree

Showing 26 changed files with 283 additions and 18 deletions.
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -111,6 +111,10 @@
 
   > Jari Oksanen, F. Guillaume Blanchet, Michael Friendly, Roeland Kindt, Pierre Legendre, Dan McGlinn, Peter R. Minchin, R. B. O’Hara, Gavin L. Simpson, Peter Solymos, M. Henry H. Stevens, Eduard Szoecs, and Helene Wagner. vegan: Community Ecology Package. 2018. R package version 2.5-3.
 
+- [Phyloseq](https://doi.org/10.1371/journal.pone.0061217)
+
+  > McMurdie PJ, Holmes S (2013). “phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data.” PLoS ONE, 8(4), e61217.
+
 ### Non-default tools
 
 - [ITSx](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12073)

diff --git a/README.md b/README.md
@@ -40,6 +40,7 @@ By default, the pipeline currently performs the following:
 - Taxonomical classification using DADA2, [SINTAX](https://doi.org/10.1101/074161) or [QIIME2](https://www.nature.com/articles/s41587-019-0209-9)
 - Excludes unwanted taxa, produces absolute and relative feature/taxa count tables and plots, plots alpha rarefaction curves, computes alpha and beta diversity indices and plots thereof ([QIIME2](https://www.nature.com/articles/s41587-019-0209-9))
 - Calls differentially abundant taxa ([ANCOM](https://www.ncbi.nlm.nih.gov/pubmed/26028277))
+- Creates phyloseq R objects ([Phyloseq](https://www.bioconductor.org/packages/release/bioc/html/phyloseq.html))
 - Overall pipeline run summaries ([MultiQC](https://multiqc.info/))
 
 ## Usage

diff --git a/bin/reformat_tax_for_phyloseq.py b/bin/reformat_tax_for_phyloseq.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import sys
+
+tax_file = sys.argv[1]
+out_file = sys.argv[2]
+
+# Import tsv file
+tax_df = pd.read_csv(tax_file, sep="\t")
+
+# The second column should hold the taxonomy information
+tax_col = tax_df.columns[1]
+
+# Split the values in the tax column
+split_tax = tax_df[tax_col].str.split(";", expand=True)
+
+# Assign names to the new columns with an auto incrementing integer
+new_col_names = [f"{tax_col}_{i+1}" for i in range(split_tax.shape[1])]
+split_tax.columns = new_col_names
+
+# Strip whitespace from the tax names
+split_tax = split_tax.applymap(lambda x: x.strip() if isinstance(x, str) else x)
+
+# Drop the original tax column
+tax_df = tax_df.drop(columns=[tax_col])
+
+# Add the new tax columns to the df
+result = pd.concat([tax_df, split_tax], axis=1)
+
+# Create new tsv file
+result.to_csv(out_file, sep="\t", index=False)
diff --git a/conf/modules.config b/conf/modules.config
@@ -785,6 +785,14 @@ process {
         ]
     }
 
+    withName: 'PHYLOSEQ' { // publishing rules applied to the PHYLOSEQ process
+        publishDir = [
+            path: { "${params.outdir}/phyloseq" }, // files are published to <outdir>/phyloseq
+            mode: params.publish_dir_mode,
+            pattern: "*.rds" // only files matching *.rds are published, so versions.yml is not
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },

diff --git a/docs/output.md b/docs/output.md
@@ -41,6 +41,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [Diversity analysis](#diversity-analysis) - High level overview with different diversity indices
   - [ANCOM](#ancom) - Differential abundance analysis
 - [PICRUSt2](#picrust2) - Predict the functional potential of a bacterial community
+- [Phyloseq](#phyloseq) - Phyloseq R objects
 - [Read count report](#read-count-report) - Report of read counts during various steps of the pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
@@ -518,6 +519,18 @@ Most of the fields in the template will not be populated by the export process,
 
 </details>
 
+## Phyloseq
+
+This directory will hold phyloseq objects for each taxonomy table produced by this pipeline. The objects will contain an ASV abundance table and a taxonomy table. If the pipeline is provided with metadata, that metadata will also be included in the phyloseq object. A phylogenetic tree will also be included if the pipeline produces a tree.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `phyloseq/`
+  - `<taxonomy>_phyloseq.rds`: Phyloseq R object.
+
+</details>
+
 ## Read count report
 
 This report includes information on how many reads per sample passed each pipeline step in which a loss can occur. Specifically, how many read pairs entered cutadapt, were reverse complemented, passed trimming; how many read pairs entered DADA2, were denoised, merged and non-chimeric; and how many counts were lost during excluding unwanted taxa and removing low abundance/prevalence sequences in QIIME2.

diff --git a/modules/local/phyloseq.nf b/modules/local/phyloseq.nf
@@ -0,0 +1,63 @@
+process PHYLOSEQ {
+    // Builds a phyloseq object from an abundance table and a taxonomy table,
+    // merges in sample metadata and a tree when those files exist, and saves
+    // the object as <prefix>_phyloseq.rds.
+    tag "$prefix"
+    label 'process_low'
+
+    conda "bioconda::bioconductor-phyloseq=1.44.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bioconductor-phyloseq:1.44.0--r43hdfd78af_0' :
+        'quay.io/biocontainers/bioconductor-phyloseq:1.44.0--r43hdfd78af_0' }"
+
+    input:
+    tuple val(prefix), path(tax_tsv) // prefix names the output file; tax_tsv: taxonomy table, row names in column 1
+    path otu_tsv                     // abundance table, taxa as rows, row names in column 1
+    path sam_tsv                     // sample metadata; only used when the path exists in the work dir
+    path tree                        // tree file for read_tree(); only used when the path exists in the work dir
+
+    output:
+    tuple val(prefix), path("*phyloseq.rds"), emit: rds
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Wrap every value in double quotes so it is interpolated into the R code as a string literal
+    def sam_tsv = "\"${sam_tsv}\""
+    def otu_tsv = "\"${otu_tsv}\""
+    def tax_tsv = "\"${tax_tsv}\""
+    def tree    = "\"${tree}\""
+    def prefix  = "\"${prefix}\""
+    """
+    #!/usr/bin/env Rscript
+
+    suppressPackageStartupMessages(library(phyloseq))
+
+    # check.names=FALSE keeps the sample IDs of the header verbatim. By default
+    # read.table passes column names through make.names() (2-A becomes X2.A),
+    # after which they no longer match the row names of the sample metadata.
+    otu_df  <- read.table($otu_tsv, sep="\\t", header=TRUE, row.names=1, check.names=FALSE)
+    tax_df  <- read.table($tax_tsv, sep="\\t", header=TRUE, row.names=1)
+    otu_mat <- as.matrix(otu_df)
+    tax_mat <- as.matrix(tax_df)
+
+    OTU     <- otu_table(otu_mat, taxa_are_rows=TRUE)
+    TAX     <- tax_table(tax_mat)
+    phy_obj <- phyloseq(OTU, TAX)
+
+    # Sample metadata is optional: merged only when the file exists
+    if (file.exists($sam_tsv)) {
+        sam_df  <- read.table($sam_tsv, sep="\\t", header=TRUE, row.names=1)
+        SAM     <- sample_data(sam_df)
+        phy_obj <- merge_phyloseq(phy_obj, SAM)
+    }
+
+    # The tree is optional: merged only when the file exists
+    if (file.exists($tree)) {
+        TREE    <- read_tree($tree)
+        phy_obj <- merge_phyloseq(phy_obj, TREE)
+    }
+
+    saveRDS(phy_obj, file = paste0($prefix, "_phyloseq.rds"))
+
+    # Version information
+    writeLines(c("\\"${task.process}\\":",
+        paste0("    R: ", paste0(R.Version()[c("major","minor")], collapse = ".")),
+        paste0("    phyloseq: ", packageVersion("phyloseq"))),
+        "versions.yml"
+    )
+    """
+}
diff --git a/modules/local/phyloseq_inasv.nf b/modules/local/phyloseq_inasv.nf
@@ -0,0 +1,28 @@
+process PHYLOSEQ_INASV {
+    // Rewrites an abundance table for PHYLOSEQ: drops the first line of the
+    // input and renames "#OTU ID" to "ASV_ID" on the new first line.
+    label 'process_low'
+
+    conda "conda-forge::sed=4.7"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'nf-core/ubuntu:20.04' }"
+
+    input:
+    path(biom_file) // tab-separated table; its first line is discarded
+
+    output:
+    path( "*.tsv" )          , emit: tsv
+    path "versions.yml"      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Options precede the operand so that tail does not depend on GNU argument
+    // permutation, and file names are quoted so that unusual names survive the shell.
+    """
+    tail -n +2 "$biom_file" | sed '1s/#OTU ID/ASV_ID/' > "reformat_$biom_file"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(bash --version | sed -n 1p | sed 's/GNU bash, version //g')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/phyloseq_intax.nf b/modules/local/phyloseq_intax.nf
@@ -0,0 +1,29 @@
+process PHYLOSEQ_INTAX {
+    // Runs reformat_tax_for_phyloseq.py on a taxonomy table and emits the
+    // result as reformat_<input file name>.
+    label 'process_low'
+
+    conda "conda-forge::pandas=1.1.5"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pandas:1.1.5':
+        'biocontainers/pandas:1.1.5' }"
+
+    input:
+    path(tax_tsv) // taxonomy table handed to the script as its first argument
+
+    output:
+    path( "*.tsv" )          , emit: tsv
+    path "versions.yml"      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // File names are quoted so that unusual names survive the shell. The pandas
+    // version is read from pandas.__version__, which prints the same value as
+    // the deprecated pkg_resources lookup without depending on setuptools.
+    """
+    reformat_tax_for_phyloseq.py "$tax_tsv" "reformat_$tax_tsv"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version 2>&1 | sed 's/Python //g')
+        pandas: \$(python -c "import pandas; print(pandas.__version__)")
+    END_VERSIONS
+    """
+}
diff --git a/subworkflows/local/phyloseq_workflow.nf b/subworkflows/local/phyloseq_workflow.nf
@@ -0,0 +1,44 @@
+/*
+ * Create phyloseq objects
+ */
+
+include { PHYLOSEQ                                } from '../../modules/local/phyloseq'
+include { PHYLOSEQ_INASV                          } from '../../modules/local/phyloseq_inasv'
+
+workflow PHYLOSEQ_WORKFLOW {
+    take:
+    ch_tax      // passed unchanged as the first input of PHYLOSEQ (tuple of prefix and taxonomy table)
+    ch_tsv      // abundance table; reformatted by PHYLOSEQ_INASV when QIIME2 filtering is configured
+    ch_meta     // sample metadata; only used when params.metadata is set
+    ch_tree     // tree channel whose items carry the tree at index 1; only used when params.pplace_tree is set
+    run_qiime2  // truthy when QIIME2 steps are run
+
+    main:
+    // Optional inputs fall back to an empty list when the corresponding parameter is not set
+    if ( params.metadata ) {
+        ch_phyloseq_inmeta = ch_meta.first() // The .first() is to make sure it's a value channel
+    } else {
+        ch_phyloseq_inmeta = []
+    }
+
+    if ( params.pplace_tree ) {
+        // Keep only the element at index 1 of each item; .first() turns it into a value channel
+        ch_phyloseq_intree = ch_tree.map { it[1] }.first()
+    } else {
+        ch_phyloseq_intree = []
+    }
+
+    // The table is reformatted only when QIIME2 runs AND at least one of its
+    // filtering parameters differs from the value compared against here;
+    // in every other case it is used as is.
+    if ( run_qiime2 && ( params.exclude_taxa != "none" || params.min_frequency != 1 || params.min_samples != 1 ) ) {
+        ch_phyloseq_inasv = PHYLOSEQ_INASV ( ch_tsv ).tsv
+    } else {
+        ch_phyloseq_inasv = ch_tsv
+    }
+
+    PHYLOSEQ ( ch_tax, ch_phyloseq_inasv, ch_phyloseq_inmeta, ch_phyloseq_intree )
+
+    emit:
+    rds     = PHYLOSEQ.out.rds
+    versions= PHYLOSEQ.out.versions
+}
diff --git a/tests/pipeline/iontorrent.nf.test b/tests/pipeline/iontorrent.nf.test
@@ -38,7 +38,8 @@ nextflow_pipeline {
                 { assert snapshot(path("$outputDir/input/Samplesheet_it_SE_ITS.tsv")).match("input") },
                 { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
                                 path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
-                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }
+                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+                { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
             )
         }
     }

diff --git a/tests/pipeline/iontorrent.nf.test.snap b/tests/pipeline/iontorrent.nf.test.snap
diff --git a/tests/pipeline/multi.nf.test b/tests/pipeline/multi.nf.test
@@ -63,7 +63,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/qiime2/representative_sequences/filtered-sequences.qza").exists() },
                 { assert new File("$outputDir/qiime2/representative_sequences/rep-seq.fasta").exists() },
                 { assert snapshot(path("$outputDir/qiime2/representative_sequences/descriptive_stats.tsv"),
-                                path("$outputDir/qiime2/representative_sequences/seven_number_summary.tsv")).match("qiime2") }
+                                path("$outputDir/qiime2/representative_sequences/seven_number_summary.tsv")).match("qiime2") },
+                { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
             )
         }
     }

diff --git a/tests/pipeline/multi.nf.test.snap b/tests/pipeline/multi.nf.test.snap
diff --git a/tests/pipeline/pacbio_its.nf.test b/tests/pipeline/pacbio_its.nf.test
@@ -52,7 +52,8 @@ nextflow_pipeline {
                                 path("$outputDir/SBDI/emof.tsv"),
                                 path("$outputDir/SBDI/event.tsv")).match("SBDI") },
                 { assert new File("$outputDir/SBDI/annotation.tsv").exists() },
-                { assert new File("$outputDir/SBDI/asv-table.tsv").exists() }
+                { assert new File("$outputDir/SBDI/asv-table.tsv").exists() },
+                { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
             )
         }
     }

diff --git a/tests/pipeline/pacbio_its.nf.test.snap b/tests/pipeline/pacbio_its.nf.test.snap
diff --git a/tests/pipeline/pplace.nf.test b/tests/pipeline/pplace.nf.test
@@ -55,7 +55,8 @@ nextflow_pipeline {
                 { assert new File("$outputDir/pplace/test_pplace.taxonomy.per_query.tsv").exists() },
                 { assert new File("$outputDir/pplace/test_pplace.graft.test_pplace.epa_result.newick").exists() },
                 { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
-                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }
+                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+                { assert new File("$outputDir/phyloseq/qiime2_phyloseq.rds").exists() }
             )
         }
     }

diff --git a/tests/pipeline/pplace.nf.test.snap b/tests/pipeline/pplace.nf.test.snap
diff --git a/tests/pipeline/reftaxcustom.nf.test b/tests/pipeline/reftaxcustom.nf.test
@@ -43,7 +43,8 @@ nextflow_pipeline {
                 { assert snapshot(path("$outputDir/input/Samplesheet.tsv")).match("input") },
                 { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
                                 path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
-                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }
+                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+                { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
             )
         }
     }

diff --git a/tests/pipeline/reftaxcustom.nf.test.snap b/tests/pipeline/reftaxcustom.nf.test.snap
diff --git a/tests/pipeline/single.nf.test b/tests/pipeline/single.nf.test
@@ -44,7 +44,8 @@ nextflow_pipeline {
                 { assert snapshot(path("$outputDir/input/Samplesheet_single_end.tsv")).match("input") },
                 { assert snapshot(path("$outputDir/multiqc/multiqc_data/multiqc_fastqc.txt"),
                                 path("$outputDir/multiqc/multiqc_data/multiqc_general_stats.txt"),
-                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") }
+                                path("$outputDir/multiqc/multiqc_data/multiqc_cutadapt.txt")).match("multiqc") },
+                { assert new File("$outputDir/phyloseq/dada2_phyloseq.rds").exists() }
             )
         }
     }