Merge pull request #5 from UPHL-BioNGS/Tom_dev_231227
Tom dev 231227
tives82 committed Jan 24, 2024
2 parents 972d002 + 9a03aad commit 032d370
Showing 10 changed files with 97 additions and 24 deletions.
8 changes: 4 additions & 4 deletions .devcontainer/devcontainer.json
@@ -17,11 +17,11 @@
"python.linting.flake8Path": "/opt/conda/bin/flake8",
"python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle",
"python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
"python.linting.pylintPath": "/opt/conda/bin/pylint"
"python.linting.pylintPath": "/opt/conda/bin/pylint",
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"]
}
}
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"],
},
},
}
3 changes: 2 additions & 1 deletion README.md
@@ -35,7 +35,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
> **Currently prepares influenza samples (paired-end FASTQ files) for assembly. These steps also provide different quality reports for sample evaluation.**
* Combine FASTQ file lanes, if they were provided with multiple lanes, into unified FASTQ files to ensure they are organized and named consistently (`Lane_Merge`).
* Remove human read data with the ([`NCBI_SRA_Human_Scrubber`](https://github.com/ncbi/sra-human-scrubber) for uploading reads to public repositories for DNA sequencing data.
* Remove human read data with the [`NCBI_SRA_Human_Scrubber`](https://github.com/ncbi/sra-human-scrubber) for uploading reads to public repositories for DNA sequencing data.
* Filter unpaired reads from FASTQ files (`SeqKit_Pair`).
* Trim reads and assess quality (`FaQCs`).
* Remove adapter sequences and PhiX reference with (`BBMap_BBDuk`).
@@ -51,6 +51,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool
* Assembly of influenza gene segments with (`IRMA`) using the built-in FLU module. Also, influenza typing and H/N subtype classifications are made.
* QC of consensus assembly (`IRMA_Consensus_QC`).
* Generate IRMA consensus QC report (`IRMA_Consensus_QC_Reportsheet`)
* Annotation of IRMA consensus sequences with (`VADR`)
* Influenza A type and H/N subtype classification as well as influenza B type and lineage classification using (`Abricate_Flu`). The database used in this task is [InsaFlu](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-018-0555-0).
* Generate a summary report for influenza classification results (`IRMA_Abricate_Reportsheet`).
* Gather corresponding Nextclade dataset using the Abricate_Flu classification results (`Nextclade_Variables`).
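The preprocessing steps listed above (lane merging, human-read scrubbing, read pairing, trimming, adapter/PhiX removal) map onto DSL2 processes chained inside a subworkflow. A minimal sketch of that wiring is shown below; the module paths and most output-channel names are assumptions for illustration (only `BBMAP_BBDUK.out.clean_reads` is taken from this diff), not the pipeline's exact code — see subworkflows/local/preprocessing_read_qc.nf further down.

```nextflow
// Illustrative sketch only — module paths and output-channel names are assumed
include { SEQKIT_PAIR } from '../../modules/local/seqkit_pair.nf'   // assumed path
include { FAQCS       } from '../../modules/local/faqcs.nf'         // assumed path
include { BBMAP_BBDUK } from '../../modules/local/bbmap_bbduk.nf'   // assumed path

workflow PREPROCESS_SKETCH {
    take:
    reads                                        // channel: [ val(meta), [ R1.fastq.gz, R2.fastq.gz ] ]

    main:
    SEQKIT_PAIR(reads)                           // filter unpaired reads
    FAQCS(SEQKIT_PAIR.out.reads)                 // trim reads and assess quality (output name assumed)
    BBMAP_BBDUK(FAQCS.out.reads)                 // remove adapters and PhiX (signature assumed)

    emit:
    clean_reads = BBMAP_BBDUK.out.clean_reads    // channel name as used later in this diff
}
```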
11 changes: 11 additions & 0 deletions conf/modules.config
@@ -156,6 +156,17 @@ process {
pattern: "*"
]
}
withName: 'VADR' {
ext.args = '--minlen 60'
ext.args2 = '--split --cpu 8 -r --atgonly --xnocomp --nomisc --alt_fail extrant5,extrant3 --mkey flu'
ext.when = { }
publishDir = [
enabled: true,
mode: "${params.publish_dir_mode}",
path: { "${params.outdir}/vadr"},
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
withName: ABRICATE_FLU {
ext.args = '--db insaflu --minid 70 --mincov 60'
ext.when = { }
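Because the VADR options above are supplied as `ext.args` (consumed by `fasta-trim-terminal-ambigs.pl`) and `ext.args2` (consumed by `v-annotate.pl`) in the module below, they can be overridden from a user-side config without touching the module, e.g. via `nextflow run ... -c custom.config`. A hypothetical override (the option values are illustrative only, reusing flags already present above):

```nextflow
// custom.config — hypothetical user-side override of the VADR module options
process {
    withName: 'VADR' {
        ext.args  = '--minlen 50'                              // forwarded to fasta-trim-terminal-ambigs.pl
        ext.args2 = '--split --cpu 4 -r --atgonly --mkey flu'  // forwarded to v-annotate.pl
    }
}
```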
4 changes: 3 additions & 1 deletion docs/output.md
@@ -31,7 +31,8 @@ results/
├── pipeline_info
├── qc_report
├── reports
└── SUMMARY_REPORT
├── SUMMARY_REPORT
└── vadr
```

## Pipeline overview
@@ -92,6 +93,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
* Assembly of influenza gene segments with (`IRMA`) using the built-in FLU module. Also, influenza typing and H/N subtype classifications are made.
* QC of consensus assembly (`IRMA_Consensus_QC`).
* Generate IRMA consensus QC report (`IRMA_Consensus_QC_Reportsheet`)
* Annotation of IRMA consensus sequences with (`VADR`)
* Influenza A type and H/N subtype classification as well as influenza B type and lineage classification using (`Abricate_Flu`). The database used in this task is [InsaFlu](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-018-0555-0).
* Generate a summary report for influenza classification results (`IRMA_Abricate_Reportsheet`).
* Gather corresponding Nextclade dataset using the Abricate_Flu classification results (`Nextclade_Variables`).
41 changes: 41 additions & 0 deletions modules/local/vadr.nf
@@ -0,0 +1,41 @@
process VADR {
tag "$meta.id"
label 'process_medium'

container 'quay.io/staphb/vadr:1.6.3'

input:
tuple val(meta), path(assembly)

output:
tuple val(meta), path("${meta.id}/") , optional:true, emit: vadr
path "*.vadr.log" , emit: log

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args2 = task.ext.args2 ?: ""
def prefix = task.ext.prefix ?: "${meta.id}"
def vadr_log = "${meta.id}.vadr.log"

"""
fasta-trim-terminal-ambigs.pl \\
$args \\
$assembly > ${meta.id}.vadr_trimmed.fasta
v-annotate.pl \\
$args2 \\
${meta.id}.vadr_trimmed.fasta \\
$meta.id
# Soft link for traceability
ln -s .command.log $vadr_log
cat <<-END_VERSIONS > versions.yml
"${task.process}":
vadr: \$(vadr --version 2>&1 | sed 's/^.*vadr //')
END_VERSIONS
"""
}
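For orientation, the new module could be exercised on its own with a small driver workflow such as the sketch below. The include path, sample ID, and FASTA location are placeholders; the channel shape mirrors the module's `tuple val(meta), path(assembly)` input, and the `params.skip_vadr` gate matches the subworkflow change further down.

```nextflow
// Hypothetical standalone driver for the new VADR module (sketch only)
nextflow.enable.dsl = 2

params.skip_vadr = false

include { VADR } from './modules/local/vadr.nf'   // path assumed relative to the repo root

workflow {
    // [ meta map, consensus FASTA ] — placeholder sample ID and file path
    ch_assembly = Channel.of(
        [ [ id: 'sample1' ], file('sample1.irma_consensus.fasta') ]
    )

    if ( !params.skip_vadr ) {
        VADR(ch_assembly)
        VADR.out.vadr.view()   // optional per-sample annotation directory
        VADR.out.log.view()    // <sample>.vadr.log soft link to .command.log
    }
}
```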
2 changes: 2 additions & 0 deletions nextflow.config
@@ -59,6 +59,8 @@ params {
irma_module = "FLU"
genome_length = 13500
keep_ref_deletions = true
skip_ncbi_sra_human_scrubber = false
skip_vadr = false
skip_kraken2 = false
skip_nextclade = false
adapters_fasta = 'https://raw.githubusercontent.com/BioInfoTools/BBMap/master/resources/adapters.fa'
10 changes: 10 additions & 0 deletions nextflow_schema.json
@@ -315,6 +315,16 @@
"default": false,
"description": "Skip Kraken2 option."
},
"skip_ncbi_sra_human_scrubber": {
"type": "boolean",
"default": false,
"description": "Skip NCBI SRA human scrubber option."
},
"skip_vadr": {
"type": "boolean",
"default": false,
"description": "Skip the VADR module option."
},
"skip_nextclade": {
"type": "boolean",
"default": false,
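Both new flags default to `false`, so existing invocations behave as before; the corresponding steps can be switched off on the command line (e.g. `--skip_vadr --skip_ncbi_sra_human_scrubber`) or in a configuration file. A hypothetical config-file override:

```nextflow
// Hypothetical overrides, e.g. in a file passed with -c; equivalent to the --skip_* CLI flags
params {
    skip_ncbi_sra_human_scrubber = true   // skip human-read scrubbing of input FASTQs
    skip_vadr                    = true   // skip VADR annotation of IRMA consensus sequences
    skip_kraken2                 = false  // pre-existing flag, shown for context
}
```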
30 changes: 17 additions & 13 deletions subworkflows/local/assembly_typing_clade_variables.nf
@@ -7,6 +7,7 @@
include { IRMA } from '../../modules/local/irma.nf'
include { IRMA_CONSENSUS_QC } from '../../modules/local/irma_consensus_qc.nf'
include { IRMA_CONSENSUS_QC_REPORTSHEET } from '../../modules/local/irma_consensus_qc_reportsheet.nf'
include { VADR } from '../../modules/local/vadr.nf'
include { ABRICATE_FLU } from '../../modules/local/abricate_flu.nf'
include { IRMA_ABRICATE_REPORT } from '../../modules/local/irma_abricate_report'
include { IRMA_ABRICATE_REPORTSHEET } from '../../modules/local/irma_abricate_reportsheet.nf'
@@ -18,7 +19,7 @@ include { NEXTCLADE_VARIABLES } from '../../modules/local/nextc
============================================================================================================
*/

def irma_module = 'FLU'
def irma_module = ''
if (params.irma_module) {
irma_module = params.irma_module
}
@@ -34,11 +35,10 @@ workflow ASSEMBLY_TYPING_CLADE_VARIABLES {
clean_reads // file: /path/to/BBMAP_BBDUK/'*.clean*.fastq.gz'

main:
ch_versions = Channel.empty()
ch_assembly = Channel.empty()
ch_HA = Channel.empty()
ch_NA = Channel.empty()
ch_dataset = Channel.empty()
ch_versions = Channel.empty()
ch_assembly = Channel.empty()
ch_HA = Channel.empty()
ch_NA = Channel.empty()

IRMA(clean_reads, irma_module)
ch_assembly = IRMA.out.assembly
@@ -66,6 +66,10 @@ workflow ASSEMBLY_TYPING_CLADE_VARIABLES {
IRMA_CONSENSUS_QC_REPORTSHEET(ch_irma_consensus_qc_results)
irma_consensus_qc_tsv = IRMA_CONSENSUS_QC_REPORTSHEET.out.irma_consensus_qc_tsv

if ( !params.skip_vadr ) {
VADR(IRMA.out.assembly)
}

ABRICATE_FLU(IRMA.out.assembly)
ch_versions = ch_versions.mix(ABRICATE_FLU.out.versions)

@@ -104,12 +108,12 @@
)

emit:
HA = IRMA.out.HA
NA = IRMA.out.NA
typing_report_tsv = IRMA_ABRICATE_REPORTSHEET.out.typing_report_tsv
irma_consensus_qc_tsv = IRMA_CONSENSUS_QC_REPORTSHEET.out.irma_consensus_qc_tsv
assembly = ch_assembly
dataset = ch_dataset
versions = ch_versions
HA = IRMA.out.HA
NA = IRMA.out.NA
typing_report_tsv = IRMA_ABRICATE_REPORTSHEET.out.typing_report_tsv
irma_consensus_qc_tsv = IRMA_CONSENSUS_QC_REPORTSHEET.out.irma_consensus_qc_tsv
assembly = ch_assembly
dataset = ch_dataset
versions = ch_versions

}
8 changes: 5 additions & 3 deletions subworkflows/local/preprocessing_read_qc.nf
@@ -32,8 +32,10 @@ workflow PREPROCESSING_READ_QC {
ch_kraken2reportsheet = Channel.empty()
ch_kraken2_reportsheet_tsv = Channel.empty()

NCBI_SRA_HUMAN_SCRUBBER(reads)
ch_versions = ch_versions.mix(NCBI_SRA_HUMAN_SCRUBBER.out.versions)
if ( !params.skip_ncbi_sra_human_scrubber ) {
NCBI_SRA_HUMAN_SCRUBBER(reads)
ch_versions = ch_versions.mix(NCBI_SRA_HUMAN_SCRUBBER.out.versions)
}

SEQKIT_PAIR(reads)
ch_versions = ch_versions.mix(SEQKIT_PAIR.out.versions)
@@ -51,7 +53,7 @@ workflow PREPROCESSING_READ_QC {
ch_versions = ch_versions.mix(QC_REPORT.out.versions)

if ( !params.skip_kraken2 ) {
KRAKEN2_KRAKEN2(reads, db, false, true)
KRAKEN2_KRAKEN2(BBMAP_BBDUK.out.clean_reads, db, false, true)
ch_versions = ch_versions.mix(KRAKEN2_KRAKEN2.out.versions)

ch_kraken2report_summary_input = KRAKEN2_KRAKEN2.out.txt
4 changes: 2 additions & 2 deletions workflows/walkercreek.nf
@@ -206,7 +206,7 @@ workflow WALKERCREEK {
/*
SUBWORKFLOW: ASSEMBLY_TYPING_CLADE_VARIABLES - assembly, flu typing/subtyping, and Nextclade variable determination based upon flu 'abricate_subtype'
*/
ASSEMBLY_TYPING_CLADE_VARIABLES(ch_all_reads)
ASSEMBLY_TYPING_CLADE_VARIABLES(PREPROCESSING_READ_QC.out.clean_reads)
ch_assembly = ASSEMBLY_TYPING_CLADE_VARIABLES.out.assembly
ch_HA = ASSEMBLY_TYPING_CLADE_VARIABLES.out.HA
ch_NA = ASSEMBLY_TYPING_CLADE_VARIABLES.out.NA
@@ -229,7 +229,7 @@
//
// MODULE: Run FastQC
//
FASTQC (ch_all_reads)
FASTQC (PREPROCESSING_READ_QC.out.clean_reads)
ch_versions = ch_versions.mix(FASTQC.out.versions.first())

//
