Skip to content

Commit

Permalink
Merge pull request #50 from TRON-Bioinformatics/49_add_subworkflow_sn…
Browse files Browse the repository at this point in the history
…peff_custom_genome

49 add subworkflow snpeff custom genome
  • Loading branch information
priesgo authored Sep 10, 2023
2 parents 6ae7320 + 463b84e commit 7a2b155
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 21 deletions.
23 changes: 16 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,24 @@ No additional parameter needs to be provided to use the default SARS-CoV-2 refer

#### Using a custom reference genome

These references can be customised to use a different SARS-CoV-2 reference or to analyse a different virus.
Two files need to be provided:
These references can be customised to use a different SARS-CoV-2 reference or to analyse a different virus.
Two files need to be provided:

- Use a custom reference genome by providing the parameter `--reference your.fasta`.
- Gene annotation file in GFFv3 format `--gff your.gff`.
- Gene annotation file in GFFv3 format `--gff your.gff`.

As well as an organism name for the SnpEff annotation:

- Organism name to use in custom SnpEff database `--snpeff_organism`.

Additionally, the FASTA needs bwa-mem2 indexes, .fai index and a .dict index.
These indexes can be generated with the following two commands:
**NOTE**: beware that for Nextflow to find these files the reference needs to be passed as an absolute path.

When a custom reference genome is provided, the pipeline automatically generates a bwa-mem2 index
in the case of fastq input and a custom SnpEff database for functional annotations using the steps
described below. The steps are listed here for documentation purposes and should __not be performed manually__.

The reference genome FASTA needs to be indexed for different components of the pipeline.
The bwa-mem2 indices, the .fai index and the .dict sequence dictionary can be generated with the following three commands:
```
bwa-mem2 index reference.fasta
samtools faidx reference.fasta
Expand All @@ -194,8 +205,6 @@ When running CoVigator you will also need to provide three parameters:
- `--snpeff_data`: path to the SnpEff data folder
- `--snpeff_config`: path to the SnpEff config file

**NOTE**: beware that for Nextflow to find these indices the reference needs to be passed as an absolute path.

**Limitations**

- The SARS-CoV-2 specific annotations (i.e. ConsHMM conservation and SARS-CoV-2 protein domains) will be skipped when
Expand Down
2 changes: 1 addition & 1 deletion bin/add_sample_to_vcf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import argparse
import gzip

Expand Down
2 changes: 1 addition & 1 deletion bin/assembly_variant_caller.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import os
from argparse import ArgumentParser
from dataclasses import dataclass
Expand Down
2 changes: 1 addition & 1 deletion bin/ivar2vcf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import os
from argparse import ArgumentParser
from typing import List
Expand Down
2 changes: 1 addition & 1 deletion bin/phasing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
import argparse
import logging

Expand Down
1 change: 1 addition & 0 deletions bin/reference_genome.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
from pysam import FastaFile


Expand Down
38 changes: 30 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

nextflow.enable.dsl = 2


include { BWA_INDEX; SNPEFF_DATABASE } from './modules/00_prepare_annotation'
include { READ_TRIMMING_PAIRED_END; READ_TRIMMING_SINGLE_END } from './modules/01_fastp'
include { ALIGNMENT_PAIRED_END; ALIGNMENT_SINGLE_END } from './modules/02_bwa'
include { BAM_PREPROCESSING; COVERAGE_ANALYSIS; PRIMER_TRIMMING_IVAR } from './modules/03_bam_preprocessing'
Expand Down Expand Up @@ -34,10 +34,12 @@ params.skip_normalization = false
// references
params.reference = false
params.gff = false
params.snpeff_data = false
params.snpeff_config = false
// These are now generated by the SNPEFF_DATABASE process when a custom reference is used
//params.snpeff_data = false
//params.snpeff_config = false
params.snpeff_organism = false
params.primers = false
params.different_virus = false

params.output = "."
params.min_mapping_quality = 20
Expand Down Expand Up @@ -88,20 +90,28 @@ if (params.reference == false) {
skip_sarscov2_annotations = params.skip_sarscov2_annotations
}
else {
log.info "Using custom reference genome: ${params.reference}"
log.info "Using custom reference genome: ${params.reference}. Preparing references to be used with covigator pipeline"
reference = params.reference // do not put into a file as we need the indices
gff = params.gff ? file(params.gff) : false
snpeff_data = params.snpeff_data
snpeff_config = params.snpeff_config
snpeff_organism = params.snpeff_organism
snpeff_data = false
snpeff_config = false
skip_sarscov2_annotations = true
}

primers = params.primers ? file(params.primers) : false

skip_snpeff = false
if (! snpeff_data || ! snpeff_config || ! snpeff_organism) {
log.info "Skipping SnpEff annotation as either --snpeff_data, --snpeff_config or --snpeff_organism was not provided"

// The SnpEff skip check differs between the default reference and a custom reference.
// For the default reference genome, skip the SnpEff annotation if any of the SnpEff arguments is missing
if (!params.reference && (! snpeff_data || ! snpeff_config || ! snpeff_organism)) {
log.info "Skipping SnpEff annotation as either --snpeff_data, --snpeff_config or --snpeff_organism was not provided"
skip_snpeff = true
}
// For a custom genome skip the snpeff annotation if params gff or snpeff organism are not provided
else if (params.reference && (! gff || ! snpeff_organism)) {
log.info "Skipping SnpEff annotation as either --gff or --snpeff_organism was not provided to build database"
skip_snpeff = true
}

Expand Down Expand Up @@ -232,7 +242,19 @@ if (params.skip_bcftools && params.skip_gatk && params.skip_ivar && params.skip_


workflow {
if (params.reference) {
if (! skip_snpeff && gff) {
SNPEFF_DATABASE(reference, gff, snpeff_organism)
snpeff_data = SNPEFF_DATABASE.out.snpeff_data
snpeff_config = SNPEFF_DATABASE.out.snpeff_config
}

}
if (input_fastqs) {
if (params.reference) {
BWA_INDEX(reference)
reference = BWA_INDEX.out.reference
}
if (library == "paired") {
READ_TRIMMING_PAIRED_END(input_fastqs)
ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference)
Expand Down
62 changes: 62 additions & 0 deletions modules/00_prepare_annotation.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Module-level defaults for the reference preparation processes below.
// memory: value passed to the `memory` directive of both processes
params.memory = "3g"
// cpus: value passed to the `cpus` directive of both processes
params.cpus = 1
// output: folder used by `publishDir` to copy the declared process outputs
params.output = "."


/*
 * Prepares a custom reference genome for the rest of the pipeline.
 *
 * The FASTA given in `reference` is copied to reference/sequences.fa and indexed with
 * bwa-mem2, samtools faidx and GATK CreateSequenceDictionary.
 *
 * Input:
 *   reference  path to the reference FASTA, received as a plain value (`val`)
 *
 * Output:
 *   reference  the copied FASTA (reference/sequences.fa)
 *   fai        the samtools index (reference/sequences.fa.fai)
 *   gatk_dict  the sequence dictionary (reference/sequences.dict)
 *
 * NOTE(review): the bwa-mem2 index files are written next to sequences.fa but are not
 * declared as outputs, so they are presumably not copied by publishDir -- confirm that
 * downstream alignment processes locate them through the emitted FASTA path.
 */
process BWA_INDEX {
    cpus params.cpus
    memory params.memory
    publishDir "${params.output}", mode: "copy"
    // tag with the only input of this process; the previous "${name}" is not declared here
    tag "${reference}"

    conda (params.enable_conda ? "bioconda::bwa-mem2=2.2.1 bioconda::samtools=1.12 bioconda::gatk4=4.2.0.0" : null)

    input:
        val(reference)

    output:
        path("reference/sequences.fa"), emit: reference
        path("reference/sequences.fa.fai"), emit: fai
        path("reference/sequences.dict"), emit: gatk_dict

    script:
    // the source path is quoted so that locations containing spaces are copied correctly
    """
    mkdir -p reference
    cp "${reference}" reference/sequences.fa
    bwa-mem2 index reference/sequences.fa
    samtools faidx reference/sequences.fa
    gatk CreateSequenceDictionary --REFERENCE reference/sequences.fa
    """
}

/*
 * Builds a custom SnpEff database from a reference FASTA and a GFF3 annotation.
 *
 * A minimal snpEff.config with a single genome entry is written into ./snpeff, the FASTA
 * and GFF are copied into snpeff/<snpeff_organism>/ as sequences.fa and genes.gff, and
 * `snpEff build -gff3` is run from inside that folder.
 *
 * Input:
 *   reference        path to the reference FASTA, received as a plain value (`val`)
 *   gff              path to the gene annotation in GFF3 format, received as a plain value
 *   snpeff_organism  genome name used for the database folder and the config entry
 *
 * Output:
 *   snpeff_config  the generated config file (snpeff/snpEff.config)
 *   snpeff_data    the snpeff/ folder holding the built database
 */
process SNPEFF_DATABASE {
    cpus params.cpus
    memory params.memory
    publishDir "${params.output}", mode: "copy"
    // tag with the organism name; the previous "${name}" is not declared in this process
    tag "${snpeff_organism}"

    conda (params.enable_conda ? "bioconda::snpeff=5.0 bioconda::samtools=1.12" : null)

    input:
        val(reference)
        val(gff)
        val(snpeff_organism)

    output:
        path("snpeff/snpEff.config"), emit: snpeff_config
        path("snpeff/"), emit: snpeff_data

    script:
    // interpolated paths are quoted so that locations containing spaces are handled correctly
    """
    mkdir -p "snpeff/${snpeff_organism}"
    echo "${snpeff_organism}.genome : ${snpeff_organism}" > snpeff/snpEff.config
    cp "${reference}" "snpeff/${snpeff_organism}/sequences.fa"
    cp "${gff}" "snpeff/${snpeff_organism}/genes.gff"
    cd snpeff
    snpEff build -gff3 -v "${snpeff_organism}" -dataDir .
    """
}

4 changes: 2 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ profiles {
dag.enabled = false
}
test_fasta {
params.fasta = "$baseDir/test_data/test_data.fasta"
params.fasta = "$baseDir/tests/test_data/test_data.fasta"
params.name = "test"
params.output = "covigator_fasta_test"
}
test_fastq {
params.fastq1 = "$baseDir/test_data/test_data_1.fastq.gz"
params.fastq1 = "$baseDir/tests/test_data/test_data_1.fastq.gz"
params.name = "test"
params.output = "covigator_fastq_test"
}
Expand Down

0 comments on commit 7a2b155

Please sign in to comment.