diff --git a/README.md b/README.md
index e240f5a..1c230f5 100755
--- a/README.md
+++ b/README.md
@@ -171,13 +171,24 @@ No additional parameter needs to be provided to use the default SARS-CoV-2 refer
 
 #### Using a custom reference genome
 
-These references can be customised to use a different SARS-CoV-2 reference or to analyse a different virus.
-Two files need to be provided:
+These references can be customised to use a different SARS-CoV-2 reference or to analyse a different virus.
+Two files need to be provided:
+
 - Use a custom reference genome by providing the parameter `--reference your.fasta`.
-- Gene annotation file in GFFv3 format `--gff your.gff`.
+- Gene annotation file in GFFv3 format `--gff your.gff`.
+
+As well as an organism name for the SnpEff annotation:
+
+- Organism name to use in custom SnpEff database `--snpeff_organism`.
 
-Additionally, the FASTA needs bwa-mem2 indexes, .fai index and a .dict index.
-These indexes can be generated with the following two commands:
+**NOTE**: beware that for Nextflow to find these files the reference needs to be passed as an absolute path.
+
+When a custom reference genome is provided, the pipeline automatically generates a bwa-mem2 index
+in the case of fastq input and a custom SnpEff database for functional annotations using the steps
+described below. The steps are listed here for documentation purposes and should __not be performed manually__.
+
+The reference genome FASTA needs to be indexed for different components of the pipeline.
+bwa-mem2 indices, .fai index and a .dict index can be generated with the following three commands:
 ```
 bwa-mem2 index reference.fasta
 samtools faidx reference.fasta
@@ -194,8 +205,6 @@ When running CoVigator you will also need to provide three parameters:
 - `--snpeff_data`: path to the SnpEff data folder
 - `--snpeff_config`: path to the SnpEff config file
 
-**NOTE**: beware that for Nextflow to find these indices the reference needs to be passed as an absolute path.
-
 **Limitations**
 
 - The SARS-CoV-2 specific annotations (ie: ConsHMM conservation and SARS-CoV-2 protein domains) will be skipped when
diff --git a/bin/add_sample_to_vcf.py b/bin/add_sample_to_vcf.py
index b291c90..b923958 100755
--- a/bin/add_sample_to_vcf.py
+++ b/bin/add_sample_to_vcf.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import argparse
 import gzip
diff --git a/bin/assembly_variant_caller.py b/bin/assembly_variant_caller.py
index b223f74..f20ae13 100755
--- a/bin/assembly_variant_caller.py
+++ b/bin/assembly_variant_caller.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 from argparse import ArgumentParser
 from dataclasses import dataclass
diff --git a/bin/ivar2vcf.py b/bin/ivar2vcf.py
index 07f6615..e837584 100755
--- a/bin/ivar2vcf.py
+++ b/bin/ivar2vcf.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 from argparse import ArgumentParser
 from typing import List
diff --git a/bin/phasing.py b/bin/phasing.py
index a2c2bdb..13603ed 100755
--- a/bin/phasing.py
+++ b/bin/phasing.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import argparse
 import logging
diff --git a/bin/reference_genome.py b/bin/reference_genome.py
index 1596b81..bfc8206 100755
--- a/bin/reference_genome.py
+++ b/bin/reference_genome.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 from pysam import FastaFile
 
 
diff --git a/main.nf b/main.nf
index 93beb95..830d7c6 100755
--- a/main.nf
+++ b/main.nf
@@ -2,7 +2,7 @@
 
 nextflow.enable.dsl = 2
 
-
+include { BWA_INDEX; SNPEFF_DATABASE } from './modules/00_prepare_annotation'
 include { READ_TRIMMING_PAIRED_END; READ_TRIMMING_SINGLE_END } from './modules/01_fastp'
 include { ALIGNMENT_PAIRED_END; ALIGNMENT_SINGLE_END } from './modules/02_bwa'
 include { BAM_PREPROCESSING; COVERAGE_ANALYSIS; PRIMER_TRIMMING_IVAR } from './modules/03_bam_preprocessing'
@@ -34,10 +34,12 @@ params.skip_normalization = false
 // references
 params.reference = false
 params.gff = false
-params.snpeff_data = false
-params.snpeff_config = false
+// No longer user parameters: with a custom reference both are produced by SNPEFF_DATABASE
+//params.snpeff_data = false
+//params.snpeff_config = false
 params.snpeff_organism = false
 params.primers = false
+params.different_virus = false
 
 params.output = "."
 params.min_mapping_quality = 20
@@ -88,20 +90,28 @@ if (params.reference == false) {
     skip_sarscov2_annotations = params.skip_sarscov2_annotations
 }
 else {
-    log.info "Using custom reference genome: ${params.reference}"
+    log.info "Using custom reference genome: ${params.reference}. Preparing references to be used with covigator pipeline"
     reference = params.reference // do not put into a file as we need the indices
     gff = params.gff ? file(params.gff) : false
-    snpeff_data = params.snpeff_data
-    snpeff_config = params.snpeff_config
     snpeff_organism = params.snpeff_organism
+    snpeff_data = false
+    snpeff_config = false
     skip_sarscov2_annotations = true
 }
 
 primers = params.primers ? file(params.primers) : false
 
 skip_snpeff = false
-if (! snpeff_data || ! snpeff_config || ! snpeff_organism) {
-    log.info "Skipping SnpEff annotation as either --snpeff_data, --snpeff_config or --snpeff_organism was not provided"
+
+// The skip condition differs between the default reference and a custom reference
+// Default reference: skip the SnpEff annotation when the data folder, config or organism is missing
+if (!params.reference && (! snpeff_data || ! snpeff_config || ! snpeff_organism)) {
+    log.info "Skipping SnpEff annotation as either --snpeff_data, --snpeff_config or --snpeff_organism was not provided"
+    skip_snpeff = true
+}
+// Custom reference: skip the SnpEff annotation when --gff or --snpeff_organism is missing, as no database can be built
+else if (params.reference && (! gff || ! snpeff_organism)) {
+    log.info "Skipping SnpEff annotation as either --gff or --snpeff_organism was not provided to build database"
     skip_snpeff = true
 }
 
@@ -232,7 +242,19 @@ if (params.skip_bcftools && params.skip_gatk && params.skip_ivar && params.skip_
 
 
 workflow {
+    if (params.reference) {
+        if (! skip_snpeff && gff) {
+            SNPEFF_DATABASE(reference, gff, snpeff_organism)
+            snpeff_data = SNPEFF_DATABASE.out.snpeff_data
+            snpeff_config = SNPEFF_DATABASE.out.snpeff_config
+        }
+
+    }
     if (input_fastqs) {
+        if (params.reference) {
+            BWA_INDEX(reference)
+            reference = BWA_INDEX.out.reference
+        }
         if (library == "paired") {
             READ_TRIMMING_PAIRED_END(input_fastqs)
             ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference)
diff --git a/modules/00_prepare_annotation.nf b/modules/00_prepare_annotation.nf
new file mode 100755
index 0000000..f8f0ce8
--- /dev/null
+++ b/modules/00_prepare_annotation.nf
@@ -0,0 +1,81 @@
+params.memory = "3g"
+params.cpus = 1
+params.output = "."
+
+
+/*
+ * Prepares the custom reference for alignment: copies the FASTA to
+ * reference/sequences.fa and builds the bwa-mem2 index, the samtools .fai
+ * index and the GATK sequence dictionary next to it.
+ *
+ * Input:  reference - path of the reference FASTA, received as a plain value
+ * Output: reference (the copied FASTA), fai (.fai index), gatk_dict (.dict file)
+ */
+process BWA_INDEX {
+    cpus params.cpus
+    memory params.memory
+    publishDir "${params.output}", mode: "copy"
+    // NOTE(review): this process declares no `name` input - confirm what `name` resolves to here
+    tag "${name}"
+
+    conda (params.enable_conda ? "bioconda::bwa-mem2=2.2.1 bioconda::samtools=1.12 bioconda::gatk4=4.2.0.0" : null)
+
+    input:
+    val(reference)
+
+    output:
+    path("reference/sequences.fa"), emit: reference
+    path("reference/sequences.fa.fai"), emit: fai
+    path("reference/sequences.dict"), emit: gatk_dict
+
+    script:
+    // NOTE(review): `memory` is computed but not referenced by the commands below
+    memory = "${params.memory}".replaceAll(" ", "").toLowerCase()
+    """
+    mkdir -p reference
+    cp "${reference}" reference/sequences.fa
+    bwa-mem2 index reference/sequences.fa
+    samtools faidx reference/sequences.fa
+    gatk CreateSequenceDictionary --REFERENCE reference/sequences.fa
+    """
+}
+
+/*
+ * Builds a custom SnpEff database under snpeff/<snpeff_organism> from the
+ * reference FASTA and the GFF3 annotation, next to a one-line snpEff.config.
+ *
+ * Input:  reference (FASTA path), gff (GFF3 path), snpeff_organism (database name)
+ * Output: snpeff_config (snpeff/snpEff.config), snpeff_data (the snpeff/ folder)
+ */
+process SNPEFF_DATABASE {
+    cpus params.cpus
+    memory params.memory
+    publishDir "${params.output}", mode: "copy"
+    // NOTE(review): this process declares no `name` input - confirm what `name` resolves to here
+    tag "${name}"
+
+    conda (params.enable_conda ? "bioconda::snpeff=5.0 bioconda::samtools=1.12" : null)
+
+    input:
+    val(reference)
+    val(gff)
+    val(snpeff_organism)
+
+    output:
+    path("snpeff/snpEff.config"), emit: snpeff_config
+    path("snpeff/"), emit: snpeff_data
+
+    script:
+    // NOTE(review): `memory` is computed but not referenced by the commands below
+    memory = "${params.memory}".replaceAll(" ", "").toLowerCase()
+    """
+    mkdir -p "snpeff/${snpeff_organism}"
+    echo "${snpeff_organism}.genome : ${snpeff_organism}" > snpeff/snpEff.config
+    cp "${reference}" "snpeff/${snpeff_organism}/sequences.fa"
+    cp "${gff}" "snpeff/${snpeff_organism}/genes.gff"
+    cd snpeff
+    snpEff build -gff3 -v "${snpeff_organism}" -dataDir .
+    """
+
+}
+
diff --git a/nextflow.config b/nextflow.config
index cba16a1..6ed6ce9 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -40,12 +40,12 @@ profiles {
         dag.enabled = false
     }
     test_fasta {
-        params.fasta = "$baseDir/test_data/test_data.fasta"
+        params.fasta = "$baseDir/tests/test_data/test_data.fasta"
         params.name = "test"
         params.output = "covigator_fasta_test"
     }
     test_fastq {
-        params.fastq1 = "$baseDir/test_data/test_data_1.fastq.gz"
+        params.fastq1 = "$baseDir/tests/test_data/test_data_1.fastq.gz"
         params.name = "test"
         params.output = "covigator_fastq_test"
     }