diff --git a/Makefile b/Makefile index a10ec03..4589a34 100755 --- a/Makefile +++ b/Makefile @@ -22,4 +22,7 @@ test: bash tests/scripts/test_11.sh bash tests/scripts/test_12.sh bash tests/scripts/test_13.sh + bash tests/scripts/test_14.sh + bash tests/scripts/test_15.sh + bash tests/scripts/test_16.sh bash tests/scripts/test_python_unit_tests.sh diff --git a/main.nf b/main.nf index 830d7c6..c3cbf6a 100755 --- a/main.nf +++ b/main.nf @@ -30,6 +30,7 @@ params.skip_bcftools = false params.skip_gatk = false params.skip_pangolin = false params.skip_normalization = false +params.lineage_mode = false // references params.reference = false @@ -39,7 +40,7 @@ params.gff = false //params.snpeff_config = false params.snpeff_organism = false params.primers = false -params.different_virus = false +params.reference_generate = false params.output = "." params.min_mapping_quality = 20 @@ -206,6 +207,11 @@ else if (params.input_vcfs_list != false || params.vcf != false) { .set { input_vcfs } } } +else if(params.reference_generate) { + genome = params.reference + annotation = gff + annotation_name = snpeff_organism +} else { log.error "missing some input data" exit 1 @@ -240,119 +246,154 @@ if (params.skip_bcftools && params.skip_gatk && params.skip_ivar && params.skip_ exit 1 } +// Lineage mode should only be used when running with assembly or VCF input +// For FASTQ input we can run pangolin as regular rule +if (params.lineage_mode && input_fastqs) { + log.error "lineage mode is only supported with fasta or VCF input" + exit 1 +} workflow { - if (params.reference) { - if (! skip_snpeff && gff) { - SNPEFF_DATABASE(reference, gff, snpeff_organism) - snpeff_data = SNPEFF_DATABASE.out.snpeff_data - snpeff_config = SNPEFF_DATABASE.out.snpeff_config - } - - } - if (input_fastqs) { - if (params.reference) { - BWA_INDEX(reference) - reference = BWA_INDEX.out.reference + if (params.lineage_mode) { + log.info "Running lineage mode. In this mode only Pangolin is executed on either fasta or VCF" + if (input_fastas) { + PANGOLIN_LINEAGE(input_fastas) } - if (library == "paired") { - READ_TRIMMING_PAIRED_END(input_fastqs) - ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference) - bam_files = ALIGNMENT_PAIRED_END.out + else if (input_vcfs) { + if (! params.skip_normalization) { + VARIANT_NORMALIZATION(input_vcfs, reference) + normalized_vcfs = VARIANT_NORMALIZATION.out + } + else { + normalized_vcfs = input_vcfs + } + VCF2FASTA(normalized_vcfs, reference) + PANGOLIN_LINEAGE(VCF2FASTA.out) } - else { - READ_TRIMMING_SINGLE_END(input_fastqs) - ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) - bam_files = ALIGNMENT_SINGLE_END.out + } + else if (params.reference_generate) { + log.info "Running genome/annotation generate mode. This step generates only bwa and snpEff databases..." + // Custom reference mode. This mode is added to prepare the reference just once when processing many samples + if (genome) { + BWA_INDEX(genome) + if (!skip_snpeff) { + SNPEFF_DATABASE(genome, gff, snpeff_organism) + } } - BAM_PREPROCESSING(bam_files, reference) - preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams + } + else { + // Run genome and annotation steps for single sample + if (params.reference) { + if (! skip_snpeff && gff) { + SNPEFF_DATABASE(reference, gff, snpeff_organism) + snpeff_data = SNPEFF_DATABASE.out.snpeff_data + snpeff_config = SNPEFF_DATABASE.out.snpeff_config + } - if (primers) { - PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) - preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam } - COVERAGE_ANALYSIS(preprocessed_bams) - - // variant calling - vcfs_to_normalize = null - if (!params.skip_bcftools) { - VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference) - vcfs_to_normalize = vcfs_to_normalize == null? - VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out) + if (input_fastqs) { + if (params.reference) { + BWA_INDEX(reference) + reference = BWA_INDEX.out.reference + } + if (library == "paired") { + READ_TRIMMING_PAIRED_END(input_fastqs) + ALIGNMENT_PAIRED_END(READ_TRIMMING_PAIRED_END.out[0], reference) + bam_files = ALIGNMENT_PAIRED_END.out + } + else { + READ_TRIMMING_SINGLE_END(input_fastqs) + ALIGNMENT_SINGLE_END(READ_TRIMMING_SINGLE_END.out[0], reference) + bam_files = ALIGNMENT_SINGLE_END.out + } + BAM_PREPROCESSING(bam_files, reference) + preprocessed_bams = BAM_PREPROCESSING.out.preprocessed_bams + + if (primers) { + PRIMER_TRIMMING_IVAR(preprocessed_bams, primers) + preprocessed_bams = PRIMER_TRIMMING_IVAR.out.trimmed_bam + } + COVERAGE_ANALYSIS(preprocessed_bams) + + // variant calling + vcfs_to_normalize = null + if (!params.skip_bcftools) { + VARIANT_CALLING_BCFTOOLS(preprocessed_bams, reference) + vcfs_to_normalize = vcfs_to_normalize == null? + VARIANT_CALLING_BCFTOOLS.out : vcfs_to_normalize.concat(VARIANT_CALLING_BCFTOOLS.out) + } + if (!params.skip_lofreq) { + VARIANT_CALLING_LOFREQ(preprocessed_bams, reference) + vcfs_to_normalize = vcfs_to_normalize == null? + VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out) + } + if (!params.skip_gatk) { + VARIANT_CALLING_GATK(preprocessed_bams, reference) + vcfs_to_normalize = vcfs_to_normalize == null? + VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out) + } + if (!params.skip_ivar && gff) { + VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff) + IVAR2VCF(VARIANT_CALLING_IVAR.out, reference) + vcfs_to_normalize = vcfs_to_normalize == null? + IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out) + } } - if (!params.skip_lofreq) { - VARIANT_CALLING_LOFREQ(preprocessed_bams, reference) - vcfs_to_normalize = vcfs_to_normalize == null? - VARIANT_CALLING_LOFREQ.out : vcfs_to_normalize.concat(VARIANT_CALLING_LOFREQ.out) + else if (input_fastas) { + if (!params.skip_pangolin) { + // pangolin from fasta + PANGOLIN_LINEAGE(input_fastas) + } + // assembly variant calling + VARIANT_CALLING_ASSEMBLY(input_fastas, reference) + vcfs_to_normalize = VARIANT_CALLING_ASSEMBLY.out } - if (!params.skip_gatk) { - VARIANT_CALLING_GATK(preprocessed_bams, reference) - vcfs_to_normalize = vcfs_to_normalize == null? - VARIANT_CALLING_GATK.out : vcfs_to_normalize.concat(VARIANT_CALLING_GATK.out) + else if (input_vcfs) { + vcfs_to_normalize = input_vcfs } - if (!params.skip_ivar && gff) { - VARIANT_CALLING_IVAR(preprocessed_bams, reference, gff) - IVAR2VCF(VARIANT_CALLING_IVAR.out, reference) - vcfs_to_normalize = vcfs_to_normalize == null? - IVAR2VCF.out : vcfs_to_normalize.concat(IVAR2VCF.out) + + if (! params.skip_normalization) { + VARIANT_NORMALIZATION(vcfs_to_normalize, reference) + normalized_vcfs = VARIANT_NORMALIZATION.out } - } - else if (input_fastas) { - if (!params.skip_pangolin) { - // pangolin from fasta - PANGOLIN_LINEAGE(input_fastas) + else { + normalized_vcfs = vcfs_to_normalize } - // assembly variant calling - VARIANT_CALLING_ASSEMBLY(input_fastas, reference) - vcfs_to_normalize = VARIANT_CALLING_ASSEMBLY.out - } - else if (input_vcfs) { - vcfs_to_normalize = input_vcfs - } - - if (! params.skip_normalization) { - VARIANT_NORMALIZATION(vcfs_to_normalize, reference) - normalized_vcfs = VARIANT_NORMALIZATION.out - } - else { - normalized_vcfs = vcfs_to_normalize - } - - if (input_fastqs || input_vcfs) { - // pangolin from VCF on the normalized VCFs - if (!params.skip_pangolin) { - VCF2FASTA(normalized_vcfs, reference) - PANGOLIN_LINEAGE(VCF2FASTA.out) + if (input_fastqs || input_vcfs) { + // pangolin from VCF on the normalized VCFs + if (!params.skip_pangolin) { + VCF2FASTA(normalized_vcfs, reference) + PANGOLIN_LINEAGE(VCF2FASTA.out) + } } - } - if (! skip_sarscov2_annotations) { - // only optionally add SARS-CoV-2 specific annotations - VARIANT_SARSCOV2_ANNOTATION(normalized_vcfs) - normalized_vcfs = VARIANT_SARSCOV2_ANNOTATION.out.annotated_vcfs - } + if (! skip_sarscov2_annotations) { + // only optionally add SARS-CoV-2 specific annotations + VARIANT_SARSCOV2_ANNOTATION(normalized_vcfs) + normalized_vcfs = VARIANT_SARSCOV2_ANNOTATION.out.annotated_vcfs + } - if (preprocessed_bams) { - // we can only add technical annotations when we have the reads - VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0)) - VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf) - normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated - } + if (preprocessed_bams) { + // we can only add technical annotations when we have the reads + VAFATOR(normalized_vcfs.combine(preprocessed_bams, by: 0)) + VARIANT_VAF_ANNOTATION(VAFATOR.out.annotated_vcf) + normalized_vcfs = VARIANT_VAF_ANNOTATION.out.vaf_annotated + } - // NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly - if (gff) { - PHASING(normalized_vcfs, reference, gff) - normalized_vcfs = PHASING.out - } + // NOTE: phasing has to happen before SnpEff annotation for MNVs to be annotated correctly + if (gff) { + PHASING(normalized_vcfs, reference, gff) + normalized_vcfs = PHASING.out + } - if (! skip_snpeff) { - // only when configured we run SnpEff - VARIANT_ANNOTATION(normalized_vcfs, snpeff_data, snpeff_config, snpeff_organism) - normalized_vcfs = VARIANT_ANNOTATION.out.annotated_vcfs - } - else { - BGZIP(normalized_vcfs) + if (! skip_snpeff) { + // only when configured we run SnpEff + VARIANT_ANNOTATION(normalized_vcfs, snpeff_data, snpeff_config, snpeff_organism) + normalized_vcfs = VARIANT_ANNOTATION.out.annotated_vcfs + } + else { + BGZIP(normalized_vcfs) + } } } diff --git a/modules/00_prepare_annotation.nf b/modules/00_prepare_annotation.nf index f8f0ce8..02a3c9f 100755 --- a/modules/00_prepare_annotation.nf +++ b/modules/00_prepare_annotation.nf @@ -18,6 +18,11 @@ process BWA_INDEX { path("reference/sequences.fa"), emit: reference path("reference/sequences.fa.fai"), emit: fai path("reference/sequences.dict"), emit: gatk_dict + path("reference/sequences.fa.0123") + path("reference/sequences.fa.amb") + path("reference/sequences.fa.ann") + path("reference/sequences.fa.bwt.2bit.64") + path("reference/sequences.fa.pac") script: memory = "${params.memory}".replaceAll(" ", "").toLowerCase() diff --git a/modules/07_lineage_annotation.nf b/modules/07_lineage_annotation.nf index 62e072c..662693c 100644 --- a/modules/07_lineage_annotation.nf +++ b/modules/07_lineage_annotation.nf @@ -19,7 +19,8 @@ process PANGOLIN_LINEAGE { when: // only runs pangolin on LoFreq and the assembly results - caller == "lofreq" || caller == "assembly" + // JoHa: added input to run lineage mode for vcf input + caller == "lofreq" || caller == "assembly" || caller == "input" shell: """ diff --git a/nextflow.config b/nextflow.config index 6ed6ce9..1e59245 100755 --- a/nextflow.config +++ b/nextflow.config @@ -29,7 +29,10 @@ params.pfam_names_header = "$baseDir/reference/pfam_names.header.txt" params.pfam_descriptions_header = "$baseDir/reference/pfam_descriptions.header.txt" profiles { - conda { params.enable_conda = true } + conda { + params.enable_conda = true + conda.enabled = true + } debug { process.beforeScript = 'echo $HOSTNAME' } test { params.cpus = 1 diff --git a/tests/scripts/test_14.sh b/tests/scripts/test_14.sh new file mode 100755 index 0000000..c2cb317 --- /dev/null +++ b/tests/scripts/test_14.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +################################################################################## +# Genome generate +################################################################################## +echo "Running CoVigator pipeline test 14" +source bin/assert.sh +output=tests/output/test14 +nextflow main.nf -profile conda --name test_data \ + --output $output \ + --reference_generate \ + --reference $(pwd)/reference/Sars_cov_2.ASM985889v3.dna.toplevel.fa \ + --gff $(pwd)/reference/Sars_cov_2.ASM985889v3.101.gff3 \ + --snpeff_organism Sars_cov_2 + +# Test reference genome related output +test -s $output/reference/sequences.fa.fai || { echo "Missing fasta index file!"; exit 1; } +test -s $output/reference/sequences.dict || { echo "Missing GATK dict file!"; exit 1; } + +# Test bwa index files are present +test -s $output/reference/sequences.fa.0123 || { echo "Missing bwa 0123 index file!"; exit 1; } +test -s $output/reference/sequences.fa.amb || { echo "Missing bwa amb index file!"; exit 1; } +test -s $output/reference/sequences.fa.ann || { echo "Missing bwa ann index file!"; exit 1; } +test -s $output/reference/sequences.fa.bwt.2bit.64 || { echo "Missing bwa bwt.2bit.64 index file!"; exit 1; } +test -s $output/reference/sequences.fa.pac || { echo "Missing bwa pac index file!"; exit 1; } + + +# Test snpEff output +test -s $output/snpeff/snpEff.config || { echo "Missing snpEff config file!"; exit 1; } +test -s $output/snpeff/Sars_cov_2/snpEffectPredictor.bin || { echo "Missing snpEff predictor bin file!"; exit 1; } +test -s $output/snpeff/Sars_cov_2/sequences.fa || { echo "Missing snpEff reference genome!"; exit 1; } +test -s $output/snpeff/Sars_cov_2/genes.gff || { echo "Missing snpEff reference annotation!"; exit 1; } diff --git a/tests/scripts/test_15.sh b/tests/scripts/test_15.sh new file mode 100755 index 0000000..5bfa53e --- /dev/null +++ b/tests/scripts/test_15.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +################################################################################## +# Lineage only mode +################################################################################## + +echo "Running CoVigator pipeline test 15" +source bin/assert.sh +output=tests/output/test15 +nextflow main.nf -profile conda --name test_data \ + --output $output \ + --vcf $(pwd)/tests/test_data/test_data.assembly.lineage_mode.vcf.gz \ + --lineage_mode \ + --skip_normalization + +test -s $output/test_data.input.fasta || { echo "Missing VCF2FASTA fasta file (lineage mode with vcf input)!"; exit 1; } +test -s $output/test_data.input.pangolin.csv || { echo "Missing pangolin output file (lineage mode with vcf input)!"; exit 1; } +assert_eq $(wc -l $output/test_data.input.pangolin.csv | cut -d ' ' -f1) 2 "Wrong number of pangolin results" + + + + diff --git a/tests/scripts/test_16.sh b/tests/scripts/test_16.sh new file mode 100755 index 0000000..b91c04d --- /dev/null +++ b/tests/scripts/test_16.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +################################################################################## +# Lineage only mode +################################################################################## + +echo "Running CoVigator pipeline test 16" +source bin/assert.sh +output=tests/output/test15 +nextflow main.nf -profile conda --name test_data \ + --output $output \ + --fasta $(pwd)/tests/test_data/test_data.fasta \ + --lineage_mode + +test -s $output/test_data.assembly.pangolin.csv || { echo "Missing pangolin output file (lineage mode with vcf input)!"; exit 1; } +assert_eq $(wc -l $output/test_data.assembly.pangolin.csv | cut -d ' ' -f1) 2 "Wrong number of pangolin results" diff --git a/tests/test_data/test_data.assembly.lineage_mode.vcf.gz b/tests/test_data/test_data.assembly.lineage_mode.vcf.gz new file mode 100644 index 0000000..d0cfaae Binary files /dev/null and b/tests/test_data/test_data.assembly.lineage_mode.vcf.gz differ