Merge pull request #8 from TRON-Bioinformatics/optional-gnomad

Optional gnomad
TRON-Bioinformatics · May 30, 2022 · eabdca2 · eabdca2
2 parents 27c98de + b0c61a2
commit eabdca2
Show file tree

Hide file tree

Showing 11 changed files with 55 additions and 48 deletions.
diff --git a/Makefile b/Makefile
@@ -21,3 +21,4 @@ test:
 	bash tests/test_05.sh
 	bash tests/test_06.sh
 	bash tests/test_08.sh
+	bash tests/test_09.sh
diff --git a/README.md b/README.md
@@ -16,9 +16,9 @@ This workflow implements the Mutect2 (Benjamin, 2019) best practices somatic var
 
 It has the following steps:
 * **Mutect2** - the somatic variant caller.
-* **Pile-up summaries** - summarizes counts of reads that support reference, alternate and other alleles for given sites.
 * **Learn read orientation model** - learn the prior probability of read orientation artifacts.
-* **Calculate contamination** - Given pileup data from GetPileupSummaries, calculates the fraction of reads coming from cross-sample contamination.
+* **Pile-up summaries** - summarizes counts of reads that support reference, alternate and other alleles for given sites (optional).
+* **Calculate contamination** - Given pileup data from GetPileupSummaries, calculates the fraction of reads coming from cross-sample contamination (optional).
 * **Filter calls** - filters mutations from the raw Mutect2 variant calls
 * **Funcotator annotation** - add functional annotations (optional)
 
@@ -51,14 +51,15 @@ Input:
     name1	tumor_bam1	normal_bam1
     name2	tumor_bam2	normal_bam2
     * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
-    * gnomad: path to the gnomad VCF or other germline resource
     
 Optional input:
+    * gnomad: path to the gnomad VCF or other germline resource (recommended). If not provided, the contamination will
+    not be estimated and the filter of common germline variants will be disabled
     * intervals: path to a BED file containing the regions to analyse
-    * output: the folder where to publish output
+    * output: the folder where to publish output (default: output)
     * enable_bam_output: outputs a new BAM file with the Mutect2 reassembly of reads (default: false)
     * disable_common_germline_filter: disable the use of GnomAD to filter out common variants in the population
-    from the somatic calls. The GnomAD resource is still required though as this common SNPs are used elsewhere to
+    from the somatic calls. The GnomAD resource can still be provided, though, as these common SNPs are used elsewhere to
     calculate the contamination (default: false)
     * funcotator: To use Funcotator, supply the path to a database to be used. (can be downloaded from GATK FTP server)
     * reference_version_funcotator: version of the reference genome (default: "hg19")
@@ -70,7 +71,7 @@ Optional input:
     * memory_contamination: the amount of memory used by contamination (default: 16g)
     * memory_filter: the amount of memory used by filter (default: 16g)
     * memory_funcotator: the amount of memory used by funcotator (default: 16g)
-    * args_filter: optional arguments to the FilterMutectCalls function of GATK (e.g.: "--min-allele-fraction 0.05 --min-reads-per-strand 1 --unique-alt-read-count 4") (see FilterMutectCalls documentation)
+    * args_filter: optional arguments to the FilterMutectCalls function of GATK (e.g.: "--contamination-estimate 0.4 --min-allele-fraction 0.05 --min-reads-per-strand 1 --unique-alt-read-count 4") (see FilterMutectCalls documentation)
     * args_funcotator: optional arguments to Funcotator (e.g. "--remove-filtered-variants true")  (see Funcotator documentation)
     * args_mutect2: optional arguments to Mutect2 (e.g. "--sites-only-vcf-output")  (see Mutect2 documentation)
 

diff --git a/main.nf b/main.nf
@@ -12,21 +12,8 @@ include { FUNCOTATOR } from './modules/06_annotate'
 params.help= false
 params.input_files = false
 params.reference = false
-params.intervals = false
 params.gnomad = false
 params.output = 'output'
-params.pon = false
-params.memory_mutect2 = "16g"
-params.cpus_mutect2 = 2
-params.memory_read_orientation = "16g"
-params.cpus_read_orientation = 2
-params.memory_pileup = "32g"
-params.cpus_pileup = 2
-params.memory_contamination = "16g"
-params.cpus_contamination = 2
-params.memory_filter = "16g"
-params.cpus_filter = 2
-params.disable_common_germline_filter = false
 params.funcotator = false
 
 def helpMessage() {
@@ -41,10 +28,6 @@ if (!params.reference) {
     log.error "--reference is required"
     exit 1
 }
-if (!params.gnomad) {
-    log.error "--gnomad is required"
-    exit 1
-}
 
 // checks required inputs
 if (params.input_files) {
@@ -58,17 +41,25 @@ if (params.input_files) {
 }
 
 workflow {
+
     MUTECT2(input_files)
-    PILEUP_SUMMARIES(input_files)
     LEARN_READ_ORIENTATION_MODEL(MUTECT2.out.f1r2_stats)
-    CALCULATE_CONTAMINATION(PILEUP_SUMMARIES.out.pileupsummaries)
-    FILTER_CALLS(
-        CALCULATE_CONTAMINATION.out.contaminationTables.join(
-            LEARN_READ_ORIENTATION_MODEL.out.read_orientation_model).join(MUTECT2.out.unfiltered_vcfs))
+
+    if (params.gnomad) {
+        PILEUP_SUMMARIES(input_files)
+        CALCULATE_CONTAMINATION(PILEUP_SUMMARIES.out.pileupsummaries)
+        FILTER_CALLS(
+            CALCULATE_CONTAMINATION.out.contaminationTables.join(
+                LEARN_READ_ORIENTATION_MODEL.out.read_orientation_model).join(MUTECT2.out.unfiltered_vcfs))
+    }
+    else {
+        FILTER_CALLS(
+            input_files.map{ row-> tuple(row[0], file("dummy"), file("dummy2")) }.join(
+                LEARN_READ_ORIENTATION_MODEL.out.read_orientation_model).join(MUTECT2.out.unfiltered_vcfs))
+    }
 
     FILTER_CALLS.out.final_vcfs.map {it.join("\t")}.collectFile(name: "${params.output}/mutect2_output_files.txt", newLine: true)
     if(params.funcotator){
         FUNCOTATOR(FILTER_CALLS.out.anno_input)
     }
-
 }
diff --git a/modules/01_mutect2.nf b/modules/01_mutect2.nf
@@ -21,13 +21,13 @@ process MUTECT2 {
     tuple val(name), val(tumor_bam), val(normal_bam)
 
     output:
-    tuple val("${name}"), file("${name}.mutect2.unfiltered.vcf"), file("${name}.mutect2.unfiltered.vcf.stats"), emit: unfiltered_vcfs
-    tuple val("${name}"), file("${name}.f1r2.tar.gz"), emit: f1r2_stats
-    tuple file("${name}.mutect2.assembled_haplotypes.bam"), file("${name}.mutect2.assembled_haplotypes.bai"), optional: true
+    tuple val("${name}"), path("${name}.mutect2.unfiltered.vcf"), path("${name}.mutect2.unfiltered.vcf.stats"), emit: unfiltered_vcfs
+    tuple val("${name}"), path("${name}.f1r2.tar.gz"), emit: f1r2_stats
+    tuple path("${name}.mutect2.assembled_haplotypes.bam"), path("${name}.mutect2.assembled_haplotypes.bai"), optional: true
 
     script:
     normal_panel_option = params.pon ? "--panel-of-normals ${params.pon}" : ""
-    germline_filter = params.disable_common_germline_filter ? "" : "--germline-resource ${params.gnomad}"
+    germline_filter = params.disable_common_germline_filter || ! params.gnomad ? "" : "--germline-resource ${params.gnomad}"
     normal_inputs = normal_bam.split(",").collect({v -> "--input $v"}).join(" ")
     tumor_inputs = tumor_bam.split(",").collect({v -> "--input $v"}).join(" ")
     normalRGSMs = normal_bam.split(",").collect({v -> "\$(samtools view -H $v | grep -oP '(?<=SM:)[^ |\\t]*' | head -1)"})

diff --git a/modules/02_learn_read_orientation.nf b/modules/02_learn_read_orientation.nf
@@ -11,10 +11,10 @@ process LEARN_READ_ORIENTATION_MODEL {
   conda (params.enable_conda ? "bioconda::gatk4=4.2.6.1" : null)
 
   input:
-  tuple val(name), file(f1r2_stats)
+  tuple val(name), path(f1r2_stats)
 
   output:
-  tuple val(name), file("${name}.read-orientation-model.tar.gz"), emit: read_orientation_model
+  tuple val(name), path("${name}.read-orientation-model.tar.gz"), emit: read_orientation_model
 
   """
   gatk --java-options '-Xmx${params.memory_read_orientation}' LearnReadOrientationModel \

diff --git a/modules/03_pileup_summary.nf b/modules/03_pileup_summary.nf
@@ -15,7 +15,7 @@ process PILEUP_SUMMARIES {
     tuple val(name), val(tumor_bam), val(normal_bam)
 
     output:
-    tuple val("${name}"), file("${name}.pileupsummaries.table"), emit: pileupsummaries
+    tuple val("${name}"), path("${name}.pileupsummaries.table"), emit: pileupsummaries
 
     script:
     tumor_inputs = tumor_bam.split(",").collect({v -> "--input $v"}).join(" ")

diff --git a/modules/04_calculate_contamination.nf b/modules/04_calculate_contamination.nf
@@ -11,10 +11,10 @@ process CALCULATE_CONTAMINATION {
     conda (params.enable_conda ? "bioconda::gatk4=4.2.6.1" : null)
 
     input:
-    tuple val(name), file(table)
+    tuple val(name), path(table)
 
     output:
-    tuple val(name), file("${name}.segments.table"), file("${name}.calculatecontamination.table"), emit: contaminationTables
+    tuple val(name), path("${name}.segments.table"), path("${name}.calculatecontamination.table"), emit: contaminationTables
 
     """
     gatk --java-options '-Xmx${params.memory_contamination}' CalculateContamination \

diff --git a/modules/05_filter_calls.nf b/modules/05_filter_calls.nf
@@ -13,19 +13,22 @@ process FILTER_CALLS {
     conda (params.enable_conda ? "bioconda::gatk4=4.2.6.1" : null)
 
     input:
-    tuple val(name), file(segments_table), file(contamination_table), file(model), file(unfiltered_vcf), file(vcf_stats)
+    tuple val(name), path(segments_table), path(contamination_table), path(model), path(unfiltered_vcf), path(vcf_stats)
 
     output:
     tuple val(name), val("${params.output}/${name}/${name}.mutect2.vcf"), emit: final_vcfs
-    tuple val(name), file("${name}.mutect2.vcf"), emit: anno_input
-    file "${name}.mutect2.vcf"
+    tuple val(name), path("${name}.mutect2.vcf"), emit: anno_input
+    path "${name}.mutect2.vcf"
 
+    script:
+    segments_table_param = segments_table.exists() ? "--tumor-segmentation ${segments_table}" : ""
+    contamination_table_param = contamination_table.exists() ? "--contamination-table ${contamination_table}" : ""
     """
     gatk --java-options '-Xmx${params.memory_filter}' FilterMutectCalls \
     -V ${unfiltered_vcf} \
     --reference ${params.reference} \
-    --tumor-segmentation ${segments_table} \
-    --contamination-table ${contamination_table} \
+    ${segments_table_param} \
+    ${contamination_table_param} \
     --ob-priors ${model} \
     --output ${name}.mutect2.vcf ${params.args_filter}
     """

diff --git a/modules/06_annotate.nf b/modules/06_annotate.nf
@@ -16,11 +16,11 @@ process FUNCOTATOR {
     conda (params.enable_conda ? "bioconda::gatk4=4.2.6.1" : null)
 
     input:
-    tuple val(name), file(vcf)
+    tuple val(name), path(vcf)
 
     output:
     tuple val(name), val("${params.output}/${name}/${name}.mutect2.funcotated.vcf"), emit: vcf_anno
-    file "${name}.mutect2.funcotated.maf"
+    path "${name}.mutect2.funcotated.maf"
 
     """
     gatk --java-options '-Xmx${params.memory_funcotator}' Funcotator \

diff --git a/nextflow.config b/nextflow.config
@@ -37,7 +37,7 @@ env {
 // Capture exit codes from upstream processes when piping
 process.shell = ['/bin/bash', '-euo', 'pipefail']
 
-VERSION = '1.6.0'
+VERSION = '1.7.0'
 DOI = 'https://zenodo.org/badge/latestdoi/355860788'
 
 manifest {
@@ -66,9 +66,10 @@ Input:
     name1	tumor_bam1	normal_bam1
     name2	tumor_bam2	normal_bam2
     * reference: path to the FASTA genome reference (indexes expected *.fai, *.dict)
-    * gnomad: path to the gnomad VCF or other germline resource
 
 Optional input:
+    * gnomad: path to the gnomad VCF or other germline resource (recommended). If not provided the contamination will
+    not be estimated and the filter of common germline variants will be disabled
     * intervals: path to a BED file containing the regions to analyse
     * output: the folder where to publish output
     * enable_bam_output: outputs a new BAM file with the Mutect2 reassembly of reads (default: false)
@@ -85,7 +86,7 @@ Optional input:
     * memory_contamination: the ammount of memory used by contamination (default: 16g)
     * memory_filter: the ammount of memory used by filter (default: 16g)
     * memory_funcotator: the ammount of memory used by filter (default: 16g)
-    * args_filter: optional arguments to the FilterMutectCalls function of GATK (e.g.: "--min-allele-fraction 0.05 --min-reads-per-strand 1 --unique-alt-read-count 4") (see FilterMutectCalls documentation)
+    * args_filter: optional arguments to the FilterMutectCalls function of GATK (e.g.: "--contamination-estimate 0.4 --min-allele-fraction 0.05 --min-reads-per-strand 1 --unique-alt-read-count 4") (see FilterMutectCalls documentation)
     * args_funcotator: optional arguments to Funcotator (e.g. "--remove-filtered-variants true")  (see Funcotator documentation)
     * args_mutect2: optional arguments to Mutect2 (e.g. "--sites-only-vcf-output")  (see Mutect2 documentation)
 

diff --git a/tests/test_09.sh b/tests/test_09.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+
+source bin/assert.sh
+output=output/test9
+
+echo -e "sample_name\t"`pwd`"/test_data/SRR8244887.preprocessed.downsampled.bam\t"`pwd`"/test_data/SRR8244836.preprocessed.downsampled.bam" > test_data/test_input.txt
+nextflow main.nf -profile test,conda --output $output --input_files test_data/test_input.txt --gnomad false
+
+test -s $output/sample_name/sample_name.mutect2.vcf || { echo "Missing output VCF file!"; exit 1; }