diff --git a/pipeline/call-gSV.nf b/pipeline/call-gSV.nf
index 4f833a74..f1668f51 100644
--- a/pipeline/call-gSV.nf
+++ b/pipeline/call-gSV.nf
@@ -51,7 +51,7 @@ Starting workflow...
     .stripIndent()
 
 include { run_validate } from './modules/validation'
-include { call_gSV_Delly; call_gCNV_Delly } from './modules/delly'
+include { call_gSV_Delly; call_gCNV_Delly; regenotype_gSV_Delly; regenotype_gCNV_Delly } from './modules/delly'
 include { call_gSV_Manta } from './modules/manta'
 include { convert_BCF2VCF_BCFtools as convert_gSV_BCF2VCF_BCFtools; convert_BCF2VCF_BCFtools as convert_gCNV_BCF2VCF_BCFtools } from './modules/bcftools'
 include { run_vcfstats_RTGTools as run_gSV_vcfstats_RTGTools; run_vcfstats_RTGTools as run_gCNV_vcfstats_RTGTools } from './modules/rtgtools'
@@ -65,7 +65,8 @@ input_bam_ch = Channel
                 row.patient,
                 row.sample,
                 row.input_bam,
-                "${row.input_bam}.bai"
+                "${row.input_bam}.bai",
+                row.mode
                 )
         }
 
@@ -91,34 +92,46 @@ else {
     reference_fasta_index = "${params.reference_fasta}.fai"
 }
 
-// Create channel for validation
 validation_channel = Channel
     .fromPath(params.input_csv, checkIfExists:true)
     .splitCsv(header:true)
-    .map{ row -> [
-                row.input_bam,
-                params.reference_fasta
+    .map{ row -> [
+                'file-input',
+                row.input_bam
                 ]
         }
-    .flatten()
+
+Channel
+    .of(['file-fasta', params.reference_fasta])
+    .mix(validation_channel)
+    .set { validation_channel }
 
 workflow {
+    // Inputs are validated exactly once here: a DSL2 process cannot be invoked twice in one workflow without an include alias.
     run_validate(validation_channel)
-    if (params.run_manta) {
-        call_gSV_Manta(input_bam_ch, params.reference_fasta, reference_fasta_index)
-        run_sha512sum_Manta(call_gSV_Manta.out.vcf_small_indel_sv_file.mix(call_gSV_Manta.out.vcf_diploid_sv_file, call_gSV_Manta.out.vcf_candidate_sv_file))
-        }
-    if (params.run_delly) {
-        call_gSV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.exclusion_file)
-        call_gCNV_Delly(input_bam_ch, call_gSV_Delly.out.bcf_sv_file, params.reference_fasta, reference_fasta_index, params.mappability_map)
-        convert_gSV_BCF2VCF_BCFtools(call_gSV_Delly.out.bcf_sv_file, call_gSV_Delly.out.bam_sample_name, 'SV')
-        convert_gCNV_BCF2VCF_BCFtools(call_gCNV_Delly.out.bcf_cnv_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
-        if (params.run_qc) {
-            run_gSV_vcfstats_RTGTools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
-            run_gSV_vcf_validator_VCFtools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
-            run_gCNV_vcfstats_RTGTools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
-            run_gCNV_vcf_validator_VCFtools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
+    if (params.run_discovery) {
+        if (params.run_manta) {
+            call_gSV_Manta(input_bam_ch, params.reference_fasta, reference_fasta_index)
+            run_sha512sum_Manta(call_gSV_Manta.out.vcf_small_indel_sv_file.mix(call_gSV_Manta.out.vcf_diploid_sv_file, call_gSV_Manta.out.vcf_candidate_sv_file))
+            }
+        if (params.run_delly) {
+            call_gSV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.exclusion_file)
+            call_gCNV_Delly(input_bam_ch, call_gSV_Delly.out.bcf_sv_file, params.reference_fasta, reference_fasta_index, params.mappability_map)
+            convert_gSV_BCF2VCF_BCFtools(call_gSV_Delly.out.bcf_sv_file, call_gSV_Delly.out.bam_sample_name, 'SV')
+            convert_gCNV_BCF2VCF_BCFtools(call_gCNV_Delly.out.bcf_cnv_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
+            if (params.run_qc) {
+                run_gSV_vcfstats_RTGTools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
+                run_gSV_vcf_validator_VCFtools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
+                run_gCNV_vcfstats_RTGTools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
+                run_gCNV_vcf_validator_VCFtools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
+                }
+            run_sha512sum_Delly(call_gSV_Delly.out.bcf_sv_file.mix(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bcf_cnv_file, convert_gCNV_BCF2VCF_BCFtools.out.vcf_file))
             }
-        run_sha512sum_Delly(call_gSV_Delly.out.bcf_sv_file.mix(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bcf_cnv_file, convert_gCNV_BCF2VCF_BCFtools.out.vcf_file))
         }
+    // When 'run_regenotyping' is set to true, the mode specified for each sample in the input_csv determines which regenotyping
+    // process runs: a mode of exactly 'SV' runs regenotype_gSV_Delly and a mode of exactly 'CNV' runs regenotype_gCNV_Delly.
+    if (params.run_regenotyping) {
+        regenotype_gSV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.exclusion_file, params.merged_sites)
+        regenotype_gCNV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.mappability_map, params.merged_sites)
+        }
 }
diff --git a/pipeline/config/nextflow.config b/pipeline/config/nextflow.config
index f9e1cc4c..8f91ebf6 100644
--- a/pipeline/config/nextflow.config
+++ b/pipeline/config/nextflow.config
@@ -16,6 +16,12 @@ params {
     blcds_registered_dataset = false
     sge_scheduler = false
 
+    // The mode of the call-gSV pipeline to run. Discovery will identify SVs/CNVs using Delly/Manta. Regenotyping will regenotype SVs/CNVs using Delly at the sites given in `merged_sites`.
+    // `run_delly`, `run_manta` and `run_qc` below only apply to discovery; regenotyping is controlled by `run_regenotyping` and the `mode` column (SV or CNV) of `input_csv`.
+    run_discovery = true
+    run_regenotyping = false
+    merged_sites = '/path/to/sites.bcf'
+
     input_csv = 'path/to/input.csv'
     reference_fasta = '/path/to/genome.fa'
     // If an index is not explicitly specified for the reference FASTA, the pipeline will assume one exists in the same directory as the reference FASTA
diff --git a/pipeline/inputs/call-gSV-inputs.csv b/pipeline/inputs/call-gSV-inputs.csv
index f7df4b57..c4b828e0 100644
--- a/pipeline/inputs/call-gSV-inputs.csv
+++ b/pipeline/inputs/call-gSV-inputs.csv
@@ -1,2 +1,2 @@
-patient,sample,input_bam
-patient-name,sample-name,/path/to/sample.bam
\ No newline at end of file
+patient,sample,input_bam,mode
+patient-name,sample-name,/path/to/sample.bam,mode
\ No newline at end of file
diff --git a/pipeline/modules/delly.nf b/pipeline/modules/delly.nf
index 8a4627bb..cbb24827 100644
--- a/pipeline/modules/delly.nf
+++ b/pipeline/modules/delly.nf
@@ -25,17 +25,17 @@ process call_gSV_Delly {
         saveAs: { "call_gSV_Delly/${bam_sample_name}.log${file(it).getName()}" }
 
     input:
-    tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai)
-    path(reference_fasta)
-    path(reference_fasta_fai)
-    path(exclusion_file)
+        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
+        path(reference_fasta)
+        path(reference_fasta_fai)
+        path(exclusion_file)
 
 
     output:
-    path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_sv_file
-    path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
-    path ".command.*"
-    val bam_sample_name, emit: bam_sample_name
+        path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_sv_file
+        path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
+        path ".command.*"
+        val bam_sample_name, emit: bam_sample_name
 
     """
     set -euo pipefail
@@ -49,6 +49,49 @@ process call_gSV_Delly {
     """
 }
 
+process regenotype_gSV_Delly {
+    container docker_image_delly
+
+    publishDir params.output_dir,
+        enabled: params.save_intermediate_files,
+        pattern: "*.bcf*",
+        mode: "copy",
+        saveAs: { "Delly-${params.delly_version}/${file(it).getName()}" }
+
+    publishDir params.output_log_dir,
+        pattern: ".command.*",
+        mode: "copy",
+        saveAs: { "regenotype_gSV_Delly/${bam_sample_name}.log${file(it).getName()}" }
+
+    input:
+        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
+        path(reference_fasta)
+        path(reference_fasta_fai)
+        path(exclusion_file_or_mappability_map)
+        path(sites)
+
+    output:
+        path "DELLY-${params.delly_version}_RGSV_${params.dataset_id}_${bam_sample_name}.bcf", emit: regenotyped_sv_bcf
+        path "DELLY-${params.delly_version}_RGSV_${params.dataset_id}_${bam_sample_name}.bcf.csi", emit: regenotyped_sv_bcf_csi
+        path ".command.*"
+
+    when:
+        mode == 'SV'
+
+    script:
+    """
+    set -euo pipefail
+    delly \
+        call \
+        --vcffile $sites \
+        --exclude $exclusion_file_or_mappability_map \
+        --genome $reference_fasta \
+        --outfile "DELLY-${params.delly_version}_RGSV_${params.dataset_id}_${bam_sample_name}.bcf" \
+        --map-qual ${params.map_qual} \
+        "$input_bam"
+    """
+}
+
 process call_gCNV_Delly {
     container docker_image_delly
 
@@ -64,17 +107,17 @@ process call_gCNV_Delly {
         saveAs: { "call_gCNV_Delly/${bam_sample_name}.log${file(it).getName()}" }
 
     input:
-    tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai)
-    path(delly_sv_file)
-    path(reference_fasta)
-    path(reference_fasta_fai)
-    path(mappability_file)
+        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
+        path(delly_sv_file)
+        path(reference_fasta)
+        path(reference_fasta_fai)
+        path(mappability_file)
 
     output:
-    path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_cnv_file
-    path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
-    path ".command.*"
-    val bam_sample_name, emit: bam_sample_name
+        path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_cnv_file
+        path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
+        path ".command.*"
+        val bam_sample_name, emit: bam_sample_name
 
     """
     set -euo pipefail
@@ -87,3 +130,46 @@ process call_gCNV_Delly {
         $input_bam
     """
 }
+
+process regenotype_gCNV_Delly {
+    container docker_image_delly
+
+    publishDir params.output_dir,
+        enabled: params.save_intermediate_files,
+        pattern: "*.bcf*",
+        mode: "copy",
+        saveAs: { "Delly-${params.delly_version}/${file(it).getName()}" }
+
+    publishDir params.output_log_dir,
+        pattern: ".command.*",
+        mode: "copy",
+        saveAs: { "regenotype_gCNV_Delly/${bam_sample_name}.log${file(it).getName()}" }
+
+    input:
+        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
+        path(reference_fasta)
+        path(reference_fasta_fai)
+        path(exclusion_file_or_mappability_map)
+        path(sites)
+
+    output:
+        path "DELLY-${params.delly_version}_RGCNV_${params.dataset_id}_${bam_sample_name}.bcf", emit: regenotyped_cnv_bcf
+        path "DELLY-${params.delly_version}_RGCNV_${params.dataset_id}_${bam_sample_name}.bcf.csi", emit: regenotyped_cnv_bcf_csi
+        path ".command.*"
+
+    when:
+        mode == 'CNV'
+
+    script:
+    """
+    set -euo pipefail
+    delly \
+        cnv \
+        --segmentation \
+        --vcffile $sites \
+        --mappability $exclusion_file_or_mappability_map \
+        --genome $reference_fasta \
+        --outfile "DELLY-${params.delly_version}_RGCNV_${params.dataset_id}_${bam_sample_name}.bcf" \
+        "$input_bam"
+    """
+}
\ No newline at end of file
diff --git a/pipeline/modules/manta.nf b/pipeline/modules/manta.nf
index 3ac8f333..b654d230 100644
--- a/pipeline/modules/manta.nf
+++ b/pipeline/modules/manta.nf
@@ -24,7 +24,7 @@ process call_gSV_Manta {
         saveAs: { "call_gSV_Manta/${bam_sample_name}.log${file(it).getName()}" }
 
     input:
-        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai)
+        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
         path(reference_fasta)
         path(reference_fasta_fai)
 
diff --git a/pipeline/modules/validation.nf b/pipeline/modules/validation.nf
index 3319399c..b25dfe11 100644
--- a/pipeline/modules/validation.nf
+++ b/pipeline/modules/validation.nf
@@ -19,13 +19,14 @@ process run_validate {
         saveAs: { "run_validate/${file_to_validate}.log${file(it).getName()}" }
 
     input:
-    path(file_to_validate)
+    tuple val(mode), path(file_to_validate)
 
     output:
+    path("${file_to_validate}.temp")
     path ".command.*"
 
     """
     set -euo pipefail
-    python -m validate -t file-input ${file_to_validate}
+    python -m validate -t ${mode} ${file_to_validate} > "${file_to_validate}.temp"
     """
 }