Skip to content

Commit

Permalink
Merge pull request #30 from uclahs-cds/ts-regenotype
Browse files Browse the repository at this point in the history
call-gSV Regenotyping
  • Loading branch information
timothyjsanders authored Jun 30, 2021
2 parents 8358223 + 269cd0b commit 82e2cee
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 45 deletions.
59 changes: 36 additions & 23 deletions pipeline/call-gSV.nf
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Starting workflow...
.stripIndent()

include { run_validate } from './modules/validation'
include { call_gSV_Delly; call_gCNV_Delly } from './modules/delly'
include { call_gSV_Delly; call_gCNV_Delly; regenotype_gSV_Delly; regenotype_gCNV_Delly } from './modules/delly'
include { call_gSV_Manta } from './modules/manta'
include { convert_BCF2VCF_BCFtools as convert_gSV_BCF2VCF_BCFtools; convert_BCF2VCF_BCFtools as convert_gCNV_BCF2VCF_BCFtools } from './modules/bcftools'
include { run_vcfstats_RTGTools as run_gSV_vcfstats_RTGTools; run_vcfstats_RTGTools as run_gCNV_vcfstats_RTGTools } from './modules/rtgtools'
Expand All @@ -65,7 +65,8 @@ input_bam_ch = Channel
row.patient,
row.sample,
row.input_bam,
"${row.input_bam}.bai"
"${row.input_bam}.bai",
row.mode
)
}

Expand All @@ -91,34 +92,46 @@ else {
reference_fasta_index = "${params.reference_fasta}.fai"
}

// Create channel for validation
validation_channel = Channel
.fromPath(params.input_csv, checkIfExists:true)
.splitCsv(header:true)
.map{ row -> [
row.input_bam,
params.reference_fasta
.map{ row -> [
'file-input',
row.input_bam
]
}
.flatten()

Channel
.of(['file-fasta', params.reference_fasta])
.mix(validation_channel)
.set { validation_channel }

workflow {
run_validate(validation_channel)
if (params.run_manta) {
call_gSV_Manta(input_bam_ch, params.reference_fasta, reference_fasta_index)
run_sha512sum_Manta(call_gSV_Manta.out.vcf_small_indel_sv_file.mix(call_gSV_Manta.out.vcf_diploid_sv_file, call_gSV_Manta.out.vcf_candidate_sv_file))
}
if (params.run_delly) {
call_gSV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.exclusion_file)
call_gCNV_Delly(input_bam_ch, call_gSV_Delly.out.bcf_sv_file, params.reference_fasta, reference_fasta_index, params.mappability_map)
convert_gSV_BCF2VCF_BCFtools(call_gSV_Delly.out.bcf_sv_file, call_gSV_Delly.out.bam_sample_name, 'SV')
convert_gCNV_BCF2VCF_BCFtools(call_gCNV_Delly.out.bcf_cnv_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
if (params.run_qc) {
run_gSV_vcfstats_RTGTools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
run_gSV_vcf_validator_VCFtools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
run_gCNV_vcfstats_RTGTools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
run_gCNV_vcf_validator_VCFtools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
if (params.run_discovery) {
run_validate(validation_channel)
if (params.run_manta) {
call_gSV_Manta(input_bam_ch, params.reference_fasta, reference_fasta_index)
run_sha512sum_Manta(call_gSV_Manta.out.vcf_small_indel_sv_file.mix(call_gSV_Manta.out.vcf_diploid_sv_file, call_gSV_Manta.out.vcf_candidate_sv_file))
}
if (params.run_delly) {
call_gSV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.exclusion_file)
call_gCNV_Delly(input_bam_ch, call_gSV_Delly.out.bcf_sv_file, params.reference_fasta, reference_fasta_index, params.mappability_map)
convert_gSV_BCF2VCF_BCFtools(call_gSV_Delly.out.bcf_sv_file, call_gSV_Delly.out.bam_sample_name, 'SV')
convert_gCNV_BCF2VCF_BCFtools(call_gCNV_Delly.out.bcf_cnv_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
if (params.run_qc) {
run_gSV_vcfstats_RTGTools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
run_gSV_vcf_validator_VCFtools(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gSV_Delly.out.bam_sample_name, 'SV')
run_gCNV_vcfstats_RTGTools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
run_gCNV_vcf_validator_VCFtools(convert_gCNV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bam_sample_name, 'CNV')
}
run_sha512sum_Delly(call_gSV_Delly.out.bcf_sv_file.mix(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bcf_cnv_file, convert_gCNV_BCF2VCF_BCFtools.out.vcf_file))
}
run_sha512sum_Delly(call_gSV_Delly.out.bcf_sv_file.mix(convert_gSV_BCF2VCF_BCFtools.out.vcf_file, call_gCNV_Delly.out.bcf_cnv_file, convert_gCNV_BCF2VCF_BCFtools.out.vcf_file))
}
// When 'run_regenotyping' is set to true, the mode specified in the input_csv determines which regenotyping
// process runs for each sample: regenotype_gSV_Delly runs for rows whose mode is exactly 'SV', and
// regenotype_gCNV_Delly runs for rows whose mode is exactly 'CNV'.
if (params.run_regenotyping) {
    // The discovery branch already validates the same inputs, so only validate here when discovery is off.
    // This avoids invoking the run_validate process twice in the same workflow.
    if (!params.run_discovery) {
        run_validate(validation_channel)
    }
    // SV regenotyping (`delly call`) uses the exclusion regions file.
    regenotype_gSV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.exclusion_file, params.merged_sites)
    // CNV regenotyping (`delly cnv`) needs the mappability map, the same file call_gCNV_Delly receives.
    regenotype_gCNV_Delly(input_bam_ch, params.reference_fasta, reference_fasta_index, params.mappability_map, params.merged_sites)
}
}
6 changes: 6 additions & 0 deletions pipeline/config/nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ params {
blcds_registered_dataset = false
sge_scheduler = false

// The mode of the call-gSV pipeline to run. Discovery identifies SVs/CNVs using Delly/Manta. Regenotyping regenotypes SVs/CNVs using Delly against the sites in `merged_sites`.
// Please note that `run_delly` and `run_manta` below only apply to discovery; regenotyping runs whenever `run_regenotyping` is true, regardless of `run_delly`.
run_discovery = true
run_regenotyping = false
merged_sites = '/path/to/sites.bcf'

input_csv = 'path/to/input.csv'
reference_fasta = '/path/to/genome.fa'
// If an index is not explicitly specified for the reference FASTA, the pipeline will assume one exists in the same directory as the reference FASTA
Expand Down
4 changes: 2 additions & 2 deletions pipeline/inputs/call-gSV-inputs.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
patient,sample,input_bam
patient-name,sample-name,/path/to/sample.bam
patient,sample,input_bam,mode
patient-name,sample-name,/path/to/sample.bam,mode
120 changes: 103 additions & 17 deletions pipeline/modules/delly.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@ process call_gSV_Delly {
saveAs: { "call_gSV_Delly/${bam_sample_name}.log${file(it).getName()}" }

input:
tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai)
path(reference_fasta)
path(reference_fasta_fai)
path(exclusion_file)
tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
path(reference_fasta)
path(reference_fasta_fai)
path(exclusion_file)


output:
path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_sv_file
path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
path ".command.*"
val bam_sample_name, emit: bam_sample_name
path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_sv_file
path "DELLY-${params.delly_version}_SV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
path ".command.*"
val bam_sample_name, emit: bam_sample_name

"""
set -euo pipefail
Expand All @@ -49,6 +49,49 @@ process call_gSV_Delly {
"""
}

// Regenotype structural variants for a single sample with `delly call`, restricted to the
// sites supplied in `sites`. The task only executes for input rows whose mode is 'SV'.
process regenotype_gSV_Delly {
    container docker_image_delly

    // BCF output (and its index) is published only when intermediate files are requested.
    publishDir params.output_dir,
        enabled: params.save_intermediate_files,
        pattern: "*.bcf*",
        mode: "copy",
        saveAs: { "Delly-${params.delly_version}/${file(it).getName()}" }

    // Nextflow's .command.* files are always published as per-sample logs.
    publishDir params.output_log_dir,
        pattern: ".command.*",
        mode: "copy",
        saveAs: { "regenotype_gSV_Delly/${bam_sample_name}.log${file(it).getName()}" }

    input:
        // One row of the input CSV; `mode` selects which regenotyping process handles the sample.
        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
        path(reference_fasta)
        path(reference_fasta_fai)
        // Passed to `--exclude` below.
        path(exclusion_file_or_mappability_map)
        // Passed to `--vcffile` below.
        path(sites)

    output:
        path "DELLY-${params.delly_version}_RGSV_${params.dataset_id}_${bam_sample_name}.bcf", emit: regenotyped_sv_bcf
        path "DELLY-${params.delly_version}_RGSV_${params.dataset_id}_${bam_sample_name}.bcf.csi", emit: regenotyped_sv_bcf_csi
        path ".command.*"

    when:
        mode == 'SV'

    script:
        // Same file name as declared in the output section.
        def regenotyped_bcf = "DELLY-${params.delly_version}_RGSV_${params.dataset_id}_${bam_sample_name}.bcf"
        """
        set -euo pipefail
        delly \
            call \
            --vcffile $sites \
            --exclude $exclusion_file_or_mappability_map \
            --genome $reference_fasta \
            --outfile "${regenotyped_bcf}" \
            --map-qual ${params.map_qual} \
            "$input_bam"
        """
}

process call_gCNV_Delly {
container docker_image_delly

Expand All @@ -64,17 +107,17 @@ process call_gCNV_Delly {
saveAs: { "call_gCNV_Delly/${bam_sample_name}.log${file(it).getName()}" }

input:
tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai)
path(delly_sv_file)
path(reference_fasta)
path(reference_fasta_fai)
path(mappability_file)
tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
path(delly_sv_file)
path(reference_fasta)
path(reference_fasta_fai)
path(mappability_file)

output:
path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_cnv_file
path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
path ".command.*"
val bam_sample_name, emit: bam_sample_name
path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf", emit: bcf_cnv_file
path "DELLY-${params.delly_version}_CNV_${params.dataset_id}_${bam_sample_name}.bcf.csi"
path ".command.*"
val bam_sample_name, emit: bam_sample_name

"""
set -euo pipefail
Expand All @@ -87,3 +130,46 @@ process call_gCNV_Delly {
$input_bam
"""
}

// Regenotype copy-number variants for a single sample with `delly cnv`, restricted to the
// sites supplied in `sites`. The task only executes for input rows whose mode is 'CNV'.
process regenotype_gCNV_Delly {
    container docker_image_delly

    // BCF output (and its index) is published only when intermediate files are requested.
    publishDir params.output_dir,
        enabled: params.save_intermediate_files,
        pattern: "*.bcf*",
        mode: "copy",
        saveAs: { "Delly-${params.delly_version}/${file(it).getName()}" }

    // Nextflow's .command.* files are always published as per-sample logs.
    publishDir params.output_log_dir,
        pattern: ".command.*",
        mode: "copy",
        saveAs: { "regenotype_gCNV_Delly/${bam_sample_name}.log${file(it).getName()}" }

    input:
        // One row of the input CSV; `mode` selects which regenotyping process handles the sample.
        tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
        path(reference_fasta)
        path(reference_fasta_fai)
        // For CNV regenotyping this input is passed to `--mappability`, so the caller
        // must supply a mappability map here (not an exclusion regions file).
        path(exclusion_file_or_mappability_map)
        // Passed to `--vcffile` below.
        path(sites)

    output:
        path "DELLY-${params.delly_version}_RGCNV_${params.dataset_id}_${bam_sample_name}.bcf", emit: regenotyped_cnv_bcf
        path "DELLY-${params.delly_version}_RGCNV_${params.dataset_id}_${bam_sample_name}.bcf.csi", emit: regenotyped_cnv_bcf_csi
        path ".command.*"

    when:
        mode == 'CNV'

    script:
        // `delly cnv` has no `--exclude` option; the map file is given via `--mappability`.
        """
        set -euo pipefail
        delly \
            cnv \
            --segmentation \
            --vcffile $sites \
            --mappability $exclusion_file_or_mappability_map \
            --genome $reference_fasta \
            --outfile "DELLY-${params.delly_version}_RGCNV_${params.dataset_id}_${bam_sample_name}.bcf" \
            "$input_bam"
        """
}
2 changes: 1 addition & 1 deletion pipeline/modules/manta.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ process call_gSV_Manta {
saveAs: { "call_gSV_Manta/${bam_sample_name}.log${file(it).getName()}" }

input:
tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai)
tuple val(patient), val(bam_sample_name), path(input_bam), path(input_bam_bai), val(mode)
path(reference_fasta)
path(reference_fasta_fai)

Expand Down
5 changes: 3 additions & 2 deletions pipeline/modules/validation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ process run_validate {
saveAs: { "run_validate/${file_to_validate}.log${file(it).getName()}" }

input:
path(file_to_validate)
tuple val(mode), path(file_to_validate)

output:
path("${file_to_validate}.temp")
path ".command.*"

"""
set -euo pipefail
python -m validate -t file-input ${file_to_validate}
python -m validate -t ${mode} ${file_to_validate} > "${file_to_validate}.temp"
"""
}

0 comments on commit 82e2cee

Please sign in to comment.