diff --git a/.travis.yml b/.travis.yml
index 8843cb8a..1e1b815c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -14,7 +14,7 @@ before_install:
   - chmod 777 nextflow
   # to change the test-data for travis, please download using the following command, extract, make changes, tarball again with gzip, and upload to google drive.
   # you will have to change the link below as well. Click to share the link, making it so anyone with the link can access, then extract the id in the link and put it here after "id="
-  - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1xcDnXk468SLpzr01Lw3CcJgeirYZPQXO'
+  - wget -O test-data.tar.gz --no-check-certificate 'https://docs.google.com/uc?export=download&confirm=no_antivirus&id=1lmuRJ1YO0DwBgtzG1kuKzTvGSijgEssc'
   - tar -xzvf test-data.tar.gz

 script:
diff --git a/conf/containers.config b/conf/containers.config
index 2200c20c..f477f253 100755
--- a/conf/containers.config
+++ b/conf/containers.config
@@ -21,6 +21,9 @@
   withName:"CreateScatteredIntervals.*" {
     container = "broadinstitute/gatk:4.1.0.0"
   }
+  withName:"CreateBaitsetFiles" {
+    container = "broadinstitute/gatk:4.1.9.0"
+  }

   //------------------- Somatic pipeline

diff --git a/conf/juno.config b/conf/juno.config
index 20754096..994ee7bd 100644
--- a/conf/juno.config
+++ b/conf/juno.config
@@ -28,7 +28,7 @@ params {
   mem_per_core = true
   reference_base = "/juno/work/taylorlab/cmopipeline"
   // targets_base = "/juno/work/ccs/resources/tempo/${params.genome}"
-  targets_base = "${reference_base}/mskcc-igenomes/${params.genome.toLowerCase()}/tempo_targets"
+  targets_base = "${reference_base}/mskcc-igenomes/${params.genome.toLowerCase()}/tempo_targets_dsl2"
   genome_base = params.genome == 'GRCh37' ? "${reference_base}/mskcc-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "${reference_base}/mskcc-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "${reference_base}/mskcc-igenomes/igenomes/smallGRCh37"
   minWallTime = 3.h
   medWallTime = 6.h
diff --git a/conf/references.config b/conf/references.config
index 265244d2..a546997f 100644
--- a/conf/references.config
+++ b/conf/references.config
@@ -13,12 +13,8 @@ params {
   targets {
     // If your files do not match this structure/naming, please create a folder and copy over the files or create symlinks.
     // If editing we recommend only changing the basename.
-    baitsInterval = "${params.targets_base}/\${targets_id}/baits.interval_list"
-    targetsInterval = "${params.targets_base}/\${targets_id}/targets.interval_list"
-    targetsBed = "${params.targets_base}/\${targets_id}/targets.bed"
-    targetsBedGz = "${params.targets_base}/\${targets_id}/targets.bed.gz"
-    targetsBedGzTbi = "${params.targets_base}/\${targets_id}/targets.bed.gz.tbi"
-    codingBed = "${params.targets_base}/\${targets_id}/coding.bed"
+    targetsBed = "${params.targets_base}/${params.assayType}/\${targets_id}/targets.bed"
+    baitsBed = "${params.targets_base}/${params.assayType}/\${targets_id}/baits.bed"
   }
   genomes {
     'smallGRCh37' {
@@ -58,6 +54,7 @@ params {
       hlaDat = "${params.reference_base}/hla/hla.dat"
       neoantigenCDNA = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
       neoantigenCDS = "${params.reference_base}/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz"
+      codingRegions = "${params.reference_base}/ensGene.all_CODING_exons.reference.bed"
     }
     'GRCh37' {
       acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci"
@@ -95,6 +92,7 @@ params {
       hlaDat = "${params.reference_base}/mskcc-igenomes/grch37/hla/hla.dat"
      neoantigenCDNA = "${params.reference_base}/mskcc-igenomes/grch37/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz"
       neoantigenCDS = "${params.reference_base}/mskcc-igenomes/grch37/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz"
+      codingRegions = "${params.reference_base}/mskcc-igenomes/grch37/coding_regions/ensGene.all_CODING_exons.reference.bed"
     }
     'GRCh38' {
       acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"
diff --git a/dsl2.nf b/dsl2.nf
index f8ae8781..ef96922e 100644
--- a/dsl2.nf
+++ b/dsl2.nf
@@ -40,6 +40,7 @@ include { snv_wf } from './modules/subworkflow/snv_wf'
 include { sampleQC_wf } from './modules/subworkflow/sampleQC_wf' addParams(referenceMap: referenceMap, targetsMap: targetsMap, multiqcWesConfig: multiqcWesConfig, multiqcWgsConfig: multiqcWgsConfig, multiqcTempoLogo: multiqcTempoLogo)
 include { samplePairingQC_wf } from './modules/subworkflow/samplePairingQC_wf' addParams(referenceMap: referenceMap, targetsMap: targetsMap)
 include { somaticMultiQC_wf } from './modules/subworkflow/somaticMultiQC_wf' addParams(multiqcWesConfig: multiqcWesConfig, multiqcWgsConfig: multiqcWgsConfig, multiqcTempoLogo: multiqcTempoLogo)
+include { targets_wf } from './modules/subworkflow/targets_wf' addParams(referenceMap: referenceMap, targetsMap: targetsMap)
 include { scatter_wf } from './modules/subworkflow/scatter_wf' addParams(referenceMap: referenceMap, targetsMap: targetsMap)
 include { germlineSNV_wf } from './modules/subworkflow/germlineSNV_wf' addParams(referenceMap: referenceMap, targetsMap: targetsMap)
 include { germlineSV_wf } from './modules/subworkflow/germlineSV_wf' addParams(referenceMap: referenceMap, targetsMap: targetsMap)
@@ -59,6 +60,7 @@ WFs = (!params.mapping && !params.bamMapping && aggregateParamIsFile) ? ['snv','
 workflow {
   //Set flags for when each pipeline is required to run.
   doWF_align = (params.mapping) ? true : false
+  doWF_targets = WFs.size() > 0
   doWF_manta = ['snv', 'sv', 'mutsig'].any(it -> it in WFs) ? true : false
   doWF_scatter = ['snv', 'sv', 'mutsig', 'germsnv'].any(it -> it in WFs) ? true : false
   doWF_germSNV = 'germsnv' in WFs ? true : false
@@ -164,9 +166,13 @@ workflow {
     manta_wf(bamFiles)
   }

+  if(doWF_targets){
+    targets_wf()
+  }
+
   if(doWF_scatter)
   {
-    scatter_wf()
+    scatter_wf(targets_wf.out.baitsetPlus5)
   }

   if(doWF_germSV)
@@ -181,7 +187,7 @@ workflow {

   if(doWF_germSNV)
   {
-    germlineSNV_wf(bams, bamsTumor, scatter_wf.out.mergedIList, facets_wf.out.facetsForMafAnno)
+    germlineSNV_wf(bams, bamsTumor, scatter_wf.out.mergedIList, facets_wf.out.facetsForMafAnno, targets_wf.out.baitsetPlus5)
   }

   if(doWF_SV)
@@ -196,12 +202,12 @@ workflow {

   if(doWF_SNV)
   {
-    snv_wf(bamFiles, scatter_wf.out.mergedIList, manta_wf.out.mantaToStrelka, loh_wf.out.hlaOutput, facets_wf.out.facetsForMafAnno)
+    snv_wf(bamFiles, scatter_wf.out.mergedIList, manta_wf.out.mantaToStrelka, loh_wf.out.hlaOutput, facets_wf.out.facetsForMafAnno, targets_wf.out.baitsetPlus5)
   }

   if(doWF_QC)
   {
-    sampleQC_wf(inputBam, fastPJson)
+    sampleQC_wf(inputBam, fastPJson, targets_wf.out.baitsetInterval, targets_wf.out.baitsetPlus5_unzipped, targets_wf.out.baitsetPlus5)
   }

   if(doWF_msiSensor)
@@ -222,9 +228,12 @@ workflow {
       .combine(mutSig_wf.out.mutSig4MetaDataParser, by: [0,1,2])
       .combine(loh_wf.out.hlaOutput, by: [1,2])
       .unique()
-      .map{ idNormal, target, idTumor, purityOut, mafFile, qcOutput, msifile, mutSig, placeHolder, polysolverFile ->
-        [idNormal, target, idTumor, purityOut, mafFile, qcOutput, msifile, mutSig, placeHolder, polysolverFile, targetsMap."$target".codingBed]
-      }.set{ mergedChannelMetaDataParser }
+      .combine(targets_wf.out.codingBaitsetBed)
+      .filter{ idNormal, target, idTumor, purityOut, mafFile, qcOutput, msifile, mutSig, placeHolder, polysolverFile, target2, codingBed ->
+        target == target2
+      }.map{ idNormal, target, idTumor, purityOut, mafFile, qcOutput, msifile, mutSig, placeHolder, polysolverFile, target2, codingBed ->
+        [ idNormal, target, idTumor, purityOut, mafFile, qcOutput, msifile, mutSig, placeHolder, polysolverFile, codingBed ]
+      }.set{ mergedChannelMetaDataParser }

     mdParse_wf(mergedChannelMetaDataParser)
   }
diff --git a/modules/function/define_maps.nf b/modules/function/define_maps.nf
index 95522d5b..74f68179 100644
--- a/modules/function/define_maps.nf
+++ b/modules/function/define_maps.nf
@@ -56,17 +56,16 @@ def defineReferenceMap() {
   result_array << ['neoantigenCDNA' : checkParamReturnFile("neoantigenCDNA")]
   result_array << ['neoantigenCDS' : checkParamReturnFile("neoantigenCDS")]
   // coding region BED files for calculating TMB
+  result_array << ['codingRegions' : checkParamReturnFile("codingRegions")]
   return result_array
 }

 def loadTargetReferences(){
   def result_array = [:]
-  new File(params.targets_base).eachDir{ i ->
+  new File("${params.targets_base}/${params.assayType}" ).eachDir{ i ->
     def target_id = i.getBaseName()
-    if (params.assayType == "genome" && target_id != "wgs" ){ return }
-    if (params.assayType != "genome" && target_id == "wgs" ){ return }
     result_array["${target_id}"] = [:]
-    for ( j in params.targets.keySet()) { // baitsInterval, targetsInterval, targetsBedGz, targetsBedGzTbi, codingBed
+    for ( j in params.targets.keySet()) { // baitsBed, targetsBed
       result_array."${target_id}" << [ ("$j".toString()) : evalTargetPath(j,target_id)]
     }
   }
diff --git a/modules/process/Targets/CreateBaitsetFiles.nf b/modules/process/Targets/CreateBaitsetFiles.nf
new file mode 100644
index 00000000..e5cdd9d9
--- /dev/null
+++ b/modules/process/Targets/CreateBaitsetFiles.nf
@@ -0,0 +1,55 @@
+process CreateBaitsetFiles {
+  tag "${targetId}"
+
+  input:
+    tuple val(targetId), path("raw_targets.bed"), path("raw_baits.bed")
+    path(genomeFile)
+    path(genomeIndex)
+    path(genomeDict)
+    path(codingRegions)
+
+  output:
+    tuple val(targetId), path(targetInterval), path(baitInterval), emit: baitsetInterval
+    tuple val(targetId), path(codingBaitsetBed), emit: codingBaitsetBed
+    tuple val(targetId), path("${targetPlus5}.gz"), path("${targetPlus5}.gz.tbi"), emit:baitsetPlus5
+    tuple val(targetId), path(targetPlus5), emit:baitsetPlus5_unzipped
+
+  script:
+  targetInterval = "${targetId}.targets.ilist"
+  baitInterval = "${targetId}.baits.ilist"
+  codingBaitsetBed = "${targetId}.coding.bed"
+  targetBed = "${targetId}.targets.bed"
+  baitBed = "${targetId}.baits.bed"
+  targetPlus5 = "${targetId}.plus5bp.bed"
+  """
+  bedtools sort -i raw_targets.bed | bedtools merge -i - > ${targetBed}
+  bedtools sort -i raw_baits.bed | bedtools merge -i - > ${baitBed}
+
+  bedtools intersect \\
+    -a ${codingRegions} \\
+    -b ${targetBed} > \\
+    intersect.bed
+  sort -k1,1 -k 2,2n -k 3,3n intersect.bed > intersect.sorted.bed
+  bedtools merge -i intersect.sorted.bed > ${codingBaitsetBed}
+
+  cut -f 1,2 ${genomeIndex} > this.genome
+  bedtools slop \\
+    -i ${targetBed} \\
+    -g ./this.genome \\
+    -b 5 > \\
+    ${targetPlus5}
+  bgzip -c ${targetPlus5} > ${targetPlus5}.gz
+  tabix -p bed ${targetPlus5}.gz
+
+  gatk BedToIntervalList \\
+    -I ${targetBed} \\
+    -O ${targetInterval} \\
+    -SD ${genomeDict}
+
+  gatk BedToIntervalList \\
+    -I ${baitBed} \\
+    -O ${baitInterval} \\
+    -SD ${genomeDict}
+
+  """
+}
diff --git a/modules/subworkflow/germlineSNV_wf.nf b/modules/subworkflow/germlineSNV_wf.nf
index d2386217..1e76a280 100644
--- a/modules/subworkflow/germlineSNV_wf.nf
+++ b/modules/subworkflow/germlineSNV_wf.nf
@@ -12,6 +12,7 @@ workflow germlineSNV_wf
     bamsTumor
     mergedIList
     facetsForMafAnno
+    baitsetPlus5

   main:
     referenceMap = params.referenceMap
@@ -46,9 +47,13 @@ workflow germlineSNV_wf
     GermlineCombineHaplotypecallerVcf(haplotypecaller4Combine,
       Channel.value([referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict]))

-    bams.map{ idNormal, target, bamNormal, baiNormal ->
-      [idNormal, target, bamNormal, baiNormal, targetsMap."$target".targetsBedGz, targetsMap."$target".targetsBedGzTbi]
-    }.set{ bamsForStrelkaGermline }
+    bams
+      .combine(baitsetPlus5)
+      .filter{ idNormal, target, bamNormal, baiNormal, target2, bedGz, bedGzTbi ->
+        target == target2
+      }.map{ idNormal, target, bamNormal, baiNormal, target2, bedGz, bedGzTbi ->
+        [idNormal, target, bamNormal, baiNormal, bedGz, bedGzTbi]
+      }.set{bamsForStrelkaGermline}

     GermlineRunStrelka2(bamsForStrelkaGermline,
       Channel.value([referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict]))
diff --git a/modules/subworkflow/sampleQC_wf.nf b/modules/subworkflow/sampleQC_wf.nf
index 15329acc..9a41a14b 100644
--- a/modules/subworkflow/sampleQC_wf.nf
+++ b/modules/subworkflow/sampleQC_wf.nf
@@ -9,14 +9,20 @@ workflow sampleQC_wf
   take:
     inputChannel
     fastPJson
+    intervals
+    targetsBed
+    targetsBedGz

   main:
     referenceMap = params.referenceMap
     targetsMap = params.targetsMap

-    inputChannel.map{ idSample, target, bam, bai ->
-      [idSample, target, bam, bai, targetsMap."$target".targetsInterval, targetsMap."$target".baitsInterval]
-    }.set{ bamsBQSR4HsMetrics }
+    inputChannel.combine(intervals)
+      .filter{ idSample, target, bam, bai, target2, targetsInterval, baitsInterval ->
+        target == target2
+      }.map{ idSample, target, bam, bai, target2, targetsInterval, baitsInterval ->
+        [idSample, target, bam, bai, targetsInterval, baitsInterval]
+      }.set{ bamsBQSR4HsMetrics }

     QcCollectHsMetrics(bamsBQSR4HsMetrics,
       Channel.value([referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict]))
@@ -30,16 +36,19 @@ workflow sampleQC_wf
     }

     inputChannel
-      .map{ idSample, target, bam, bai -> [ idSample, target, bam, bai, file(targetsMap."$target".targetsBed) ]}
+      .combine(targetsBed)
+      .filter{ idSample, target, bam, bai, targets2, bedfile -> target == targets2 }
+      .map{ idSample, target, bam, bai, targets2, bedfile -> [idSample, target, bam, bai, bedfile] }
       .set{ bamsBQSR4Qualimap }

     QcQualimap(bamsBQSR4Qualimap)
     Channel.from(true, false).set{ ignore_read_groups }

     inputChannel
-      .map{ idSample, target, bam, bai ->
-        [ idSample, target, bam, bai, targetsMap."$target".targetsBedGz, targetsMap."$target".targetsBedGzTbi ]
-      }.set{ bamsBQSR4Alfred }
+      .combine(targetsBedGz)
+      .filter{ idSample, target, bam, bai, targets2, bedGz, bedGzTbi -> target == targets2 }
+      .map{ idSample, target, bam, bai, targets2, bedGz, bedGzTbi -> [idSample, target, bam, bai, bedGz, bedGzTbi] }
+      .set{ bamsBQSR4Alfred }

     QcAlfred(ignore_read_groups,
       bamsBQSR4Alfred,
diff --git a/modules/subworkflow/scatter_wf.nf b/modules/subworkflow/scatter_wf.nf
index 990f0be0..b3993637 100644
--- a/modules/subworkflow/scatter_wf.nf
+++ b/modules/subworkflow/scatter_wf.nf
@@ -2,15 +2,13 @@ include { CreateScatteredIntervals } from '../process/Scatter/CreateScatteredI

 workflow scatter_wf
 {
+  take:
+    targets4Intervals
+
   main:
     referenceMap = params.referenceMap
     targetsMap = params.targetsMap

-    targets4Intervals = Channel.from(targetsMap.keySet())
-      .map{ targetId ->
-        [ targetId, targetsMap."${targetId}".targetsBedGz, targetsMap."${targetId}".targetsBedGzTbi ]
-      }
-
     CreateScatteredIntervals(Channel.value([referenceMap.genomeFile,
       referenceMap.genomeIndex,
       referenceMap.genomeDict]),
diff --git a/modules/subworkflow/snv_wf.nf b/modules/subworkflow/snv_wf.nf
index 73abd62b..9b9acd63 100644
--- a/modules/subworkflow/snv_wf.nf
+++ b/modules/subworkflow/snv_wf.nf
@@ -14,6 +14,7 @@ workflow snv_wf
     mantaToStrelka
     hlaOutput
     facetsForMafAnno
+    baitsetPlus5

   main:
     referenceMap = params.referenceMap
@@ -52,8 +53,11 @@ workflow snv_wf
       Channel.value([referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict]))

     bamFiles.combine(mantaToStrelka, by: [0, 1, 2])
-      .map{ idTumor, idNormal, target, bamTumor, baiTumor, bamNormal, baiNormal, mantaCSI, mantaCSIi ->
-        [idTumor, idNormal, target, bamTumor, baiTumor, bamNormal, baiNormal, mantaCSI, mantaCSIi, targetsMap."$target".targetsBedGz, targetsMap."$target".targetsBedGzTbi]
+      .combine(baitsetPlus5)
+      .filter{ idTumor, idNormal, target, bamTumor, baiTumor, bamNormal, baiNormal, mantaCSI, mantaCSIi, target2, bedGz, bedGzTbi ->
+        target2 == target
+      }.map{ idTumor, idNormal, target, bamTumor, baiTumor, bamNormal, baiNormal, mantaCSI, mantaCSIi, target2, bedGz, bedGzTbi ->
+        [idTumor, idNormal, target, bamTumor, baiTumor, bamNormal, baiNormal, mantaCSI, mantaCSIi, bedGz, bedGzTbi ]
       }.set{ input4Strelka }

     SomaticRunStrelka2(input4Strelka,
diff --git a/modules/subworkflow/targets_wf.nf b/modules/subworkflow/targets_wf.nf
new file mode 100644
index 00000000..ab186d5a
--- /dev/null
+++ b/modules/subworkflow/targets_wf.nf
@@ -0,0 +1,25 @@
+include { CreateBaitsetFiles } from '../process/Targets/CreateBaitsetFiles'
+
+workflow targets_wf
+{
+  main:
+    referenceMap = params.referenceMap
+    targetsMap = params.targetsMap
+
+    CreateBaitsetFiles(
+      Channel.from(targetsMap.keySet())
+        .map{ targetId ->
+          [targetId, targetsMap."${targetId}".targetsBed, targetsMap."${targetId}".baitsBed]
+        },
+      referenceMap.genomeFile,
+      referenceMap.genomeIndex,
+      referenceMap.genomeDict,
+      referenceMap.codingRegions
+    )
+
+  emit:
+    baitsetInterval = CreateBaitsetFiles.out.baitsetInterval
+    codingBaitsetBed = CreateBaitsetFiles.out.codingBaitsetBed
+    baitsetPlus5 = CreateBaitsetFiles.out.baitsetPlus5
+    baitsetPlus5_unzipped = CreateBaitsetFiles.out.baitsetPlus5_unzipped
+}
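
A note on the recurring channel-join pattern this patch introduces in `snv_wf`, `germlineSNV_wf`, `sampleQC_wf`, and the `mergedChannelMetaDataParser` block of `dsl2.nf`: because the baitset files are now produced at runtime by `CreateBaitsetFiles` instead of being looked up statically in `targetsMap`, each consumer joins its per-sample channel against the per-target output channel. `combine` forms the Cartesian product of the two channels, `filter` keeps only the rows whose two target keys agree, and `map` drops the duplicate key. A minimal self-contained sketch of the pattern; the sample IDs, target names, and file names are hypothetical:

```nextflow
nextflow.enable.dsl = 2

workflow {
    // Per-sample tuples: [idSample, target, bam, bai] (all values illustrative).
    bams = Channel.of(
        ['sampleA', 'idt',     'sampleA.bam', 'sampleA.bai'],
        ['sampleB', 'agilent', 'sampleB.bam', 'sampleB.bai'])

    // One tuple per target, shaped like CreateBaitsetFiles.out.baitsetPlus5.
    baitsetPlus5 = Channel.of(
        ['idt',     'idt.plus5bp.bed.gz',     'idt.plus5bp.bed.gz.tbi'],
        ['agilent', 'agilent.plus5bp.bed.gz', 'agilent.plus5bp.bed.gz.tbi'])

    bams.combine(baitsetPlus5)          // Cartesian product of the two channels
        .filter{ idSample, target, bam, bai, target2, bedGz, bedGzTbi ->
            target == target2           // keep only rows whose target keys match
        }
        .map{ idSample, target, bam, bai, target2, bedGz, bedGzTbi ->
            [idSample, target, bam, bai, bedGz, bedGzTbi]  // drop the duplicate key
        }
        .view()
}
```

`combine(ch, by: i)` would express a keyed join directly, but `by:` expects the key at the same tuple index in both channels; here the target sits at index 1 or 2 in the per-sample channels and at index 0 in the `CreateBaitsetFiles` outputs, hence the explicit `filter` plus `map`.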
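`CreateBaitsetFiles` collapses the five precomputed per-target files that `conf/references.config` used to point at (`baitsInterval`, `targetsInterval`, `targetsBedGz`, `targetsBedGzTbi`, `codingBed`) into artifacts derived at runtime from just `targets.bed` and `baits.bed`: Picard-style interval lists via `gatk BedToIntervalList`, the coding baitset via `bedtools intersect` against the new `codingRegions` reference followed by sort and merge, and the 5bp-padded, bgzipped, tabix-indexed BED via `bedtools slop`. The `cut -f 1,2` on the FASTA index works because a `.fai`'s first two columns (contig name, contig length) are exactly the "genome file" format `bedtools slop` expects. Below is a hypothetical standalone harness for exercising the process on a single target; all paths and the target name `idt` are placeholders, and bedtools, gatk, bgzip, and tabix are assumed to be on PATH (or supplied via the container configured above):

```nextflow
nextflow.enable.dsl = 2

include { CreateBaitsetFiles } from './modules/process/Targets/CreateBaitsetFiles'

workflow {
    // Reference files (placeholder locations).
    genomeFile    = file('/refs/genome.fasta')
    genomeIndex   = file('/refs/genome.fasta.fai')
    genomeDict    = file('/refs/genome.dict')
    codingRegions = file('/refs/ensGene.all_CODING_exons.reference.bed')

    // A single [targetId, targets.bed, baits.bed] tuple, as targets_wf would build it.
    rawBaitset = Channel.of(
        ['idt', file('/targets/exome/idt/targets.bed'),
                file('/targets/exome/idt/baits.bed')])

    CreateBaitsetFiles(rawBaitset, genomeFile, genomeIndex, genomeDict, codingRegions)

    CreateBaitsetFiles.out.baitsetPlus5.view()  // [targetId, plus5bp.bed.gz, .tbi]
}
```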
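`loadTargetReferences()` now discovers targets from the filesystem: every subdirectory of `${params.targets_base}/${params.assayType}` becomes a `target_id`, and the two remaining `params.targets` templates (`targetsBed`, `baitsBed`) are resolved inside it. This replaces the hard-coded `wgs` special-casing, since WGS targets simply live under the `genome` assay directory. The on-disk layout this implies, with illustrative assay and target names:

```
${params.targets_base}/
├── exome/            <- params.assayType == "exome"
│   ├── idt/
│   │   ├── targets.bed
│   │   └── baits.bed
│   └── agilent/
│       ├── targets.bed
│       └── baits.bed
└── genome/           <- params.assayType == "genome"
    └── wgs/
        ├── targets.bed
        └── baits.bed
```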
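Finally, since every conditional block in `dsl2.nf` that runs at all now reads some `targets_wf` output, the new subworkflow is gated with `doWF_targets = WFs.size() > 0` rather than being tied to any one pipeline: `targets_wf.out` is only defined once the workflow has been invoked, so its guard must be at least as permissive as that of every consumer. A condensed sketch of the flag evaluation; the contents of `WFs` here are hypothetical:

```nextflow
// Condensed from dsl2.nf; WFs normally derives from the requested pipelines.
WFs = ['snv', 'qc']

doWF_targets = WFs.size() > 0                                          // true whenever anything runs
doWF_manta   = ['snv', 'sv', 'mutsig'].any(it -> it in WFs) ? true : false             // true
doWF_scatter = ['snv', 'sv', 'mutsig', 'germsnv'].any(it -> it in WFs) ? true : false  // true
doWF_germSNV = 'germsnv' in WFs ? true : false                                          // false
```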