From 86220ec5b0399fcf99f9f02002cd5a981b7d4563 Mon Sep 17 00:00:00 2001
From: Stephen Watts
Date: Fri, 1 Dec 2023 18:33:20 +1100
Subject: [PATCH 01/86] Basic isolated alignment subworkflow outline

---
 modules/local/bwa/mem2/main.nf        | 39 +++++++++++++++++
 modules/local/markdups/main.nf        | 38 ++++++++++++++++
 modules/local/star/main.nf            | 40 +++++++++++++++++
 subworkflows/local/read_alignment.nf  | 39 +++++++++++++++++
 subworkflows/local/read_processing.nf | 34 +++++++++++++++
 workflows/wgts.nf                     | 62 +++++++++++++++++++++++++++
 6 files changed, 252 insertions(+)
 create mode 100644 modules/local/bwa/mem2/main.nf
 create mode 100644 modules/local/markdups/main.nf
 create mode 100644 modules/local/star/main.nf
 create mode 100644 subworkflows/local/read_alignment.nf
 create mode 100644 subworkflows/local/read_processing.nf

diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf
new file mode 100644
index 00000000..3b55a8a1
--- /dev/null
+++ b/modules/local/bwa/mem2/main.nf
@@ -0,0 +1,39 @@
+process BWA_MEM2 {
+    tag "${meta.id}"
+    label 'process_high'
+
+    // TODO(SW): create container
+    //container 'foo'
+
+    input:
+    // TODO(SW): decide input structure
+    tuple val(meta), path(fastqs)
+
+    output:
+    // TODO(SW): set outputs
+    tuple val(meta), path('bar'), emit: bam
+    path 'versions.yml'         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    // TODO(SW): implement process
+    """
+    touch bar
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bwamem2: foo
+    END_VERSIONS
+    """
+
+    stub:
+    // TODO(SW): implement stub
+    """
+    touch bar
+    echo -e '${task.process}:\\n    stub: noversions\\n' > versions.yml
+    """
+}

diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf
new file mode 100644
index 00000000..ea02ed90
--- /dev/null
+++ b/modules/local/markdups/main.nf
@@ -0,0 +1,38 @@
+process MARKDUPS {
+    tag "${meta.id}"
+    label 'process_low'
+
+    // TODO(SW): create container
+    //container 'foo'
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    // TODO(SW): set outputs
+    tuple val(meta), path('bar'), emit: bam
+    path 'versions.yml'         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    // TODO(SW): implement process
+    """
+    touch bar
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        markdups: foo
+    END_VERSIONS
+    """
+
+    stub:
+    // TODO(SW): implement stub
+    """
+    touch bar
+    echo -e '${task.process}:\\n    stub: noversions\\n' > versions.yml
+    """
+}

diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf
new file mode 100644
index 00000000..780139b3
--- /dev/null
+++ b/modules/local/star/main.nf
@@ -0,0 +1,40 @@
+process STAR {
+    tag "${meta.id}"
+    label 'process_low'
+
+    // TODO(SW): create container
+    //container 'foo'
+
+    input:
+    // TODO(SW): decide input structure
+    tuple val(meta), path(fastqs)
+
+    output:
+    // TODO(SW): set outputs
+    tuple val(meta), path('bar'), emit: bam
+    path 'versions.yml'         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+
+    // TODO(SW): implement process
+    """
+    touch bar
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        star: foo
+    END_VERSIONS
+    """
+
+    stub:
+    // TODO(SW): implement stub
+    """
+    touch bar
+    echo -e '${task.process}:\\n    stub: noversions\\n' > versions.yml
+    """
+}
+

diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf
new file mode 100644
index
00000000..a13025fa --- /dev/null +++ b/subworkflows/local/read_alignment.nf @@ -0,0 +1,39 @@ +include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' +include { STAR } from '../../modules/local/star/main' + +workflow READ_ALIGNMENT { + take: + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // STAR + // TODO(SW): implement inputs + ch_star_inputs = Channel.of([[id: 'foo'], []]) + STAR( + ch_star_inputs, + // TODO(SW): include reference files + ) + // TODO(SW): implement outputs + ch_star_outputs = Channel.empty() + + // BWA MEM2 + // TODO(SW): implement inputs + ch_bwa_inputs = Channel.of([[id: 'foo'], []]) + BWA_MEM2( + ch_bwa_inputs, + // TODO(SW): include reference files + ) + // TODO(SW): implement outputs + ch_bwa_outputs = Channel.empty() + + emit: + dna = ch_bwa_outputs // channel: [ meta, bam_dna ] + rna = ch_star_outputs // channel: [ meta, bam_rna ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf new file mode 100644 index 00000000..af0bb10c --- /dev/null +++ b/subworkflows/local/read_processing.nf @@ -0,0 +1,34 @@ +include { MARKDUPS } from '../../modules/local/markdups/main' + +workflow READ_PROCESSING { + take: + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + ch_dna_bams // channel: [mandatory] [ meta, bam_dna ] + ch_rna_bams // channel: [mandatory] [ meta, bam_rna ] + + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // NOTE(SW): channel operations will be required to configure MarkDups for individual samples + + // MarkDups + // TODO(SW): implement inputs + ch_markdups_inputs = Channel.of([[id: 'foo'], []]) + MARKDUPS( + ch_markdups_inputs, + // TODO(SW): configuration + // TODO(SW): reference files + ) + // TODO(SW): implement outputs + ch_markduplicates_dna_out = Channel.empty() + ch_markduplicates_rna_out = Channel.empty() + + emit: + dna = ch_markduplicates_dna_out // channel: [ meta, bam_dna ] + rna = ch_markduplicates_rna_out // channel: [ meta, bam_rna ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 5ffa4995..5e3476d9 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -83,6 +83,8 @@ include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting' include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation' include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference' include { PURPLE_CALLING } from '../subworkflows/local/purple_calling' +include { READ_ALIGNMENT } from '../subworkflows/local/read_alignment' +include { READ_PROCESSING } from '../subworkflows/local/read_processing' include { SAGE_APPEND } from '../subworkflows/local/sage_append' include { SAGE_CALLING } from '../subworkflows/local/sage_calling' include { SIGS_FITTING } from '../subworkflows/local/sigs_fitting' @@ -127,6 +129,65 @@ workflow WGTS { // Set GRIDSS config gridss_config = params.containsKey('gridss_config') ? 
file(params.gridss_config) : hmf_data.gridss_config + // + // SUBWORKFLOW: Align reads + // + // channel: [ meta, bam_dna ] + ch_dna_alignment_out = Channel.empty() + // channel: [ meta, bam_rna ] + ch_rna_alignment_out = Channel.empty() + // TODO(SW): set up correctly + if (true | run_config.stages.alignment) { + + READ_ALIGNMENT( + ch_inputs, + // alignment reference files + ) + + ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + + ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) + ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) + + } else { + + ch_dna_alignment_out = ch_inputs.map { meta -> [meta, []] } + ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + + } + + // + // SUBWORKFLOW: Process read alignments + // + // channel: [ meta, bam_dna ] + ch_dna_processed_out = Channel.empty() + // channel: [ meta, bam_rna ] + ch_rna_processed_out = Channel.empty() + // TODO(SW): set up correctly + if (true | run_config.stages.markdups) { + + READ_PROCESSING( + ch_inputs, + ch_dna_alignment_out, + ch_rna_alignment_out, + ) + + ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) + + ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) + ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) + + } else { + + ch_dna_processed_out = ch_inputs.map { meta -> [meta, []] } + ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + + } + + // TODO(SW): adjust downstream selection of input BAM + + /* + // // MODULE: Run Isofox to analyse RNA data // @@ -714,6 +775,7 @@ workflow WGTS { CUSTOM_DUMPSOFTWAREVERSIONS( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) + */ } /* From 4e92135a6694542d0c98dd0f4f13afc69c26be91 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Sun, 11 Feb 2024 12:49:47 +1100 Subject: [PATCH 02/86] Initial implementation of alignment workflow. --- lib/Constants.groovy | 14 ++ lib/Utils.groovy | 89 +++++++- modules/local/bwa/Dockerfile | 9 + modules/local/bwa/mem/main.nf | 54 +++++ modules/local/fastp/Dockerfile | 8 + modules/local/fastp/main.nf | 38 ++++ modules/local/markdups/Dockerfile | 12 ++ modules/local/markdups/main.nf | 91 ++++++-- modules/local/sambamba/Dockerfile | 8 + modules/local/sambamba/index/main.nf | 23 ++ subworkflows/local/alignment.nf | 305 +++++++++++++++++++++++++++ temp/genomes_GRCh37_hmf.config | 17 ++ workflows/targeted.nf | 2 + workflows/wgts.nf | 103 +++++---- 14 files changed, 707 insertions(+), 66 deletions(-) create mode 100644 modules/local/bwa/Dockerfile create mode 100644 modules/local/bwa/mem/main.nf create mode 100644 modules/local/fastp/Dockerfile create mode 100644 modules/local/fastp/main.nf create mode 100644 modules/local/markdups/Dockerfile create mode 100644 modules/local/sambamba/Dockerfile create mode 100644 modules/local/sambamba/index/main.nf create mode 100644 subworkflows/local/alignment.nf create mode 100644 temp/genomes_GRCh37_hmf.config diff --git a/lib/Constants.groovy b/lib/Constants.groovy index 8f2eb1fd..f3fc958b 100644 --- a/lib/Constants.groovy +++ b/lib/Constants.groovy @@ -34,21 +34,26 @@ class Constants { } static enum Process { + // TODO[MC]: Add process here. 
+        BWAMEM,
         AMBER,
         BAMTOOLS,
         CHORD,
         COBALT,
         CUPPA,
+        FASTP,
         FLAGSTAT,
         GRIDSS,
         GRIPSS,
         ISOFOX,
         LILAC,
         LINX,
+        MARKDUPS,
         ORANGE,
         PAVE,
         PURPLE,
         SAGE,
+        SAMBAMBA_INDEX,
         SIGS,
         VIRUSINTERPRETER,
     }
@@ -57,6 +62,9 @@ class Constants {
         // Generic
         BAM,
         BAI,
+        BAM_MARKDUPS,
+        BAI_MARKDUPS,
+        FASTQ,
         // Process
         AMBER_DIR,
         BAMTOOLS,
@@ -97,9 +105,15 @@ class Constants {
         DNA_RNA,
     }

+    static List DNA_SAMPLE_KEYS = [
+        [Constants.SampleType.TUMOR, Constants.SequenceType.DNA],
+        [Constants.SampleType.NORMAL, Constants.SequenceType.DNA],
+    ]
+
     static Map PLACEHOLDER_META = [meta_placeholder: null]
     static List PLACEHOLDER_OPTIONAL_CHANNEL = []

+    // TODO(MC): How is this used?
     static Map INPUT = [
         ISOFOX_DIR: [
diff --git a/lib/Utils.groovy b/lib/Utils.groovy
index cf2b1e0a..a3adb2cb 100644
--- a/lib/Utils.groovy
+++ b/lib/Utils.groovy
@@ -88,6 +88,9 @@ class Utils {
         if (key === Constants.FileType.BAM) {
             index_enum = Constants.FileType.BAI
             index_str = 'bai'
+        } else if (key === Constants.FileType.BAM_MARKDUPS) {
+            index_enum = Constants.FileType.BAI_MARKDUPS
+            index_str = 'bai'
         } else if (key === Constants.FileType.GRIDSS_VCF) {
             index_enum = Constants.FileType.GRIDSS_VCF_TBI
             index_str = 'tbi'
@@ -177,7 +180,7 @@ class Utils {

         inputs.each { meta ->

-            // Require BAMs for each defined sample type
+            // Require a BAM, BAM_MARKDUPS, or FASTQ input for each defined sample type
             // NOTE(SW): repeating key pairs above to avoid having to duplicate error messages
             sample_keys.each { key ->

@@ -187,9 +190,12 @@ class Utils {

                 def (sample_type, sequence_type) = key

-                if (!meta[key].containsKey(Constants.FileType.BAM)) {
-                    log.error "no BAM provided for ${meta.group_id} ${sample_type}/${sequence_type}\n\n" +
-                        "NB: BAMs are always required as they are the basis to determine input sample type."
+                if (!meta[key].containsKey(Constants.FileType.BAM) &&
+                    !meta[key].containsKey(Constants.FileType.BAM_MARKDUPS) &&
+                    !meta[key].containsKey(Constants.FileType.FASTQ)) {

+                    log.error "no BAM, BAM_MARKDUPS, or FASTQ input provided for ${meta.group_id} ${sample_type}/${sequence_type}\n\n" +
+                        "NB: a BAM, BAM_MARKDUPS, or FASTQ input is always required as it is the basis to determine input sample type."
System.exit(1) } @@ -367,6 +373,49 @@ class Utils { return getNormalDnaBam(meta) !== null } + static public getNormalDnaMarkdupsBam(meta) { + def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) + return meta_sample.getOrDefault(Constants.FileType.BAM_MARKDUPS, null) + } + + static public hasNormalDnaMarkdupsBam(meta) { + return getNormalDnaMarkdupsBam(meta) !== null + } + + static public getTumorDnaMarkdupsBam(meta) { + def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) + return meta_sample.getOrDefault(Constants.FileType.BAM_MARKDUPS, null) + } + + static public hasTumorDnaMarkdupsBam(meta) { + return getTumorDnaMarkdupsBam(meta) !== null + } + + static public hasDnaMarkdupsBam(meta) { + return hasNormalDnaMarkdupsBam(meta) || hasTumorDnaMarkdupsBam(meta) + } + + static public getNormalDnaFastq(meta) { + def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) + return meta_sample.getOrDefault(Constants.FileType.FASTQ, null) + } + + static public hasNormalDnaFastq(meta) { + return getNormalDnaFastq(meta) !== null + } + + static public getTumorDnaFastq(meta) { + def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) + return meta_sample.getOrDefault(Constants.FileType.FASTQ, null) + } + + static public hasTumorDnaFastq(meta) { + return getTumorDnaFastq(meta) !== null + } + + static public hasDnaFastq(meta) { + return hasNormalDnaFastq(meta) || hasTumorDnaFastq(meta) + } static public getRunMode(run_mode, log) { def run_mode_enum = Utils.getEnumFromString(run_mode, Constants.RunMode) @@ -404,4 +453,36 @@ class Utils { } } + // Alignment utils. + static public splitGroupIntoSamples(meta_group) { + def sample_entries = [:] + def common_entries = [:] + meta_group.each { key, value -> + + if ((value instanceof java.util.Map) && value.containsKey('sample_id')) { + sample_entries[key] = value + } else { + common_entries[key] = value + } + } + + def meta_samples = [] + sample_entries.each { key, value -> + + def meta_sample = common_entries.getClass().newInstance(common_entries) + meta_sample[key] = value + meta_samples.add(meta_sample) + } + + return meta_samples + } + + static public readGroupFromFastqPath(fastq_path) { + def base_name = fastq_path.split('/')[-1] + def pattern = /^(.+)_\d+\.fastq$/ + def matcher = base_name =~ pattern + assert matcher.find() + return matcher[0][1] + } + } diff --git a/modules/local/bwa/Dockerfile b/modules/local/bwa/Dockerfile new file mode 100644 index 00000000..2172ebc4 --- /dev/null +++ b/modules/local/bwa/Dockerfile @@ -0,0 +1,9 @@ +FROM docker.io/continuumio/miniconda3:23.10.0-1 + +RUN \ + conda install -y -n base conda-libmamba-solver && \ + conda config --set solver libmamba && \ + conda install -y -c bioconda -c conda-forge -c conda \ + 'bwa==0.7.17' \ + 'sambamba==1.0' && \ + conda clean -yaf diff --git a/modules/local/bwa/mem/main.nf b/modules/local/bwa/mem/main.nf new file mode 100644 index 00000000..428d1ce9 --- /dev/null +++ b/modules/local/bwa/mem/main.nf @@ -0,0 +1,54 @@ +// TODO(MC): BWA MEM2: Need docker and resource files. + +process BWA_MEM { + tag "${meta.id}" + + // TODO(MC): What process label? 
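+    // (For reference — an assumption based on the standard nf-core base config,
+    // not confirmed for this pipeline: 'process_medium' ~ 6 CPUs / 36 GB,
+    // 'process_high' ~ 12 CPUs / 72 GB.)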
+ // label 'process_medium' + + container 'docker.io/scwatts/bwa:0.7.17-sambamba' + + input: + tuple val(meta), path(reads_fwd), path(reads_rev) + path genome_fasta + path genome_bwa_index + + output: + tuple val(meta), path('*bam'), emit: bam + + // TODO(MC): How does this work? + when: + task.ext.when == null || task.ext.when + + // # TODO(MC): read group + // # -R ${meta.read_group} + + script: + """ + ln -s \$(find -L ${genome_bwa_index} -type f) ./ + + bwa mem \\ + -Y \\ + -t ${task.cpus} \\ + ${genome_fasta} \\ + ${reads_fwd} \\ + ${reads_rev} | \\ + \\ + sambamba view \\ + --sam-input \\ + --format bam \\ + --compression-level 0 \\ + --nthreads ${task.cpus} \\ + /dev/stdin | \\ + \\ + sambamba sort \\ + --nthreads ${task.cpus} \\ + --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ + /dev/stdin + """ + + stub: + """ + touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam + """ +} diff --git a/modules/local/fastp/Dockerfile b/modules/local/fastp/Dockerfile new file mode 100644 index 00000000..5d7cdbf9 --- /dev/null +++ b/modules/local/fastp/Dockerfile @@ -0,0 +1,8 @@ +FROM docker.io/continuumio/miniconda3:23.10.0-1 + +RUN \ + conda install -y -n base conda-libmamba-solver && \ + conda config --set solver libmamba && \ + conda install -y -c bioconda -c conda-forge -c conda \ + 'fastp==0.23.4' && \ + conda clean -yaf diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf new file mode 100644 index 00000000..8deec567 --- /dev/null +++ b/modules/local/fastp/main.nf @@ -0,0 +1,38 @@ +process FASTP { + tag "${meta.id}" + + // TODO(MC): Resources? + + container 'docker.io/scwatts/fastp:0.23.4' + + input: + tuple val(meta), path(reads_fwd), path(reads_rev) + + output: + tuple val(meta), path('*_R1.fastp.fastq'), path('*_R2.fastp.fastq'), emit: fastq + + script: + // TODO(MC): UMI flags + // --umi \\ + // --umi_loc per_read \\ + // --umi_len 7 \\ + // --umi_skip 1 \\ + + """ + # * do not apply trimming/clipping, already done in BCL convert + + fastp \\ + --in1 ${reads_fwd} \\ + --in2 ${reads_rev} \\ + --disable_adapter_trimming \\ + --split_by_lines 40000000 \\ + --out1 ${meta.sample_id}_${meta.read_group}_R1.fastp.fastq \\ + --out2 ${meta.sample_id}_${meta.read_group}_R2.fastp.fastq + """ + + stub: + """ + touch 00{1..4}.${meta.sample_id}_${meta.read_group}_R1.fastp.fastq + touch 00{1..4}.${meta.sample_id}_${meta.read_group}_R2.fastp.fastq + """ +} diff --git a/modules/local/markdups/Dockerfile b/modules/local/markdups/Dockerfile new file mode 100644 index 00000000..d7232f05 --- /dev/null +++ b/modules/local/markdups/Dockerfile @@ -0,0 +1,12 @@ +FROM docker.io/continuumio/miniconda3:23.10.0-1 + +RUN \ + conda install -y -n base conda-libmamba-solver && \ + conda config --set solver libmamba && \ + conda install -y -c bioconda -c conda-forge -c conda \ + 'sambamba==1.0' 'samtools==1.17' 'openjdk >=8' && \ + conda clean -yaf + +RUN \ + mkdir -p /opt/markdups/ && \ + wget -O /opt/markdups/markdups.jar 'https://github.com/hartwigmedical/hmftools/releases/download/mark-dups-v1.1/mark-dups_v1.1.rc1.jar' diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index ea02ed90..7a25e728 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,38 +1,89 @@ process MARKDUPS { - tag "${meta.id}" - label 'process_low' + tag "${meta_bam.id}" - // TODO(SW): create container - //container 'foo' + // TODO(MC): Resources required? 
+ // label 'process_low' + + container 'docker.io/scwatts/markdups:1.1.rc1' input: - tuple val(meta), path(bam) + tuple val(meta_bam), path(bams), path(bais) + path genome_fasta + path genome_fai + path genome_dict + path unmap_regions output: - // TODO(SW): set outputs - tuple val(meta), path('bar'), emit: bam - path 'versions.yml' , emit: versions + tuple val(meta_bam), path('*bam'), path('*bai'), emit: bam + path '*.tsv' + // TODO(MC): Make sure this is in each. when: task.ext.when == null || task.ext.when - script: - def args = task.ext.args ?: '' + // TODO(MC): Versions in each. + // path 'versions.yml' , emit: versions - // TODO(SW): implement process - """ - echo bar + // script: + // def args = task.ext.args ?: '' + + // // TODO(SW): implement process + // """ + // echo bar + + // cat <<-END_VERSIONS > versions.yml + // "${task.process}": + // markdups: foo + // END_VERSIONS + // """ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - markdups: foo - END_VERSIONS + // stub: + // // TODO(SW): implement stub + // """ + // touch bar + // echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + // """ + + // # TODO(MC): Umi flags + // # -multi_bam \\ + // # -umi_enabled \\ + // # -umi_duplex \\ + // # -umi_duplex_delim _ \\ + // # -umi_base_diff_stats \\ + + script: + """ + java \\ + -Xmx${Math.round(task.memory.bytes * 0.95)} \\ + -jar /opt/markdups/markdups.jar \\ + \\ + -samtools \$(which samtools) \\ + -sambamba \$(which sambamba) \\ + \\ + -sample ${meta_bam.sample_id} \\ + -input_bam ${bams.join(',')} \\ + \\ + -form_consensus \\ + \\ + -unmap_regions ${unmap_regions} \\ + -ref_genome ${genome_fasta} \\ + -ref_genome_version 37 \\ + \\ + -write_stats \\ + -threads 16 \\ + \\ + -output_bam ${meta_bam.sample_id}.mark_dups.bam """ stub: - // TODO(SW): implement stub """ - touch bar - echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + touch ${meta_bam.sample_id}.mark_dups.bam + touch ${meta_bam.sample_id}.mark_dups.bam.bai + touch ${meta_bam.sample_id}.duplicate_freq.tsv """ + + // # TODO(MC): + // # touch ${meta_bam.sample_id}.umi_coord_freq.tsv + // # touch ${meta_bam.sample_id}.umi_edit_distance.tsv + // # touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv } diff --git a/modules/local/sambamba/Dockerfile b/modules/local/sambamba/Dockerfile new file mode 100644 index 00000000..8c2c38c5 --- /dev/null +++ b/modules/local/sambamba/Dockerfile @@ -0,0 +1,8 @@ +FROM docker.io/continuumio/miniconda3:23.10.0-1 + +RUN \ + conda install -y -n base conda-libmamba-solver && \ + conda config --set solver libmamba && \ + conda install -y -c bioconda -c conda-forge -c conda \ + 'sambamba==1.0' && \ + conda clean -yaf diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf new file mode 100644 index 00000000..f6cf00ea --- /dev/null +++ b/modules/local/sambamba/index/main.nf @@ -0,0 +1,23 @@ +process SAMBAMBA_INDEX { + tag "${meta.id}" + + container 'docker.io/scwatts/sambamba:1.0' + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path(bam), path('*bai'), emit: bam + + script: + """ + sambamba index \\ + --nthreads ${task.cpus} \\ + ${meta.split}.${meta.sample_id}.${meta.read_group}.bam + """ + + stub: + """ + touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam.bai + """ +} diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf new file mode 100644 index 00000000..4b34ac29 --- /dev/null +++ b/subworkflows/local/alignment.nf @@ -0,0 +1,305 @@ +include { BWA_MEM } from 
'../../modules/local/bwa/mem/main' +include { MARKDUPS } from '../../modules/local/markdups/main' +include { FASTP } from '../../modules/local/fastp/main' +include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' + +workflow ALIGNMENT { + take: + ch_inputs // channel: [ meta ] + genome_fasta + genome_fai + genome_dict + genome_bwa_index + unmap_regions + max_fastq_lines + + main: + // channel: [ meta ] (One sample per record). + ch_meta_samples = ch_inputs.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } + + // Sort inputs + // channel: [ meta ] (One sample per record). + ch_meta_samples_sorted = ch_meta_samples + .branch { meta -> + runnable_fastq: Utils.hasDnaFastq(meta) + runnable_markdups: Utils.hasDnaMarkdupsBam(meta) + skip: true + } + + // TODO(MC): Simplify this branch. + if (max_fastq_lines > 0) { + // Split fastq files using fastp. + + // Create fastp process input channel. + // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] + ch_fastp_inputs = ch_meta_samples_sorted.runnable_fastq + .flatMap { meta -> + + def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } + if (sample_key === null) { + log.error "No DNA sample found" + System.exit(1) + } + + def sample_id = meta[sample_key]['sample_id'] + def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') + + // TODO(MC): Validate fastq_files. + + def meta_fastq_common = [:] + meta.each { key, value -> + + + if (key === sample_key) { + return + } + + meta_fastq_common[key] = meta[key] + } + meta_fastq_common['sample_key'] = sample_key + meta_fastq_common['sample_id'] = sample_id + + def fastq_pairs = [] + for (i = 0; i < fastq_files.size(); i += 2) { + def reads_fwd = fastq_files[i] + def reads_rev = fastq_files[i + 1] + + def meta_fastq = meta_fastq_common.getClass().newInstance(meta_fastq_common) + meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) + + fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) + } + + fastq_pairs + } + + FASTP(ch_fastp_inputs) + + // TODO(MC): See WISP implementation. + // Create inputs for bwa mem. + // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] + ch_bwa_mem_inputs = FASTP.out.fastq.flatMap { fastq -> + + def meta = fastq[0] + def fwd_reads = fastq[1] + def rev_reads = fastq[2] + + // Pair up the reads. + def read_pairs = [:] + fwd_reads.each { fastq_path -> + + def base_name = fastq_path.getFileName().toString() + def pattern = /^(\d+)\.(.+)_R[12]\.fastp\.fastq$/ + def matcher = base_name =~ pattern + assert matcher.find() + def split = matcher[0][1] + def key = "${split}.${matcher[0][2]}" + assert !read_pairs.containsKey(key) + read_pairs[key] = [split, fastq_path] + } + + rev_reads.each { fastq_path -> + + def base_name = fastq_path.getFileName().toString() + def pattern = /^(.+)_R[12]\.fastp\.fastq$/ + def matcher = base_name =~ pattern + assert matcher.find() + def key = matcher[0][1] + assert read_pairs.containsKey(key) + read_pairs[key].add(fastq_path) + } + + def fastqs = [] + read_pairs.values().each { split_fastq_pair -> + + meta_fastq = meta.getClass().newInstance(meta) + meta_fastq['split'] = split_fastq_pair[0] + + fastqs.add([meta_fastq, split_fastq_pair[1], split_fastq_pair[2]]) + } + + fastqs + } + } else { + + // Skip splitting fastq files using fastp. + + // Create inputs for bwa mem. 
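+        // (No fastp split happens in this branch, so each read pair keeps a single
+        // placeholder split id of '000' below.)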
+        // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ]
+        ch_bwa_mem_inputs = ch_meta_samples_sorted.runnable_fastq
+            .flatMap { meta ->
+
+                def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) }
+                if (sample_key === null) {
+                    log.error "No DNA sample found"
+                    System.exit(1)
+                }
+
+                def sample_id = meta[sample_key]['sample_id']
+                def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';')
+
+                // TODO(MC): Validate fastq_files.
+
+                def meta_fastq_common = [:]
+                meta.each { key, value ->
+
+                    if (key === sample_key) {
+                        return
+                    }
+
+                    meta_fastq_common[key] = meta[key]
+                }
+                meta_fastq_common['sample_key'] = sample_key
+                meta_fastq_common['sample_id'] = sample_id
+
+                def fastq_pairs = []
+                for (i = 0; i < fastq_files.size(); i += 2) {
+                    def reads_fwd = fastq_files[i]
+                    def reads_rev = fastq_files[i + 1]
+
+                    def meta_fastq = meta_fastq_common.getClass().newInstance(meta_fastq_common)
+                    meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd)
+                    meta_fastq['split'] = '000'
+
+                    fastq_pairs.add([meta_fastq, reads_fwd, reads_rev])
+                }
+
+                fastq_pairs
+            }
+    }
+
+    // channel: [ meta_fastq, bam ]
+    BWA_MEM(
+        ch_bwa_mem_inputs,
+        genome_fasta,
+        genome_bwa_index,
+    )
+
+    // channel: [ meta_fastq, bam, bai ]
+    SAMBAMBA_INDEX(
+        BWA_MEM.out.bam,
+    )
+
+    // Prepare input to markdups process.
+    // First we prepare a channel of inputs that have gone through alignment.
+    // channel: [ meta_bam, bams, bais ]
+    ch_fastq_markdups_inputs = SAMBAMBA_INDEX.out.bam
+        .map { bam -> // Strip read groups and splits.
+
+            def meta = bam[0]
+            def meta_bam = [:]
+            meta.keySet().each { key ->
+
+                if (key == 'read_group' || key == 'split') {
+                    return
+                }
+
+                meta_bam[key] = meta[key]
+            }
+
+            [meta_bam, [meta_bam, bam[1], bam[2]]]
+        }
+        .groupTuple()
+        .map { key_lane_bams ->
+            def lane_bams = key_lane_bams[1]
+            def meta_bam = lane_bams[0][0]
+            def bams = []
+            def bais = []
+            lane_bams.each { lane_bam ->
+
+                bams.add(lane_bam[1])
+                bais.add(lane_bam[2])
+            }
+
+            [meta_bam, bams, bais]
+        }
+
+    // Next we prepare the channel for MarkDups inputs that started off as aligned BAMs.
+    // channel: [ meta, bams, bais ] (One sample per meta record).
+    ch_input_markdups_inputs = ch_meta_samples_sorted.runnable_markdups.map { meta ->
+
+        def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) }
+        if (sample_key === null) {
+            log.error "No DNA sample found"
+            System.exit(1)
+        }
+
+        def sample_id = meta[sample_key]['sample_id']
+        def bam = meta[sample_key][Constants.FileType.BAM_MARKDUPS]
+        def bai = meta[sample_key][Constants.FileType.BAI_MARKDUPS]
+
+        def meta_bam = meta.getClass().newInstance(meta);
+        meta_bam['sample_key'] = sample_key
+        meta_bam['sample_id'] = sample_id
+
+        [meta_bam, [bam], [bai]]
+    }
+
+    // Merging the two markdups input channels.
+    // channel: [ meta_bam, bams, bais ]
+    ch_markdups_inputs = Channel.empty()
+        .mix(
+            ch_fastq_markdups_inputs,
+            ch_input_markdups_inputs,
+        )
+
+    // channel: [ meta_bam, bam, bai ]
+    MARKDUPS(
+        ch_markdups_inputs,
+        genome_fasta,
+        genome_fai,
+        genome_dict,
+        unmap_regions,
+    )
+
+    // Fill the sample information back in.
+    // channel: [ meta ] (One sample per meta record).
+    ch_bam_samples = MARKDUPS.out.bam.map { bam ->
+
+        def meta_bam = bam[0]
+
+        // TODO(MC): Safer to copy and delete unneeded fields.
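+        // e.g. a sketch of that safer approach (the form adopted in PATCH 05 below):
+        //   def meta = meta_bam.getClass().newInstance(meta_bam)
+        //   meta.remove('sample_key')
+        //   meta.remove('sample_id')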
+ def meta = [ + group_id: meta_bam.group_id, + subject_id: meta_bam.subject_id, + ] + + sample = [sample_id: meta_bam.sample_id] + sample[Constants.FileType.BAM] = bam[1] + sample[Constants.FileType.BAI] = bam[2] + meta[meta.sample_key] = sample + + meta + } + + // Merge back in skipped meta entries. + // channel: [ meta ] (One sample per meta record). + ch_all_samples = Channel.empty() + .mix( + ch_bam_samples, + ch_meta_samples_sorted.skip, + ) + + // TODO(MC): Get rid of blocking. + // Undo split of meta records. + // channel: [ meta_bam ] + ch_outputs = ch_all_samples + .map { sample -> [sample.group_id, sample]} + .groupTuple() + .map { key_samples -> + + def samples = key_samples[1] + def merged_sample = [:] + samples.each { sample -> + + sample.each { key, value -> merged_sample[key] = value } + } + + merged_sample + } + + emit: + meta_bam = ch_outputs + // TODO[MC]: Channel version outputs. +} diff --git a/temp/genomes_GRCh37_hmf.config b/temp/genomes_GRCh37_hmf.config new file mode 100644 index 00000000..480613a1 --- /dev/null +++ b/temp/genomes_GRCh37_hmf.config @@ -0,0 +1,17 @@ +params { + genomes { + 'GRCh37_hmf' { + fasta = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/Homo_sapiens.GRCh37.GATK.illumina.fasta" + fai = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.fai" + dict = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.dict" + bwa_index = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/bwa_index/0.7.17-r1188" + bwa_index_image = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/bwa_index_image/0.7.17-r1188/Homo_sapiens.GRCh37.GATK.illumina.fasta.img" + gridss_index = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/gridss_index/2.13.2/Homo_sapiens.GRCh37.GATK.illumina.fasta.gridsscache" + } + } + + ref_data_hmf_data_path = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/hmftools/5.34_37--0" + ref_data_virusbreakenddb_path = "/Users/matthewcooper/projects/oncoanalyser/virusbreakend/virusbreakenddb_20210401" + refdata_unmap_regions = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv" + max_fastq_records = 10000000 +} diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 2951bbba..3e9c0b1b 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -2,6 +2,8 @@ import Constants import Processes import Utils +// TODO[MC]: Alignment. + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 5e3476d9..24ea60c0 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -15,6 +15,7 @@ inputs = Utils.parseInput(params.input, workflow.stubRun, log) run_config = WorkflowMain.getRunConfig(params, inputs, log) // Validate inputs +// TODO(MC): Reexamine validation in light of fastq/bam markdups. Utils.validateInput(inputs, run_config, log) // Check input path parameters to see if they exist @@ -67,6 +68,7 @@ linx_gene_id_file = params.linx_gene_id_file ? 
file(params.linx_gene_id_file) : // // SUBWORKFLOWS // +include { ALIGNMENT } from '../subworkflows/local/alignment' include { AMBER_PROFILING } from '../subworkflows/local/amber_profiling' include { BAMTOOLS_METRICS } from '../subworkflows/local/bamtools_metrics' include { CHORD_PREDICTION } from '../subworkflows/local/chord_prediction' @@ -117,7 +119,8 @@ workflow WGTS { // Create input channel from parsed CSV // channel: [ meta ] - ch_inputs = Channel.fromList(inputs) + // TODO[MC]: Rename this back to original, and swap name out for current ch_inputs. + ch_inputs0 = Channel.fromList(inputs) // Set up reference data, assign more human readable variables PREPARE_REFERENCE( @@ -126,63 +129,79 @@ workflow WGTS { ref_data = PREPARE_REFERENCE.out hmf_data = PREPARE_REFERENCE.out.hmf_data + // TODO[MC]: Skipping alignment, only running up to alignment? + ALIGNMENT( + ch_inputs0, + ref_data.genome_fasta, + ref_data.genome_fai, + ref_data.genome_dict, + ref_data.genome_bwa_index, + file(params.refdata_unmap_regions), + 4 * params.max_fastq_records, + ) + + ch_inputs = ALIGNMENT.out.meta_bam + // Set GRIDSS config gridss_config = params.containsKey('gridss_config') ? file(params.gridss_config) : hmf_data.gridss_config - // - // SUBWORKFLOW: Align reads - // - // channel: [ meta, bam_dna ] - ch_dna_alignment_out = Channel.empty() - // channel: [ meta, bam_rna ] - ch_rna_alignment_out = Channel.empty() - // TODO(SW): set up correctly - if (true | run_config.stages.alignment) { + // // + // // SUBWORKFLOW: Align reads + // // + // // channel: [ meta, bam_dna ] + // ch_dna_alignment_out = Channel.empty() + // // channel: [ meta, bam_rna ] + // ch_rna_alignment_out = Channel.empty() + // // TODO(MC): set up correctly + // // if (true | run_config.stages.alignment) { - READ_ALIGNMENT( - ch_inputs, - // alignment reference files - ) + // // READ_ALIGNMENT( + // // ch_inputs, + // // // alignment reference files + // // ) - ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + // // ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) - ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) - ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) + // // ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) + // // ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) - } else { + // // } else { - ch_dna_alignment_out = ch_inputs.map { meta -> [meta, []] } - ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + // // ch_dna_alignment_out = ch_inputs.map { meta -> [meta, []] } + // // ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } - } + // // } - // - // SUBWORKFLOW: Process read alignments - // - // channel: [ meta, bam_dna ] - ch_dna_processed_out = Channel.empty() - // channel: [ meta, bam_rna ] - ch_rna_processed_out = Channel.empty() - // TODO(SW): set up correctly - if (true | run_config.stages.markdups) { + // ch_dna_alignment_out = ch_inputs.map { meta -> [meta, []] } + // ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } - READ_PROCESSING( - ch_inputs, - ch_dna_alignment_out, - ch_rna_alignment_out, - ) + // // + // // SUBWORKFLOW: Process read alignments + // // + // // channel: [ meta, bam_dna ] + // ch_dna_processed_out = Channel.empty() + // // channel: [ meta, bam_rna ] + // ch_rna_processed_out = Channel.empty() + // // TODO(SW): set up correctly + // if (true | run_config.stages.markdups) { - ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) + // READ_PROCESSING( + 
// ch_inputs, + // ch_dna_alignment_out, + // ch_rna_alignment_out, + // ) - ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) - ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) + // ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) - } else { + // ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) + // ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) - ch_dna_processed_out = ch_inputs.map { meta -> [meta, []] } - ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + // } else { - } + // ch_dna_processed_out = ch_inputs.map { meta -> [meta, []] } + // ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + + // } // TODO(SW): adjust downstream selection of input BAM From f69caf9e53a5099da7b46a339aee1e61f95ba197 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 12 Feb 2024 10:39:03 +1100 Subject: [PATCH 03/86] Simplified condition on whether fastp is run in alignment subworkflow. --- subworkflows/local/alignment.nf | 180 ++++++++++++-------------------- 1 file changed, 69 insertions(+), 111 deletions(-) diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf index 4b34ac29..dce56990 100644 --- a/subworkflows/local/alignment.nf +++ b/subworkflows/local/alignment.nf @@ -26,66 +26,71 @@ workflow ALIGNMENT { skip: true } - // TODO(MC): Simplify this branch. - if (max_fastq_lines > 0) { - // Split fastq files using fastp. - - // Create fastp process input channel. - // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_fastp_inputs = ch_meta_samples_sorted.runnable_fastq - .flatMap { meta -> - - def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } - if (sample_key === null) { - log.error "No DNA sample found" - System.exit(1) - } - - def sample_id = meta[sample_key]['sample_id'] - def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') + // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] + ch_fastq_pairs = ch_meta_samples_sorted.runnable_fastq + .flatMap { meta -> + + def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } + if (sample_key === null) { + log.error "No DNA sample found" + System.exit(1) + } - // TODO(MC): Validate fastq_files. + def sample_id = meta[sample_key]['sample_id'] + def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') - def meta_fastq_common = [:] - meta.each { key, value -> + // TODO(MC): Validate fastq_files. 
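+            // A minimal validation sketch (hypothetical, following the
+            // log.error/System.exit pattern used above): paths must come in fwd/rev pairs.
+            // if (fastq_files.size() % 2 != 0) {
+            //     log.error "expected an even number of FASTQ paths for ${sample_id}"
+            //     System.exit(1)
+            // }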
+ def meta_fastq_common = [:] + meta.each { key, value -> - if (key === sample_key) { - return - } - meta_fastq_common[key] = meta[key] + if (key === sample_key) { + return } - meta_fastq_common['sample_key'] = sample_key - meta_fastq_common['sample_id'] = sample_id - def fastq_pairs = [] - for (i = 0; i < fastq_files.size(); i += 2) { - def reads_fwd = fastq_files[i] - def reads_rev = fastq_files[i + 1] + meta_fastq_common[key] = meta[key] + } + meta_fastq_common['sample_key'] = sample_key + meta_fastq_common['sample_id'] = sample_id - def meta_fastq = meta_fastq_common.getClass().newInstance(meta_fastq_common) - meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) + def fastq_pairs = [] + for (i = 0; i < fastq_files.size(); i += 2) { + def reads_fwd = fastq_files[i] + def reads_rev = fastq_files[i + 1] - fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) - } + def meta_fastq = meta_fastq_common.getClass().newInstance(meta_fastq_common) + meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) - fastq_pairs + fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) } - FASTP(ch_fastp_inputs) - - // TODO(MC): See WISP implementation. - // Create inputs for bwa mem. - // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_bwa_mem_inputs = FASTP.out.fastq.flatMap { fastq -> + fastq_pairs + } - def meta = fastq[0] - def fwd_reads = fastq[1] - def rev_reads = fastq[2] + // Split fastq files using fastp. + // channel: [ meta_fastq, reads_fwd_fastqs, reads_rev_fastqs ] + ch_split_fastq_pairs = Channel.empty() + if (max_fastq_lines > 0) { + FASTP(ch_fastq_pairs) + ch_split_fastq_pairs = FASTP.out.fastq + } else { + ch_split_fastq_pairs = ch_fastq_pairs.map { fastq_pair -> [fastq_pair[0], [fastq_pair[1]], [fastq_pair[2]]] } + } - // Pair up the reads. - def read_pairs = [:] + // TODO(MC): See WISP implementation. + // Create inputs for bwa mem. + // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] + ch_bwa_mem_inputs = ch_split_fastq_pairs.flatMap { fastq -> + def meta = fastq[0] + def fwd_reads = fastq[1] + def rev_reads = fastq[2] + + // Pair up the reads. + def read_pairs = [:] + if (fwd_reads.size() == 1) { + read_pairs[""] = ["000", fwd_reads[0], rev_reads[0]] + } else { fwd_reads.each { fastq_path -> def base_name = fastq_path.getFileName().toString() @@ -108,77 +113,30 @@ workflow ALIGNMENT { assert read_pairs.containsKey(key) read_pairs[key].add(fastq_path) } - - def fastqs = [] - read_pairs.values().each { split_fastq_pair -> - - meta_fastq = meta.getClass().newInstance(meta) - meta_fastq['split'] = split_fastq_pair[0] - - fastqs.add([meta_fastq, split_fastq_pair[1], split_fastq_pair[2]]) - } - - fastqs } - } else { - - // Skip splitting fastq files using fastp. - - // Create inputs for bwa mem. - // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_bwa_mem_inputs = ch_meta_samples_sorted.runnable_fastq - .flatMap { meta -> - - def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } - if (sample_key === null) { - log.error "No DNA sample found" - System.exit(1) - } - - def sample_id = meta[sample_key]['sample_id'] - def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') - - // TODO(MC): Validate fastq_files. 
- - def meta_fastq_common = [:] - meta.each { key, value -> + def fastqs = [] + read_pairs.values().each { split_fastq_pair -> - if (key === sample_key) { - return - } + meta_fastq = meta.getClass().newInstance(meta) + meta_fastq['split'] = split_fastq_pair[0] - meta_fastq_common[key] = meta[key] - } - meta_fastq_common['sample_key'] = sample_key - meta_fastq_common['sample_id'] = sample_id - - def fastq_pairs = [] - for (i = 0; i < fastq_files.size(); i += 2) { - def reads_fwd = fastq_files[i] - def reads_rev = fastq_files[i + 1] - - def meta_fastq = meta_fastq_common.getClass().newInstance(meta_fastq_common) - meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) - meta_fastq['split'] = '000' - - fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) - } + fastqs.add([meta_fastq, split_fastq_pair[1], split_fastq_pair[2]]) + } - fastq_pairs - } + fastqs } // channel: [ meta_fastq, bam ] BWA_MEM( - ch_bwa_mem_inputs, - genome_fasta, - genome_bwa_index, + ch_bwa_mem_inputs, + genome_fasta, + genome_bwa_index, ) // channel: [ meta_fastq, bam, bai ] SAMBAMBA_INDEX( - BWA_MEM.out.bam, + BWA_MEM.out.bam, ) // Prepare input to markdups process. @@ -208,8 +166,8 @@ workflow ALIGNMENT { def bais = [] lane_bams.each { lane_bam -> - bams.add(lane_bam[1]) - bais.add(lane_bam[2]) + bams.add(lane_bam[1]) + bais.add(lane_bam[2]) } [meta_bam, bams, bais] @@ -221,8 +179,8 @@ workflow ALIGNMENT { def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } if (sample_key === null) { - log.error "No DNA sample found" - System.exit(1) + log.error "No DNA sample found" + System.exit(1) } def sample_id = meta[sample_key]['sample_id'] @@ -261,8 +219,8 @@ workflow ALIGNMENT { // TODO(MC): Safer to copy and delete unneeded fields. def meta = [ - group_id: meta_bam.group_id, - subject_id: meta_bam.subject_id, + group_id: meta_bam.group_id, + subject_id: meta_bam.subject_id, ] sample = [sample_id: meta_bam.sample_id] @@ -299,7 +257,7 @@ workflow ALIGNMENT { merged_sample } - emit: + emit: meta_bam = ch_outputs // TODO[MC]: Channel version outputs. } From 1ef8abd29822693250b89a48d8703a9be845224b Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 12 Feb 2024 11:16:29 +1100 Subject: [PATCH 04/86] Get rid of blocking when merging individual sample records back into a single group record. --- lib/Utils.groovy | 4 ++++ subworkflows/local/alignment.nf | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/lib/Utils.groovy b/lib/Utils.groovy index a3adb2cb..2ca80db2 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -477,6 +477,10 @@ class Utils { return meta_samples } + static public groupSampleCounts(meta_group) { + return splitGroupIntoSamples(meta_group).size() + } + static public readGroupFromFastqPath(fastq_path) { def base_name = fastq_path.split('/')[-1] def pattern = /^(.+)_\d+\.fastq$/ diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf index dce56990..750a06d7 100644 --- a/subworkflows/local/alignment.nf +++ b/subworkflows/local/alignment.nf @@ -14,6 +14,9 @@ workflow ALIGNMENT { max_fastq_lines main: + // channel: [ group_id, sample_count ] + ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } + // channel: [ meta ] (One sample per record). 
ch_meta_samples = ch_inputs.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } @@ -78,7 +81,6 @@ workflow ALIGNMENT { ch_split_fastq_pairs = ch_fastq_pairs.map { fastq_pair -> [fastq_pair[0], [fastq_pair[1]], [fastq_pair[2]]] } } - // TODO(MC): See WISP implementation. // Create inputs for bwa mem. // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] ch_bwa_mem_inputs = ch_split_fastq_pairs.flatMap { fastq -> @@ -226,7 +228,7 @@ workflow ALIGNMENT { sample = [sample_id: meta_bam.sample_id] sample[Constants.FileType.BAM] = bam[1] sample[Constants.FileType.BAI] = bam[2] - meta[meta.sample_key] = sample + meta[meta_bam.sample_key] = sample meta } @@ -239,11 +241,19 @@ workflow ALIGNMENT { ch_meta_samples_sorted.skip, ) - // TODO(MC): Get rid of blocking. - // Undo split of meta records. + // Merge individual sample records back into group records without blocking for the whole channel to be processed. // channel: [ meta_bam ] - ch_outputs = ch_all_samples - .map { sample -> [sample.group_id, sample]} + ch_outputs = ch_sample_counts + .cross( + ch_all_samples.map { meta -> [meta.group_id, meta] } + ) + .map { count_tuple, meta_tuple -> + def group_id = count_tuple[0] + def count = count_tuple[1] + def meta = meta_tuple[1] + + tuple(groupKey(group_id, count), meta) + } .groupTuple() .map { key_samples -> From b2c2ad7690841cba6209dc07ab6ae7b7dba255f6 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 12 Feb 2024 11:29:22 +1100 Subject: [PATCH 05/86] Simple improvement to the alignment subworkflow. --- subworkflows/local/alignment.nf | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf index 750a06d7..27e64942 100644 --- a/subworkflows/local/alignment.nf +++ b/subworkflows/local/alignment.nf @@ -42,8 +42,6 @@ workflow ALIGNMENT { def sample_id = meta[sample_key]['sample_id'] def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') - // TODO(MC): Validate fastq_files. - def meta_fastq_common = [:] meta.each { key, value -> @@ -219,13 +217,11 @@ workflow ALIGNMENT { def meta_bam = bam[0] - // TODO(MC): Safer to copy and delete unneeded fields. - def meta = [ - group_id: meta_bam.group_id, - subject_id: meta_bam.subject_id, - ] + def meta = meta_bam.getClass().newInstance(meta_bam) + meta.remove('sample_key') + meta.remove('sample_id') - sample = [sample_id: meta_bam.sample_id] + def sample = [sample_id: meta_bam.sample_id] sample[Constants.FileType.BAM] = bam[1] sample[Constants.FileType.BAI] = bam[2] meta[meta_bam.sample_key] = sample From f3778619d144e4deea4df5fcb04d355cdac58b9c Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 13 Feb 2024 12:37:11 +1100 Subject: [PATCH 06/86] Merge alignment and markdups logic into Stephen's stubs. --- lib/Utils.groovy | 5 + modules/local/fastp/main.nf | 3 +- subworkflows/local/alignment.nf | 269 ---------------------- subworkflows/local/read_alignment.nf | 312 +++++++++++++++++++++++--- subworkflows/local/read_processing.nf | 148 ++++++++++-- workflows/wgts.nf | 109 +++++---- 6 files changed, 466 insertions(+), 380 deletions(-) delete mode 100644 subworkflows/local/alignment.nf diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 2ca80db2..58c412ca 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -453,6 +453,11 @@ class Utils { } } + public static shallow_copy(obj) { + + return obj.getClass().newInstance(obj) + } + // Alignment utils. 
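+    // e.g. (illustrative values): a group meta such as
+    //   [group_id: 'G1', [TUMOR, DNA]: [sample_id: 'T1', ...], [NORMAL, DNA]: [sample_id: 'N1', ...]]
+    // splits into two single-sample metas, each retaining the shared group-level entries.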
static public splitGroupIntoSamples(meta_group) { def sample_entries = [:] diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 8deec567..35d3714b 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -7,6 +7,7 @@ process FASTP { input: tuple val(meta), path(reads_fwd), path(reads_rev) + val(max_fastq_records) output: tuple val(meta), path('*_R1.fastp.fastq'), path('*_R2.fastp.fastq'), emit: fastq @@ -25,7 +26,7 @@ process FASTP { --in1 ${reads_fwd} \\ --in2 ${reads_rev} \\ --disable_adapter_trimming \\ - --split_by_lines 40000000 \\ + --split_by_lines ${4 * max_fastq_records} \\ --out1 ${meta.sample_id}_${meta.read_group}_R1.fastp.fastq \\ --out2 ${meta.sample_id}_${meta.read_group}_R2.fastp.fastq """ diff --git a/subworkflows/local/alignment.nf b/subworkflows/local/alignment.nf deleted file mode 100644 index 27e64942..00000000 --- a/subworkflows/local/alignment.nf +++ /dev/null @@ -1,269 +0,0 @@ -include { BWA_MEM } from '../../modules/local/bwa/mem/main' -include { MARKDUPS } from '../../modules/local/markdups/main' -include { FASTP } from '../../modules/local/fastp/main' -include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' - -workflow ALIGNMENT { - take: - ch_inputs // channel: [ meta ] - genome_fasta - genome_fai - genome_dict - genome_bwa_index - unmap_regions - max_fastq_lines - - main: - // channel: [ group_id, sample_count ] - ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } - - // channel: [ meta ] (One sample per record). - ch_meta_samples = ch_inputs.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } - - // Sort inputs - // channel: [ meta ] (One sample per record). - ch_meta_samples_sorted = ch_meta_samples - .branch { meta -> - runnable_fastq: Utils.hasDnaFastq(meta) - runnable_markdups: Utils.hasDnaMarkdupsBam(meta) - skip: true - } - - // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_fastq_pairs = ch_meta_samples_sorted.runnable_fastq - .flatMap { meta -> - - def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } - if (sample_key === null) { - log.error "No DNA sample found" - System.exit(1) - } - - def sample_id = meta[sample_key]['sample_id'] - def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') - - def meta_fastq_common = [:] - meta.each { key, value -> - - - if (key === sample_key) { - return - } - - meta_fastq_common[key] = meta[key] - } - meta_fastq_common['sample_key'] = sample_key - meta_fastq_common['sample_id'] = sample_id - - def fastq_pairs = [] - for (i = 0; i < fastq_files.size(); i += 2) { - def reads_fwd = fastq_files[i] - def reads_rev = fastq_files[i + 1] - - def meta_fastq = meta_fastq_common.getClass().newInstance(meta_fastq_common) - meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) - - fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) - } - - fastq_pairs - } - - // Split fastq files using fastp. - // channel: [ meta_fastq, reads_fwd_fastqs, reads_rev_fastqs ] - ch_split_fastq_pairs = Channel.empty() - if (max_fastq_lines > 0) { - FASTP(ch_fastq_pairs) - ch_split_fastq_pairs = FASTP.out.fastq - } else { - ch_split_fastq_pairs = ch_fastq_pairs.map { fastq_pair -> [fastq_pair[0], [fastq_pair[1]], [fastq_pair[2]]] } - } - - // Create inputs for bwa mem. 
- // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_bwa_mem_inputs = ch_split_fastq_pairs.flatMap { fastq -> - def meta = fastq[0] - def fwd_reads = fastq[1] - def rev_reads = fastq[2] - - // Pair up the reads. - def read_pairs = [:] - if (fwd_reads.size() == 1) { - read_pairs[""] = ["000", fwd_reads[0], rev_reads[0]] - } else { - fwd_reads.each { fastq_path -> - - def base_name = fastq_path.getFileName().toString() - def pattern = /^(\d+)\.(.+)_R[12]\.fastp\.fastq$/ - def matcher = base_name =~ pattern - assert matcher.find() - def split = matcher[0][1] - def key = "${split}.${matcher[0][2]}" - assert !read_pairs.containsKey(key) - read_pairs[key] = [split, fastq_path] - } - - rev_reads.each { fastq_path -> - - def base_name = fastq_path.getFileName().toString() - def pattern = /^(.+)_R[12]\.fastp\.fastq$/ - def matcher = base_name =~ pattern - assert matcher.find() - def key = matcher[0][1] - assert read_pairs.containsKey(key) - read_pairs[key].add(fastq_path) - } - } - - def fastqs = [] - read_pairs.values().each { split_fastq_pair -> - - meta_fastq = meta.getClass().newInstance(meta) - meta_fastq['split'] = split_fastq_pair[0] - - fastqs.add([meta_fastq, split_fastq_pair[1], split_fastq_pair[2]]) - } - - fastqs - } - - // channel: [ meta_fastq, bam ] - BWA_MEM( - ch_bwa_mem_inputs, - genome_fasta, - genome_bwa_index, - ) - - // channel: [ meta_fastq, bam, bai ] - SAMBAMBA_INDEX( - BWA_MEM.out.bam, - ) - - // Prepare input to markdups process. - // First we prepare a channel of inputs that have gone through alignment. - // channel: [ meta_bam, bams, bais ] - ch_fastq_markdups_inputs = SAMBAMBA_INDEX.out.bam - .map { bam -> // Strip read groups and splits. - - def meta = bam[0] - def meta_bam = [:] - meta.keySet().each { key -> - - if (key == 'read_group' || key == 'split') { - return - } - - meta_bam[key] = meta[key] - } - - [meta_bam, [meta_bam, bam[1], bam[2]]] - } - .groupTuple() - .map { key_lane_bams -> - def lane_bams = key_lane_bams[1] - def meta_bam = lane_bams[0][0] - def bams = [] - def bais = [] - lane_bams.each { lane_bam -> - - bams.add(lane_bam[1]) - bais.add(lane_bam[2]) - } - - [meta_bam, bams, bais] - } - - // Next we prepare channel for markdups input that started of as aligned bams. - // channel: [ meta, bams, bais ] (One sample per meta record). - ch_input_markdups_inputs = ch_meta_samples_sorted.runnable_markdups.map { meta -> - - def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } - if (sample_key === null) { - log.error "No DNA sample found" - System.exit(1) - } - - def sample_id = meta[sample_key]['sample_id'] - def bam = meta[sample_key][Constants.FileType.BAM_MARKDUPS] - def bai = meta[sample_key][Constants.FileType.BAI_MARKDUPS] - - def meta_bam = meta.getClass().newInstance(meta); - meta_bam['sample_key'] = sample_key - meta_bam['sample_id'] = sample_id - - [meta_bam, [bam], [bai]] - } - - // Merging the two markdups input channels. - // channel: [ meta_bam, bams, bais ] - ch_markdups_inputs = Channel.empty() - .mix( - ch_fastq_markdups_inputs, - ch_input_markdups_inputs, - ) - - // channel: [ meta_bam, bam, bai ] - MARKDUPS( - ch_markdups_inputs, - genome_fasta, - genome_fai, - genome_dict, - unmap_regions, - ) - - // Fill the sample information back in. - // channel: [ meta ] (One sample per meta record). 
- ch_bam_samples = MARKDUPS.out.bam.map { bam -> - - def meta_bam = bam[0] - - def meta = meta_bam.getClass().newInstance(meta_bam) - meta.remove('sample_key') - meta.remove('sample_id') - - def sample = [sample_id: meta_bam.sample_id] - sample[Constants.FileType.BAM] = bam[1] - sample[Constants.FileType.BAI] = bam[2] - meta[meta_bam.sample_key] = sample - - meta - } - - // Merge back in skipped meta entries. - // channel: [ meta ] (One sample per meta record). - ch_all_samples = Channel.empty() - .mix( - ch_bam_samples, - ch_meta_samples_sorted.skip, - ) - - // Merge individual sample records back into group records without blocking for the whole channel to be processed. - // channel: [ meta_bam ] - ch_outputs = ch_sample_counts - .cross( - ch_all_samples.map { meta -> [meta.group_id, meta] } - ) - .map { count_tuple, meta_tuple -> - def group_id = count_tuple[0] - def count = count_tuple[1] - def meta = meta_tuple[1] - - tuple(groupKey(group_id, count), meta) - } - .groupTuple() - .map { key_samples -> - - def samples = key_samples[1] - def merged_sample = [:] - samples.each { sample -> - - sample.each { key, value -> merged_sample[key] = value } - } - - merged_sample - } - - emit: - meta_bam = ch_outputs - // TODO[MC]: Channel version outputs. -} diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index a13025fa..acc92a4d 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -1,39 +1,297 @@ -include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' +include { BWA_MEM } from '../../modules/local/bwa/mem/main' +include { FASTP } from '../../modules/local/fastp/main' +include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' include { STAR } from '../../modules/local/star/main' workflow READ_ALIGNMENT { take: - // Sample data - ch_inputs // channel: [mandatory] [ meta ] + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + genome_fasta + genome_bwa_index + max_fastq_records main: - // Channel for version.yml files - // channel: [ versions.yml ] - ch_versions = Channel.empty() - - // STAR - // TODO(SW): implement inputs - ch_star_inputs = Channel.of([[id: 'foo'], []]) - STAR( - ch_star_inputs, - // TODO(SW): include reference files + // TODO(MC): Versions. + // // Channel for version.yml files + // // channel: [ versions.yml ] + // ch_versions = Channel.empty() + + // channel: [ group_id, sample_count ] + ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } + + // channel: [ meta ] (One sample per record). + ch_meta_samples = ch_inputs.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } + + // Sort inputs + // channel: [ meta ] (One sample per record). 
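+    // NB: branch routes each record to the first criterion that evaluates true,
+    // so anything without DNA FASTQ input falls through to 'skip'.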
+ ch_meta_samples_sorted = ch_meta_samples + .branch { meta -> + runnable_fastq: Utils.hasDnaFastq(meta) + skip: true + } + + // STAR + // TODO(SW): implement inputs + // ch_star_inputs = Channel.of([[id: 'foo'], []]) + // STAR( + // ch_star_inputs, + // // TODO(SW): include reference files + // ) + // TODO(SW): implement outputs + ch_star_outputs = Channel.empty() + + // BWA MEM + // channel: [ sample_key, fastq_pair_count ] + ch_sample_fastq_pair_count = ch_meta_samples_sorted.runnable_fastq.map { meta_sample -> + + def sample_key = Utils.shallow_copy(meta_sample) + def fastq_pair_count = 0 + meta_sample.each { key, value -> + + if ((value instanceof java.util.Map) && value.containsKey('sample_id')) { + sample_key['sample_key'] = key + sample_key['sample_id'] = value.sample_id + sample_key.remove(key) + + fastq_pair_count = value[Constants.FileType.FASTQ].tokenize(';').size() / 2 + } + } + + [sample_key, fastq_pair_count] + } + + // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] + ch_fastq_pairs = ch_meta_samples_sorted.runnable_fastq + .flatMap { meta -> + + def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } + if (sample_key === null) { + log.error "No DNA sample found" + System.exit(1) + } + + def sample_id = meta[sample_key]['sample_id'] + def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') + + def meta_fastq_common = [:] + meta.each { key, value -> + + + if (key === sample_key) { + return + } + + meta_fastq_common[key] = meta[key] + } + meta_fastq_common['sample_key'] = sample_key + meta_fastq_common['sample_id'] = sample_id + + def fastq_pairs = [] + for (i = 0; i < fastq_files.size(); i += 2) { + def reads_fwd = fastq_files[i] + def reads_rev = fastq_files[i + 1] + + def meta_fastq = Utils.shallow_copy(meta_fastq_common) + meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) + + fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) + } + + fastq_pairs + } + + // Split fastq files using fastp. + // channel: [ meta_fastq, reads_fwd_fastqs, reads_rev_fastqs ] + ch_split_fastq_pairs = Channel.empty() + if (max_fastq_records > 0) { + FASTP( + ch_fastq_pairs, + max_fastq_records, ) - // TODO(SW): implement outputs - ch_star_outputs = Channel.empty() - - // BWA MEM2 - // TODO(SW): implement inputs - ch_bwa_inputs = Channel.of([[id: 'foo'], []]) - BWA_MEM2( - ch_bwa_inputs, - // TODO(SW): include reference files + + ch_split_fastq_pairs = FASTP.out.fastq + } else { + ch_split_fastq_pairs = ch_fastq_pairs.map { fastq_pair -> [fastq_pair[0], [fastq_pair[1]], [fastq_pair[2]]] } + } + + // channel: [ sample_key, fastq_pair_split_count ] + ch_sample_fastq_pair_split_count = ch_sample_fastq_pair_count + .cross( + ch_split_fastq_pairs.map { split_fastq_pairs -> + + def meta_sample = split_fastq_pairs[0] + def sample_key = Utils.shallow_copy(meta_sample) + sample_key.remove('read_group') + sample_key.remove(meta_sample.sample_key) + + [sample_key, split_fastq_pairs[1].size()] + } ) - // TODO(SW): implement outputs - ch_bwa_outputs = Channel.empty() + .map { count_tuple, split_count_tuple -> + def sample_key = count_tuple[0] + def count = count_tuple[1].intValue() + def split_count = split_count_tuple[1] + + tuple(groupKey(sample_key, count), sample_key, split_count) + } + .groupTuple() + .map { group_key, sample_keys, split_counts -> + + [sample_keys[0], split_counts.sum()] + } + + // Create inputs for bwa mem. 
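// NOTE(editor): the pairing logic below leans on fastp's split-output naming:
// with splitting enabled, fastp prepends a numeric chunk index to each output
// file (the FASTP stub in this series models it as, e.g.,
// 001.SAMPLE_RG_R1.fastp.fastq), and --split_by_lines is given
// 4 * max_fastq_records because each FASTQ record spans four lines. Forward
// reads are keyed by "<split>.<stem>" and the matching reverse read is then
// attached under the same key. Sketch of the forward-read match on a
// hypothetical file name:
//
//     def matcher = '001.SAMPLEX_L001_R1.fastp.fastq' =~ /^(\d+)\.(.+)_R[12]\.fastp\.fastq$/
//     assert matcher.find()
//     assert matcher[0][1] == '001'           // split index
//     assert matcher[0][2] == 'SAMPLEX_L001'  // stem shared with the R2 file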
+ // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] + ch_bwa_mem_inputs = ch_split_fastq_pairs.flatMap { fastq -> + def meta = fastq[0] + def fwd_reads = fastq[1] + def rev_reads = fastq[2] + + // Pair up the reads. + def read_pairs = [:] + if (fwd_reads.size() == 1) { + read_pairs[""] = ["000", fwd_reads[0], rev_reads[0]] + } else { + fwd_reads.each { fastq_path -> + + def base_name = fastq_path.getFileName().toString() + def pattern = /^(\d+)\.(.+)_R[12]\.fastp\.fastq$/ + def matcher = base_name =~ pattern + assert matcher.find() + def split = matcher[0][1] + def key = "${split}.${matcher[0][2]}" + assert !read_pairs.containsKey(key) + read_pairs[key] = [split, fastq_path] + } + + rev_reads.each { fastq_path -> + + def base_name = fastq_path.getFileName().toString() + def pattern = /^(.+)_R[12]\.fastp\.fastq$/ + def matcher = base_name =~ pattern + assert matcher.find() + def key = matcher[0][1] + assert read_pairs.containsKey(key) + read_pairs[key].add(fastq_path) + } + } + + def fastqs = [] + read_pairs.values().each { split_fastq_pair -> + + meta_fastq = Utils.shallow_copy(meta) + meta_fastq['split'] = split_fastq_pair[0] + + fastqs.add([meta_fastq, split_fastq_pair[1], split_fastq_pair[2]]) + } + + fastqs + } + + // channel: [ meta_fastq, bam ] + BWA_MEM( + ch_bwa_mem_inputs, + genome_fasta, + genome_bwa_index, + ) + + // channel: [ meta_fastq, bam, bai ] + SAMBAMBA_INDEX( + BWA_MEM.out.bam, + ) + + // Merge all bam records for a single sample into a single record. + // channel: [ meta ] (One sample per meta record). + ch_merged_bam_samples = ch_sample_fastq_pair_split_count + .cross( + SAMBAMBA_INDEX.out.bam + .map { bam -> + + def meta_bam = bam[0] + def sample_key = Utils.shallow_copy(meta_bam) + sample_key.remove(meta_bam.sample_key) + sample_key.remove('read_group') + sample_key.remove('split') + + [sample_key, bam] + } + ) + .map { count_tuple, bam_tuple -> + + def sample_key = count_tuple[0] + def count = count_tuple[1] + def bam = bam_tuple[1] + + tuple(groupKey(sample_key, count), bam) + } + .groupTuple() + .map { group_key, bams -> + + def first_meta_bam = bams[0][0] + def sample_key = first_meta_bam.sample_key + + def bam_files = [] + def bai_files = [] + + def meta_bam = Utils.shallow_copy(first_meta_bam) + meta_bam.remove(sample_key) + meta_bam.remove('sample_key') + meta_bam.remove('sample_id') + meta_bam.remove('read_group') + meta_bam.remove('split') + + meta_bam[sample_key] = [sample_id: first_meta_bam.sample_id] + meta_bam[sample_key][Constants.FileType.BAM_MARKDUPS] = bam_files + meta_bam[sample_key][Constants.FileType.BAI_MARKDUPS] = bai_files + + bams.each { bam -> + bam_files.add(bam[1]) + bai_files.add(bam[2]) + } + + meta_bam + } + + // Merge back in skipped meta entries. + // channel: [ meta ] (One sample per meta record). + ch_all_samples = Channel.empty() + .mix( + ch_merged_bam_samples, + ch_meta_samples_sorted.skip, + ) + + // Merge individual sample records back into group records without blocking for the whole channel to be processed.
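// NOTE(editor): `groupKey(key, n)` below attaches the expected group size to
// the grouping key, which lets `groupTuple` emit each group as soon as its n-th
// element arrives instead of waiting for the upstream channel to complete; that
// is what "without blocking" refers to. Minimal sketch with hypothetical values:
//
//     Channel.of(['grp1', 'a'], ['grp1', 'b'])
//         .map { id, value -> tuple(groupKey(id, 2), value) }
//         .groupTuple()
//         .view()  // [grp1, [a, b]], emitted as soon as both elements are seen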
+ // channel: [ meta_bam ] + ch_bwa_outputs = ch_sample_counts + .cross( + ch_all_samples.map { meta -> [meta.group_id, meta] } + ) + .map { count_tuple, meta_tuple -> + def group_id = count_tuple[0] + def count = count_tuple[1] + def meta = meta_tuple[1] + + tuple(groupKey(group_id, count), meta) + } + .groupTuple() + .map { group_key, meta_samples -> + + def meta_group = [:] + meta_samples.each { meta_sample -> + + meta_sample.each { key, value -> meta_group[key] = value } + } + + meta_group + } emit: - dna = ch_bwa_outputs // channel: [ meta, bam_dna ] - rna = ch_star_outputs // channel: [ meta, bam_rna ] + dna = ch_bwa_outputs // channel: [ meta ] + + // TODO(MC): RNA alignment. + rna = ch_star_outputs // channel: [ meta, bam_rna ] - versions = ch_versions // channel: [ versions.yml ] + // TODO(MC): Versions. + // versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index af0bb10c..d8de6b54 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -1,34 +1,134 @@ -include { MARKDUPS } from '../../modules/local/markdups/main' +include { MARKDUPS } from '../../modules/local/markdups/main' workflow READ_PROCESSING { take: - // Sample data - ch_inputs // channel: [mandatory] [ meta ] - ch_dna_bams // channel: [mandatory] [ meta, bam_dna ] - ch_rna_bams // channel: [mandatory] [ meta, bam_rna ] + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + ch_dna_bams // channel: [mandatory] [ meta, bam_dna ] + ch_rna_bams // channel: [mandatory] [ meta, bam_rna ] + genome_fasta + genome_fai + genome_dict + unmap_regions main: - // Channel for version.yml files - // channel: [ versions.yml ] - ch_versions = Channel.empty() - - // NOTE(SW): channel operations will be required to configure MarkDups for individual samples - - // MarkDups - // TODO(SW): implement inputs - ch_markdups_inputs = Channel.of([[id: 'foo'], []]) - MARKDUPS( - ch_markdups_inputs, - // TODO(SW): configuration - // TODO(SW): reference files + // TODO(MC): Versions. + // Channel for version.yml files + // channel: [ versions.yml ] + // ch_versions = Channel.empty() + + // channel: [ group_id, sample_count ] + ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } + + // channel: [ meta ] (One sample per record). + ch_meta_samples = ch_dna_bams.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } + + // Sort inputs + // channel: [ meta ] (One sample per record). + ch_meta_samples_sorted = ch_meta_samples + .branch { meta -> + runnable: Utils.hasDnaMarkdupsBam(meta) + skip: true + } + + // MarkDups + // Prepare input to markdups process. + // channel: [ meta_bam, bams, bais ] + ch_markdups_inputs = ch_meta_samples_sorted.runnable + .map { meta_sample -> + + def meta_bam = Utils.shallow_copy(meta_sample) + def bams = [] + def bais = [] + meta_sample.each { key, value -> + + if ((value instanceof java.util.Map) && value.containsKey('sample_id')) { + meta_bam['sample_id'] = value.sample_id + meta_bam['sample_key'] = key + bams = value[Constants.FileType.BAM_MARKDUPS] + bais = value[Constants.FileType.BAI_MARKDUPS] + } + } + + if (!(bams instanceof Collection)) { + bams = [bams] + } + + if (!(bais instanceof Collection)) { + bais = [bais] + } + + [meta_bam, bams, bais] + } + + // channel: [ meta_bam, bam, bai ] + MARKDUPS( + ch_markdups_inputs, + genome_fasta, + genome_fai, + genome_dict, + unmap_regions, + ) + + // Update sample information. 
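// NOTE(editor): the `cross` calls in these merge steps perform a keyed join:
// the left channel carries [key, count] pairs, the right carries [key, payload]
// pairs, and each matching combination is emitted as a [left_item, right_item]
// pair, so the precomputed counts ride along with every sample record. Minimal
// sketch with hypothetical values:
//
//     ch_counts  = Channel.of(['grp1', 2])
//     ch_samples = Channel.of(['grp1', 'tumor'], ['grp1', 'normal'])
//     ch_counts.cross(ch_samples).view()
//     // [[grp1, 2], [grp1, tumor]]
//     // [[grp1, 2], [grp1, normal]]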
+ // channel: [ meta ] (One sample per meta record). + ch_bam_samples = MARKDUPS.out.bam.map { bam -> + + def meta_bam = bam[0] + + def meta = Utils.shallow_copy(meta_bam) + meta.remove('sample_id') + meta.remove('sample_key') + + def sample = [sample_id: meta_bam.sample_id] + sample[Constants.FileType.BAM] = bam[1] + sample[Constants.FileType.BAI] = bam[2] + meta[meta_bam.sample_key] = sample + + meta + } + + // Merge back in skipped meta entries. + // channel: [ meta ] (One sample per meta record). + ch_all_samples = Channel.empty() + .mix( + ch_bam_samples, + ch_meta_samples_sorted.skip, + ) + + // Merge individual sample records back into group records without blocking for the whole channel to be processed. + // channel: [ meta_bam ] + ch_markduplicates_dna_out = ch_sample_counts + .cross( + ch_all_samples.map { meta -> [meta.group_id, meta] } ) - // TODO(SW): implement outputs - ch_markduplicates_dna_out = Channel.empty() - ch_markduplicates_rna_out = Channel.empty() + .map { count_tuple, meta_tuple -> + + def group_id = count_tuple[0] + def count = count_tuple[1] + def meta = meta_tuple[1] + + tuple(groupKey(group_id, count), meta) + } + .groupTuple() + .map { group_key, meta_samples -> + + def meta_group = [:] + meta_samples.each { meta_sample -> + + meta_sample.each { key, value -> meta_group[key] = value } + } + + meta_group + } + + // TODO(SW): implement outputs + ch_markduplicates_rna_out = Channel.empty() emit: - dna = ch_markduplicates_dna_out // channel: [ meta, bam_dna ] - rna = ch_markduplicates_rna_out // channel: [ meta, bam_rna ] + dna = ch_markduplicates_dna_out // channel: [ meta ] + rna = ch_markduplicates_rna_out // channel: [ meta, bam_rna ] - versions = ch_versions // channel: [ versions.yml ] + // TODO(MC): Versions. + // versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 24ea60c0..1d55a889 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -68,7 +68,6 @@ linx_gene_id_file = params.linx_gene_id_file ? file(params.linx_gene_id_file) : // // SUBWORKFLOWS // -include { ALIGNMENT } from '../subworkflows/local/alignment' include { AMBER_PROFILING } from '../subworkflows/local/amber_profiling' include { BAMTOOLS_METRICS } from '../subworkflows/local/bamtools_metrics' include { CHORD_PREDICTION } from '../subworkflows/local/chord_prediction' @@ -119,8 +118,7 @@ workflow WGTS { // Create input channel from parsed CSV // channel: [ meta ] - // TODO[MC]: Rename this back to original, and swap name out for current ch_inputs. - ch_inputs0 = Channel.fromList(inputs) + ch_inputs = Channel.fromList(inputs) // Set up reference data, assign more human readable variables PREPARE_REFERENCE( @@ -129,79 +127,72 @@ workflow WGTS { ref_data = PREPARE_REFERENCE.out hmf_data = PREPARE_REFERENCE.out.hmf_data - // TODO[MC]: Skipping alignment, only running up to alignment? - ALIGNMENT( - ch_inputs0, - ref_data.genome_fasta, - ref_data.genome_fai, - ref_data.genome_dict, - ref_data.genome_bwa_index, - file(params.refdata_unmap_regions), - 4 * params.max_fastq_records, - ) - - ch_inputs = ALIGNMENT.out.meta_bam - // Set GRIDSS config gridss_config = params.containsKey('gridss_config') ? 
file(params.gridss_config) : hmf_data.gridss_config - // // - // // SUBWORKFLOW: Align reads - // // - // // channel: [ meta, bam_dna ] - // ch_dna_alignment_out = Channel.empty() - // // channel: [ meta, bam_rna ] - // ch_rna_alignment_out = Channel.empty() - // // TODO(MC): set up correctly - // // if (true | run_config.stages.alignment) { + // + // SUBWORKFLOW: Align reads + // + // channel: [ meta ] + ch_dna_alignment_out = Channel.empty() + // channel: [ meta, bam_rna ] + ch_rna_alignment_out = Channel.empty() + // TODO(SW): set up correctly + if (true || run_config.stages.alignment) { - // // READ_ALIGNMENT( - // // ch_inputs, - // // // alignment reference files - // // ) + READ_ALIGNMENT( + ch_inputs, + ref_data.genome_fasta, + ref_data.genome_bwa_index, + params.max_fastq_records, + ) - // // ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + // TODO(MC): Versions. + // ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) - // // ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) - // // ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) + ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) + ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) - // // } else { - // // ch_dna_alignment_out = ch_inputs.map { meta -> [meta, []] } - // // ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + } else { - // // } + ch_dna_alignment_out = ch_inputs + ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } - // ch_dna_alignment_out = ch_inputs.map { meta -> [meta, []] } - // ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + } - // // - // // SUBWORKFLOW: Process read alignments - // // - // // channel: [ meta, bam_dna ] - // ch_dna_processed_out = Channel.empty() - // // channel: [ meta, bam_rna ] - // ch_rna_processed_out = Channel.empty() - // // TODO(SW): set up correctly - // if (true | run_config.stages.markdups) { + // + // SUBWORKFLOW: Process read alignments + // + // channel: [ meta ] + ch_dna_processed_out = Channel.empty() + // channel: [ meta, bam_rna ] + ch_rna_processed_out = Channel.empty() + // TODO(SW): set up correctly + if (true || run_config.stages.markdups) { - // READ_PROCESSING( - // ch_inputs, - // ch_dna_alignment_out, - // ch_rna_alignment_out, - // ) + READ_PROCESSING( + ch_inputs, + ch_dna_alignment_out, + ch_rna_alignment_out, + ref_data.genome_fasta, + ref_data.genome_fai, + ref_data.genome_dict, + file(params.refdata_unmap_regions), + ) - // ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) + // TODO(MC): Versions. + // ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) - // ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) - // ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) + ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) + ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) - // } else { + } else { - // ch_dna_processed_out = ch_inputs.map { meta -> [meta, []] } - // ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + ch_dna_processed_out = ch_inputs + ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } - // } + } // TODO(SW): adjust downstream selection of input BAM From 01510042bdbb5eda230e67eca3912249ac4f20dd Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 13 Feb 2024 14:01:54 +1100 Subject: [PATCH 07/86] Upgrading from bwa mem to bwa mem2.
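Editor's note: bwa-mem2 is not a drop-in swap at the reference level. It has its own index format, so besides the classic .amb/.ann/.pac files it needs the .0123 and .bwt.2bit.64 files called out in the module below, and those must be generated with `bwa-mem2 index` rather than `bwa index`. A sketch of a one-off indexing process, assuming the container built in this commit (this process is not part of the series):

    process BWA_MEM2_INDEX {
        container 'bwa-mem2:2.2.1-sambamba'

        input:
        path genome_fasta

        output:
        path 'bwa-mem2_index'

        script:
        """
        mkdir -p bwa-mem2_index
        ln -s ../${genome_fasta} bwa-mem2_index/
        bwa-mem2 index bwa-mem2_index/${genome_fasta}
        """
    }

In the alignment command itself, -Y keeps supplementary alignments soft-clipped so the full read sequence is retained in the BAM, and piping through `sambamba view --compression-level 0` straight into `sambamba sort` avoids writing a compressed intermediate file.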
--- modules/local/bwa/mem/main.nf | 2 - modules/local/bwa/mem2/Dockerfile | 9 ++++ modules/local/bwa/mem2/main.nf | 66 +++++++++++++++++++++------- subworkflows/local/read_alignment.nf | 8 ++-- 4 files changed, 62 insertions(+), 23 deletions(-) create mode 100644 modules/local/bwa/mem2/Dockerfile diff --git a/modules/local/bwa/mem/main.nf b/modules/local/bwa/mem/main.nf index 428d1ce9..e3b54dd4 100644 --- a/modules/local/bwa/mem/main.nf +++ b/modules/local/bwa/mem/main.nf @@ -1,5 +1,3 @@ -// TODO(MC): BWA MEM2: Need docker and resource files. - process BWA_MEM { tag "${meta.id}" diff --git a/modules/local/bwa/mem2/Dockerfile b/modules/local/bwa/mem2/Dockerfile new file mode 100644 index 00000000..0f2641da --- /dev/null +++ b/modules/local/bwa/mem2/Dockerfile @@ -0,0 +1,9 @@ +FROM docker.io/continuumio/miniconda3:23.10.0-1 + +RUN \ + conda install -y -n base conda-libmamba-solver && \ + conda config --set solver libmamba && \ + conda install -y -c bioconda -c conda-forge -c conda \ + 'bwa-mem2==2.2.1' \ + 'sambamba==1.0' && \ + conda clean -yaf diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index 3b55a8a1..03d5d327 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -2,38 +2,70 @@ process BWA_MEM2 { tag "${meta.id}" label 'process_high' - // TODO(SW): create container - //container 'foo' + // TODO(MC): Upload container. + container 'bwa-mem2:2.2.1-sambamba' input: - // TODO(SW): decide input structure - tuple val(meta), path(fastqs) + tuple val(meta), path(reads_fwd), path(reads_rev) + path genome_fasta + // TODO(MC): Copied into local genome_bwa_index for ref genome 37: + // + Homo_sapiens.GRCh37.GATK.illumina.fasta.bwt.2bit.64 + // + Homo_sapiens.GRCh37.GATK.illumina.fasta.0123 + path genome_bwa_index output: - // TODO(SW): set outputs - tuple val(meta), path('bar'), emit: bam - path 'versions.yml' , emit: versions + tuple val(meta), path('*.bam'), emit: bam + // TODO(MC): Versions. + // path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + // # TODO(MC): read group + // # -R ${meta.read_group} - // TODO(SW): implement process """ - touch bar + ln -s \$(find -L ${genome_bwa_index} -type f) ./ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bwamem2: foo - END_VERSIONS + bwa-mem2 mem \\ + -Y \\ + -t ${task.cpus} \\ + ${genome_fasta} \\ + ${reads_fwd} \\ + ${reads_rev} | \\ + \\ + sambamba view \\ + --sam-input \\ + --format bam \\ + --compression-level 0 \\ + --nthreads ${task.cpus} \\ + /dev/stdin | \\ + \\ + sambamba sort \\ + --nthreads ${task.cpus} \\ + --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ + /dev/stdin """ + // TODO(SW): Versions. + // """ + // touch bar + + // cat <<-END_VERSIONS > versions.yml + // "${task.process}": + // bwamem2: foo + // END_VERSIONS + // """ + stub: - // TODO(SW): implement stub """ - touch bar - echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam """ + + // TODO(MV): Versions. 
+ // """ + // touch bar + // echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + // """ } diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index acc92a4d..d8d654a8 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -1,4 +1,4 @@ -include { BWA_MEM } from '../../modules/local/bwa/mem/main' +include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' include { FASTP } from '../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' include { STAR } from '../../modules/local/star/main' @@ -41,7 +41,7 @@ workflow READ_ALIGNMENT { // TODO(SW): implement outputs ch_star_outputs = Channel.empty() - // BWA MEM + // BWA MEM2 // channel: [ sample_key, fastq_pair_count ] ch_sample_fastq_pair_count = ch_meta_samples_sorted.runnable_fastq.map { meta_sample -> @@ -190,7 +190,7 @@ workflow READ_ALIGNMENT { } // channel: [ meta_fastq, bam ] - BWA_MEM( + BWA_MEM2( ch_bwa_mem_inputs, genome_fasta, genome_bwa_index, @@ -198,7 +198,7 @@ workflow READ_ALIGNMENT { // channel: [ meta_fastq, bam, bai ] SAMBAMBA_INDEX( - BWA_MEM.out.bam, + BWA_MEM2.out.bam, ) // Merge all bam records for a single sample into a singlke record. From 8456695db7c08b2726c8f517492b7e3f6a6f80fd Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 13 Feb 2024 14:28:39 +1100 Subject: [PATCH 08/86] Fixing read group flag for bwa mem2. --- modules/local/bwa/Dockerfile | 2 +- modules/local/bwa/mem/main.nf | 52 ------------------------------- modules/local/bwa/mem2/Dockerfile | 9 ------ modules/local/bwa/mem2/main.nf | 4 +-- 4 files changed, 3 insertions(+), 64 deletions(-) delete mode 100644 modules/local/bwa/mem/main.nf delete mode 100644 modules/local/bwa/mem2/Dockerfile diff --git a/modules/local/bwa/Dockerfile b/modules/local/bwa/Dockerfile index 2172ebc4..0f2641da 100644 --- a/modules/local/bwa/Dockerfile +++ b/modules/local/bwa/Dockerfile @@ -4,6 +4,6 @@ RUN \ conda install -y -n base conda-libmamba-solver && \ conda config --set solver libmamba && \ conda install -y -c bioconda -c conda-forge -c conda \ - 'bwa==0.7.17' \ + 'bwa-mem2==2.2.1' \ 'sambamba==1.0' && \ conda clean -yaf diff --git a/modules/local/bwa/mem/main.nf b/modules/local/bwa/mem/main.nf deleted file mode 100644 index e3b54dd4..00000000 --- a/modules/local/bwa/mem/main.nf +++ /dev/null @@ -1,52 +0,0 @@ -process BWA_MEM { - tag "${meta.id}" - - // TODO(MC): What process label? - // label 'process_medium' - - container 'docker.io/scwatts/bwa:0.7.17-sambamba' - - input: - tuple val(meta), path(reads_fwd), path(reads_rev) - path genome_fasta - path genome_bwa_index - - output: - tuple val(meta), path('*bam'), emit: bam - - // TODO(MC): How does this work? 
- when: - task.ext.when == null || task.ext.when - - // # TODO(MC): read group - // # -R ${meta.read_group} - - script: - """ - ln -s \$(find -L ${genome_bwa_index} -type f) ./ - - bwa mem \\ - -Y \\ - -t ${task.cpus} \\ - ${genome_fasta} \\ - ${reads_fwd} \\ - ${reads_rev} | \\ - \\ - sambamba view \\ - --sam-input \\ - --format bam \\ - --compression-level 0 \\ - --nthreads ${task.cpus} \\ - /dev/stdin | \\ - \\ - sambamba sort \\ - --nthreads ${task.cpus} \\ - --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ - /dev/stdin - """ - - stub: - """ - touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam - """ -} diff --git a/modules/local/bwa/mem2/Dockerfile deleted file mode 100644 index 0f2641da..00000000 --- a/modules/local/bwa/mem2/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM docker.io/continuumio/miniconda3:23.10.0-1 - -RUN \ - conda install -y -n base conda-libmamba-solver && \ - conda config --set solver libmamba && \ - conda install -y -c bioconda -c conda-forge -c conda \ - 'bwa-mem2==2.2.1' \ - 'sambamba==1.0' && \ - conda clean -yaf diff --git a/modules/local/bwa/mem2/main.nf index 03d5d327..08c8e503 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -22,14 +22,14 @@ process BWA_MEM2 { task.ext.when == null || task.ext.when script: - // # TODO(MC): read group - // # -R ${meta.read_group} + def read_group_tag = "@RG\t${meta.read_group}" """ ln -s \$(find -L ${genome_bwa_index} -type f) ./ bwa-mem2 mem \\ -Y \\ + -R '${read_group_tag}' \\ -t ${task.cpus} \\ ${genome_fasta} \\ ${reads_fwd} \\ From c3f90500953b2ec32f040d14b0a9dcd7b0263112 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 13 Feb 2024 14:32:54 +1100 Subject: [PATCH 09/86] Reassigning TODO. --- subworkflows/local/read_alignment.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/read_alignment.nf index d8d654a8..c258a06b 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -289,7 +289,7 @@ workflow READ_ALIGNMENT { emit: dna = ch_bwa_outputs // channel: [ meta ] - // TODO(MC): RNA alignment. + // TODO(SW): RNA alignment. rna = ch_star_outputs // channel: [ meta, bam_rna ] // TODO(MC): Versions. From fe49cf0f5648f3a57b7905fe575c47ca03c9ecb0 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 13 Feb 2024 17:07:48 +1100 Subject: [PATCH 10/86] Emitting versions. --- lib/Constants.groovy | 4 +-- modules/local/bwa/mem2/main.nf | 27 ++++++----------- modules/local/fastp/main.nf | 13 +++++++-- modules/local/markdups/main.nf | 42 +++++++++------------------ modules/local/sambamba/index/main.nf | 11 +++++++ subworkflows/local/read_alignment.nf | 18 +++++++----- subworkflows/local/read_processing.nf | 9 +++--- workflows/wgts.nf | 6 ++-- 8 files changed, 61 insertions(+), 69 deletions(-) diff --git a/lib/Constants.groovy index f3fc958b..91f1059c 100644 --- a/lib/Constants.groovy +++ b/lib/Constants.groovy @@ -34,8 +34,7 @@ class Constants { } static enum Process { - // TODO[MC]: Add process here. - BWAMEM, + BWAMEM2, AMBER, BAMTOOLS, CHORD, @@ -113,7 +112,6 @@ class Constants { static Map PLACEHOLDER_META = [meta_placeholder: null] static List PLACEHOLDER_OPTIONAL_CHANNEL = [] - // TODO(MC): How is this used?
static Map INPUT = [ ISOFOX_DIR: [ diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index 08c8e503..bc1df838 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -15,8 +15,7 @@ process BWA_MEM2 { output: tuple val(meta), path('*.bam'), emit: bam - // TODO(MC): Versions. - // path 'versions.yml' , emit: versions + path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when @@ -46,26 +45,18 @@ process BWA_MEM2 { --nthreads ${task.cpus} \\ --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ /dev/stdin - """ - - // TODO(SW): Versions. - // """ - // touch bar - // cat <<-END_VERSIONS > versions.yml - // "${task.process}": - // bwamem2: foo - // END_VERSIONS - // """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: 2.2.1 + sambamba: 1.0 + END_VERSIONS + """ stub: """ touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam - """ - // TODO(MV): Versions. - // """ - // touch bar - // echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml - // """ + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ } diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 35d3714b..a14b43ec 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -1,8 +1,6 @@ process FASTP { tag "${meta.id}" - // TODO(MC): Resources? - container 'docker.io/scwatts/fastp:0.23.4' input: @@ -11,6 +9,10 @@ process FASTP { output: tuple val(meta), path('*_R1.fastp.fastq'), path('*_R2.fastp.fastq'), emit: fastq + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: // TODO(MC): UMI flags @@ -29,11 +31,18 @@ process FASTP { --split_by_lines ${4 * max_fastq_records} \\ --out1 ${meta.sample_id}_${meta.read_group}_R1.fastp.fastq \\ --out2 ${meta.sample_id}_${meta.read_group}_R2.fastp.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: 0.23.4 + END_VERSIONS """ stub: """ touch 00{1..4}.${meta.sample_id}_${meta.read_group}_R1.fastp.fastq touch 00{1..4}.${meta.sample_id}_${meta.read_group}_R2.fastp.fastq + + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ } diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 7a25e728..c3f97e28 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,9 +1,6 @@ process MARKDUPS { tag "${meta_bam.id}" - // TODO(MC): Resources required? - // label 'process_low' - container 'docker.io/scwatts/markdups:1.1.rc1' input: @@ -15,35 +12,13 @@ process MARKDUPS { output: tuple val(meta_bam), path('*bam'), path('*bai'), emit: bam + path 'versions.yml' , emit: versions path '*.tsv' - // TODO(MC): Make sure this is in each. when: task.ext.when == null || task.ext.when - // TODO(MC): Versions in each. 
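// NOTE(editor): the versions.yml files emitted across these modules follow the
// nf-core convention consumed by CUSTOM_DUMPSOFTWAREVERSIONS: a YAML mapping
// keyed by the fully qualified process name, one "tool: version" entry per
// line, for example (illustrative names only):
//
//     "WGTS:READ_PROCESSING:MARKDUPS":
//         markdups: 1.1
//         sambamba: 1.0
//
// Each per-task file is mixed into ch_versions and collated at the end of the
// workflow via ch_versions.unique().collectFile(name: 'collated_versions.yml').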
- // path 'versions.yml' , emit: versions - - // script: - // def args = task.ext.args ?: '' - - // // TODO(SW): implement process - // """ - // echo bar - - // cat <<-END_VERSIONS > versions.yml - // "${task.process}": - // markdups: foo - // END_VERSIONS - // """ - - // stub: - // // TODO(SW): implement stub - // """ - // touch bar - // echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml - // """ - + script: // # TODO(MC): Umi flags // # -multi_bam \\ // # -umi_enabled \\ // # -umi_duplex \\ // # -umi_duplex_delim _ \\ // # -umi_base_diff_stats \\ - script: """ java \\ -Xmx${Math.round(task.memory.bytes * 0.95)} \\ @@ -73,6 +47,14 @@ process MARKDUPS { -threads 16 \\ \\ -output_bam ${meta_bam.sample_id}.mark_dups.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sambamba: 1.0 + samtools: 1.17 + openjdk: >=8 + mark-dups: 1.1 + END_VERSIONS """ stub: @@ -80,9 +62,11 @@ touch ${meta_bam.sample_id}.mark_dups.bam touch ${meta_bam.sample_id}.mark_dups.bam.bai touch ${meta_bam.sample_id}.duplicate_freq.tsv + + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ - // # TODO(MC): + // # TODO(MC): UMIs. // # touch ${meta_bam.sample_id}.umi_coord_freq.tsv // # touch ${meta_bam.sample_id}.umi_edit_distance.tsv // # touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv diff --git a/modules/local/sambamba/index/main.nf index f6cf00ea..2c42e475 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -8,16 +8,27 @@ process SAMBAMBA_INDEX { output: tuple val(meta), path(bam), path('*bai'), emit: bam + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: """ sambamba index \\ --nthreads ${task.cpus} \\ ${meta.split}.${meta.sample_id}.${meta.read_group}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sambamba: 1.0 + END_VERSIONS """ stub: """ touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam.bai + + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ } diff --git a/subworkflows/local/read_alignment.nf index c258a06b..47259fd1 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -12,10 +12,9 @@ workflow READ_ALIGNMENT { max_fastq_records main: - // TODO(MC): Versions. - // // Channel for version.yml files - // // channel: [ versions.yml ] - // ch_versions = Channel.empty() + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() // channel: [ group_id, sample_count ] ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } @@ -110,6 +109,8 @@ workflow READ_ALIGNMENT { max_fastq_records, ) + ch_versions = ch_versions.mix(FASTP.out.versions) + ch_split_fastq_pairs = FASTP.out.fastq } else { ch_split_fastq_pairs = ch_fastq_pairs.map { fastq_pair -> [fastq_pair[0], [fastq_pair[1]], [fastq_pair[2]]] } @@ -196,11 +197,15 @@ workflow READ_ALIGNMENT { genome_bwa_index, ) + ch_versions = ch_versions.mix(BWA_MEM2.out.versions) + // channel: [ meta_fastq, bam, bai ] SAMBAMBA_INDEX( BWA_MEM2.out.bam, ) + ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) + // Merge all bam records for a single sample into a single record. // channel: [ meta ] (One sample per meta record).
ch_merged_bam_samples = ch_sample_fastq_pair_split_count @@ -288,10 +293,7 @@ workflow READ_ALIGNMENT { emit: dna = ch_bwa_outputs // channel: [ meta ] - // TODO(SW): RNA alignment. rna = ch_star_outputs // channel: [ meta, bam_rna ] - - // TODO(MC): Versions. - // versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/read_processing.nf index d8de6b54..53815d86 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -12,10 +12,9 @@ workflow READ_PROCESSING { unmap_regions main: - // TODO(MC): Versions. // Channel for version.yml files // channel: [ versions.yml ] - // ch_versions = Channel.empty() + ch_versions = Channel.empty() // channel: [ group_id, sample_count ] ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } @@ -70,6 +69,8 @@ workflow READ_PROCESSING { unmap_regions, ) + ch_versions = ch_versions.mix(MARKDUPS.out.versions) + // Update sample information. // channel: [ meta ] (One sample per meta record). ch_bam_samples = MARKDUPS.out.bam.map { bam -> @@ -128,7 +129,5 @@ emit: dna = ch_markduplicates_dna_out // channel: [ meta ] rna = ch_markduplicates_rna_out // channel: [ meta, bam_rna ] - - // TODO(MC): Versions. - // versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/wgts.nf index 1d55a889..bc177865 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -147,8 +147,7 @@ workflow WGTS { params.max_fastq_records, ) - // TODO(MC): Versions. - // ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) @@ -181,8 +180,7 @@ file(params.refdata_unmap_regions), ) - // TODO(MC): Versions. - // ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) + ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) From 1d13b56331f47aa7a11d6f3efae6bc5d8506252b Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 13 Feb 2024 17:14:54 +1100 Subject: [PATCH 11/86] Updating TODOs. --- modules/local/bwa/mem2/main.nf | 5 +++-- workflows/wgts.nf | 9 ++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/modules/local/bwa/mem2/main.nf index bc1df838..aee328ea 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -2,15 +2,16 @@ process BWA_MEM2 { tag "${meta.id}" label 'process_high' - // TODO(MC): Upload container. + // TODO(SW): Upload container. container 'bwa-mem2:2.2.1-sambamba' input: tuple val(meta), path(reads_fwd), path(reads_rev) path genome_fasta - // TODO(MC): Copied into local genome_bwa_index for ref genome 37: + // TODO(SW): The following resource files are needed from gs://hmf-public/HMFtools-Resources/ref_genome/37: // + Homo_sapiens.GRCh37.GATK.illumina.fasta.bwt.2bit.64 // + Homo_sapiens.GRCh37.GATK.illumina.fasta.0123 + // Similarly for ref genome 38.
path genome_bwa_index output: diff --git a/workflows/wgts.nf b/workflows/wgts.nf index bc177865..4a22c9b5 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -15,7 +15,6 @@ inputs = Utils.parseInput(params.input, workflow.stubRun, log) run_config = WorkflowMain.getRunConfig(params, inputs, log) // Validate inputs -// TODO(MC): Reexamine validation in light of fastq/bam markdups. Utils.validateInput(inputs, run_config, log) // Check input path parameters to see if they exist @@ -111,6 +110,14 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft // Get absolute file paths samplesheet = Utils.getFileObject(params.input) +// TODO(MC): New params, and resource files, documentation and proper placement. +// TODO(MC): Processed appearing as NULL. +// TODO(MC): WARN: Found unexpected parameters: +// * --max_fastq_records: 10000000 +// * --refdata_unmap_regions: /Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv +// - Ignore this warning: params.schema_ignore_params = "max_fastq_records,refdata_unmap_regions" +// TODO(MC): get error logs for amber, cobalt, and gripss +// TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From 8ce78ff3727f13fa625a07b981d7c3ffecc41486 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 14 Feb 2024 10:20:38 +1100 Subject: [PATCH 12/86] Fixing tags for new processes. --- modules/local/bwa/mem2/main.nf | 2 +- modules/local/fastp/main.nf | 2 +- modules/local/markdups/main.nf | 2 +- modules/local/sambamba/index/main.nf | 2 +- workflows/wgts.nf | 1 - 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index aee328ea..b77606f8 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -1,5 +1,5 @@ process BWA_MEM2 { - tag "${meta.id}" + tag "${meta.subject_id}__${meta.sample_id}" label 'process_high' // TODO(SW): Upload container. diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index a14b43ec..0205452c 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -1,5 +1,5 @@ process FASTP { - tag "${meta.id}" + tag "${meta.subject_id}__${meta.sample_id}" container 'docker.io/scwatts/fastp:0.23.4' diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index c3f97e28..aaa6e301 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,5 +1,5 @@ process MARKDUPS { - tag "${meta_bam.id}" + tag "${meta_bam.subject_id}__${meta_bam.sample_id}" container 'docker.io/scwatts/markdups:1.1.rc1' diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index 2c42e475..cb205904 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -1,5 +1,5 @@ process SAMBAMBA_INDEX { - tag "${meta.id}" + tag "${meta.subject_id}__${meta.sample_id}" container 'docker.io/scwatts/sambamba:1.0' diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 4a22c9b5..784ed89c 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -111,7 +111,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft samplesheet = Utils.getFileObject(params.input) // TODO(MC): New params, and resource files, documentation and proper placement. -// TODO(MC): Processed appearing as NULL. 
// TODO(MC): WARN: Found unexpected parameters: // * --max_fastq_records: 10000000 // * --refdata_unmap_regions: /Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv // - Ignore this warning: params.schema_ignore_params = "max_fastq_records,refdata_unmap_regions" // TODO(MC): get error logs for amber, cobalt, and gripss // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From 8ce78ff3727f13fa625a07b981d7c3ffecc41486 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 14 Feb 2024 10:20:38 +1100 Subject: [PATCH 12/86] Fixing tags for new processes. --- modules/local/bwa/mem2/main.nf | 2 +- modules/local/fastp/main.nf | 2 +- modules/local/markdups/main.nf | 2 +- modules/local/sambamba/index/main.nf | 2 +- workflows/wgts.nf | 1 - 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/local/bwa/mem2/main.nf index aee328ea..b77606f8 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -1,5 +1,5 @@ process BWA_MEM2 { - tag "${meta.id}" + tag "${meta.subject_id}__${meta.sample_id}" label 'process_high' // TODO(SW): Upload container. diff --git a/modules/local/fastp/main.nf index a14b43ec..0205452c 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -1,5 +1,5 @@ process FASTP { - tag "${meta.id}" + tag "${meta.subject_id}__${meta.sample_id}" container 'docker.io/scwatts/fastp:0.23.4' diff --git a/modules/local/markdups/main.nf index c3f97e28..aaa6e301 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,5 +1,5 @@ process MARKDUPS { - tag "${meta_bam.id}" + tag "${meta_bam.subject_id}__${meta_bam.sample_id}" container 'docker.io/scwatts/markdups:1.1.rc1' diff --git a/modules/local/sambamba/index/main.nf index 2c42e475..cb205904 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -1,5 +1,5 @@ process SAMBAMBA_INDEX { - tag "${meta.id}" + tag "${meta.subject_id}__${meta.sample_id}" container 'docker.io/scwatts/sambamba:1.0' diff --git a/workflows/wgts.nf index 4a22c9b5..784ed89c 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -111,7 +111,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft samplesheet = Utils.getFileObject(params.input) // TODO(MC): New params, and resource files, documentation and proper placement. -// TODO(MC): Processed appearing as NULL. // TODO(MC): WARN: Found unexpected parameters: From ee059adf13e5b49b84f7d160b91f14e2812f0faa Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 14 Feb 2024 10:29:09 +1100 Subject: [PATCH 13/86] Setting up targeted and wgts workflows for testing. + Put read_alignment and read_processing subworkflows into targeted workflow. + Uncomment the whole wgts workflow and integrate this with the read_alignment and read_processing subworkflows. --- workflows/targeted.nf | 150 +++++++++++++++++++++++++++++------------- workflows/wgts.nf | 112 +++++++++++++++---------------- 2 files changed, 159 insertions(+), 103 deletions(-) diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 3e9c0b1b..4fffbf0c 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -2,8 +2,6 @@ import Constants import Processes import Utils -// TODO[MC]: Alignment. - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -125,6 +123,68 @@ workflow TARGETED { // Set GRIDSS config gridss_config = params.containsKey('gridss_config') ? file(params.gridss_config) : hmf_data.gridss_config + // + // SUBWORKFLOW: Align reads + // + // channel: [ meta ] + ch_dna_alignment_out = Channel.empty() + // channel: [ meta, bam_rna ] + ch_rna_alignment_out = Channel.empty() + // TODO(SW): set up correctly + if (true || run_config.stages.alignment) { + + READ_ALIGNMENT( + ch_inputs, + ref_data.genome_fasta, + ref_data.genome_bwa_index, + params.max_fastq_records, + ) + + ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + + ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) + ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) + + + } else { + + ch_dna_alignment_out = ch_inputs + ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + + } + + // + // SUBWORKFLOW: Process read alignments + // + // channel: [ meta ] + ch_dna_processed_out = Channel.empty() + // channel: [ meta, bam_rna ] + ch_rna_processed_out = Channel.empty() + // TODO(SW): set up correctly + if (true || run_config.stages.markdups) { + + READ_PROCESSING( + ch_inputs, + ch_dna_alignment_out, + ch_rna_alignment_out, + ref_data.genome_fasta, + ref_data.genome_fai, + ref_data.genome_dict, + file(params.refdata_unmap_regions), + ) + + ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) + + ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) + ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) + + } else { + + ch_dna_processed_out = ch_inputs + ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + + } + // // MODULE: Run Isofox to analyse RNA data // @@ -139,7 +199,7 @@ workflow TARGETED { isofox_tpm_norm = params.isofox_tpm_norm ?
file(params.isofox_tpm_norm) : panel_data.isofox_tpm_norm ISOFOX_QUANTIFICATION( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -158,7 +218,7 @@ workflow TARGETED { } else { - ch_isofox_out = ch_inputs.map { meta -> [meta, []] } + ch_isofox_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -170,7 +230,7 @@ workflow TARGETED { if (run_config.stages.amber) { AMBER_PROFILING( - ch_inputs, + ch_dna_processed_out, ref_data.genome_version, hmf_data.heterozygous_sites, panel_data.target_region_bed, @@ -181,7 +241,7 @@ workflow TARGETED { } else { - ch_amber_out = ch_inputs.map { meta -> [meta, []] } + ch_amber_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -193,7 +253,7 @@ workflow TARGETED { if (run_config.stages.cobalt) { COBALT_PROFILING( - ch_inputs, + ch_dna_processed_out, hmf_data.gc_profile, hmf_data.diploid_bed, panel_data.target_region_normalisation, @@ -205,7 +265,7 @@ workflow TARGETED { } else { - ch_cobalt_out = ch_inputs.map { meta -> [meta, []] } + ch_cobalt_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -217,7 +277,7 @@ workflow TARGETED { if (run_config.stages.gridss) { GRIDSS_SVPREP_CALLING( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -237,7 +297,7 @@ workflow TARGETED { } else { - ch_gridss_out = ch_inputs.map { meta -> [meta, []] } + ch_gridss_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -251,7 +311,7 @@ workflow TARGETED { if (run_config.stages.gripss) { GRIPSS_FILTERING( - ch_inputs, + ch_dna_processed_out, ch_gridss_out, ref_data.genome_fasta, ref_data.genome_version, @@ -271,9 +331,9 @@ workflow TARGETED { } else { - ch_gripss_somatic_out = ch_inputs.map { meta -> [meta, [], []] } - ch_gripss_germline_out = ch_inputs.map { meta -> [meta, [], []] } - ch_gripss_somatic_unfiltered_out = ch_inputs.map { meta -> [meta, [], []] } + ch_gripss_somatic_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_gripss_germline_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_gripss_somatic_unfiltered_out = ch_dna_processed_out.map { meta -> [meta, [], []] } } @@ -289,7 +349,7 @@ workflow TARGETED { if (run_config.stages.sage) { SAGE_CALLING( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -313,10 +373,10 @@ workflow TARGETED { } else { - ch_sage_germline_vcf_out = ch_inputs.map { meta -> [meta, [], []] } - ch_sage_somatic_vcf_out = ch_inputs.map { meta -> [meta, [], []] } - ch_sage_germline_dir_out = ch_inputs.map { meta -> [meta, []] } - ch_sage_somatic_dir_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_germline_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_sage_somatic_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_sage_germline_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_somatic_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -329,7 +389,7 @@ workflow TARGETED { if (run_config.stages.pave) { PAVE_ANNOTATION( - ch_inputs, + ch_dna_processed_out, ch_sage_germline_vcf_out, ch_sage_somatic_vcf_out, ref_data.genome_fasta, @@ -353,8 +413,8 @@ workflow TARGETED { } else { - ch_pave_germline_out = ch_inputs.map { meta -> [meta, []] } - ch_pave_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_pave_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_pave_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -366,7 +426,7 
@@ workflow TARGETED { if (run_config.stages.purple) { PURPLE_CALLING( - ch_inputs, + ch_dna_processed_out, ch_amber_out, ch_cobalt_out, ch_pave_somatic_out, @@ -395,7 +455,7 @@ workflow TARGETED { } else { - ch_purple_out = ch_inputs.map { meta -> [meta, []] } + ch_purple_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -410,7 +470,7 @@ workflow TARGETED { // NOTE(SW): currently used only for ORANGE but will also be used for Neo once implemented SAGE_APPEND( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -425,8 +485,8 @@ workflow TARGETED { } else { - ch_sage_somatic_append_out = ch_inputs.map { meta -> [meta, []] } - ch_sage_germline_append_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_somatic_append_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_germline_append_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -439,7 +499,7 @@ workflow TARGETED { if (run_config.stages.linx) { LINX_ANNOTATION( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -455,8 +515,8 @@ workflow TARGETED { } else { - ch_linx_somatic_out = ch_inputs.map { meta -> [meta, []] } - ch_linx_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_linx_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_linx_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -468,7 +528,7 @@ workflow TARGETED { if (run_config.stages.linx) { LINX_PLOTTING( - ch_inputs, + ch_dna_processed_out, ch_linx_somatic_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -480,7 +540,7 @@ workflow TARGETED { } else { - ch_linx_somatic_visualiser_dir_out = ch_inputs.map { meta -> [meta, []] } + ch_linx_somatic_visualiser_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -493,7 +553,7 @@ workflow TARGETED { if (run_config.stages.orange && run_config.stages.flagstat) { FLAGSTAT_METRICS( - ch_inputs, + ch_dna_processed_out, ) ch_versions = ch_versions.mix(FLAGSTAT_METRICS.out.versions) @@ -503,8 +563,8 @@ workflow TARGETED { } else { - ch_flagstat_somatic_out = ch_inputs.map { meta -> [meta, []] } - ch_flagstat_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_flagstat_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_flagstat_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -517,7 +577,7 @@ workflow TARGETED { if (run_config.stages.bamtools) { BAMTOOLS_METRICS( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ) @@ -529,8 +589,8 @@ workflow TARGETED { } else { - ch_bamtools_somatic_out = ch_inputs.map { meta -> [meta, []] } - ch_bamtools_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_bamtools_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_bamtools_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -545,7 +605,7 @@ workflow TARGETED { ref_data_hla_slice_bed = params.containsKey('ref_data_hla_slice_bed') ? 
params.ref_data_hla_slice_bed : [] LILAC_CALLING( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -560,7 +620,7 @@ workflow TARGETED { } else { - ch_lilac_out = ch_inputs.map { meta -> [meta, []] } + ch_lilac_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -570,13 +630,13 @@ workflow TARGETED { if (run_config.stages.orange) { // Create placeholder channels for empty remaining channels - ch_chord_out = ch_inputs.map { meta -> [meta, []] } - ch_cuppa_out = ch_inputs.map { meta -> [meta, []] } - ch_sigs_out = ch_inputs.map { meta -> [meta, []] } - ch_virusinterpreter_out = ch_inputs.map { meta -> [meta, []] } + ch_chord_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_cuppa_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sigs_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_virusinterpreter_out = ch_dna_processed_out.map { meta -> [meta, []] } ORANGE_REPORTING( - ch_inputs, + ch_dna_processed_out, ch_bamtools_somatic_out, ch_bamtools_germline_out, ch_flagstat_somatic_out, diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 784ed89c..ada38f53 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -198,10 +198,6 @@ workflow WGTS { } - // TODO(SW): adjust downstream selection of input BAM - - /* - // // MODULE: Run Isofox to analyse RNA data // @@ -213,7 +209,7 @@ workflow WGTS { isofox_gc_ratios = params.isofox_gc_ratios ? file(params.isofox_gc_ratios) : hmf_data.isofox_gc_ratios ISOFOX_QUANTIFICATION( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -232,7 +228,7 @@ workflow WGTS { } else { - ch_isofox_out = ch_inputs.map { meta -> [meta, []] } + ch_isofox_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -244,7 +240,7 @@ workflow WGTS { if (run_config.stages.amber) { AMBER_PROFILING( - ch_inputs, + ch_dna_processed_out, ref_data.genome_version, hmf_data.heterozygous_sites, [], // target_region_bed @@ -256,7 +252,7 @@ workflow WGTS { } else { - ch_amber_out = ch_inputs.map { meta -> [meta, []] } + ch_amber_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -268,7 +264,7 @@ workflow WGTS { if (run_config.stages.cobalt) { COBALT_PROFILING( - ch_inputs, + ch_dna_processed_out, hmf_data.gc_profile, hmf_data.diploid_bed, [], // panel_target_region_normalisation @@ -280,7 +276,7 @@ workflow WGTS { } else { - ch_cobalt_out = ch_inputs.map { meta -> [meta, []] } + ch_cobalt_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -292,7 +288,7 @@ workflow WGTS { if (run_config.stages.gridss) { GRIDSS_SVPREP_CALLING( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -312,7 +308,7 @@ workflow WGTS { } else { - ch_gridss_out = ch_inputs.map { meta -> [meta, []] } + ch_gridss_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -326,7 +322,7 @@ workflow WGTS { if (run_config.stages.gripss) { GRIPSS_FILTERING( - ch_inputs, + ch_dna_processed_out, ch_gridss_out, ref_data.genome_fasta, ref_data.genome_version, @@ -346,9 +342,9 @@ workflow WGTS { } else { - ch_gripss_somatic_out = ch_inputs.map { meta -> [meta, [], []] } - ch_gripss_germline_out = ch_inputs.map { meta -> [meta, [], []] } - ch_gripss_somatic_unfiltered_out = ch_inputs.map { meta -> [meta, [], []] } + ch_gripss_somatic_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_gripss_germline_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_gripss_somatic_unfiltered_out = 
ch_dna_processed_out.map { meta -> [meta, [], []] } } @@ -364,7 +360,7 @@ workflow WGTS { if (run_config.stages.sage) { SAGE_CALLING( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -388,10 +384,10 @@ workflow WGTS { } else { - ch_sage_germline_vcf_out = ch_inputs.map { meta -> [meta, [], []] } - ch_sage_somatic_vcf_out = ch_inputs.map { meta -> [meta, [], []] } - ch_sage_germline_dir_out = ch_inputs.map { meta -> [meta, []] } - ch_sage_somatic_dir_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_germline_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_sage_somatic_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_sage_germline_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_somatic_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -404,7 +400,7 @@ workflow WGTS { if (run_config.stages.pave) { PAVE_ANNOTATION( - ch_inputs, + ch_dna_processed_out, ch_sage_germline_vcf_out, ch_sage_somatic_vcf_out, ref_data.genome_fasta, @@ -428,8 +424,8 @@ workflow WGTS { } else { - ch_pave_germline_out = ch_inputs.map { meta -> [meta, []] } - ch_pave_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_pave_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_pave_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -441,7 +437,7 @@ workflow WGTS { if (run_config.stages.purple) { PURPLE_CALLING( - ch_inputs, + ch_dna_processed_out, ch_amber_out, ch_cobalt_out, ch_pave_somatic_out, @@ -470,7 +466,7 @@ workflow WGTS { } else { - ch_purple_out = ch_inputs.map { meta -> [meta, []] } + ch_purple_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -485,7 +481,7 @@ workflow WGTS { // NOTE(SW): currently used only for ORANGE but will also be used for Neo once implemented SAGE_APPEND( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -499,8 +495,8 @@ workflow WGTS { } else { - ch_sage_somatic_append_out = ch_inputs.map { meta -> [meta, []] } - ch_sage_germline_append_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_somatic_append_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_germline_append_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -513,7 +509,7 @@ workflow WGTS { if (run_config.stages.linx) { LINX_ANNOTATION( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -529,8 +525,8 @@ workflow WGTS { } else { - ch_linx_somatic_out = ch_inputs.map { meta -> [meta, []] } - ch_linx_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_linx_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_linx_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -542,7 +538,7 @@ workflow WGTS { if (run_config.stages.linx) { LINX_PLOTTING( - ch_inputs, + ch_dna_processed_out, ch_linx_somatic_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -554,7 +550,7 @@ workflow WGTS { } else { - ch_linx_somatic_visualiser_dir_out = ch_inputs.map { meta -> [meta, []] } + ch_linx_somatic_visualiser_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -567,7 +563,7 @@ workflow WGTS { if (run_config.stages.orange && run_config.stages.flagstat) { FLAGSTAT_METRICS( - ch_inputs, + ch_dna_processed_out, ) ch_versions = ch_versions.mix(FLAGSTAT_METRICS.out.versions) @@ -577,8 +573,8 @@ workflow WGTS { } else { - ch_flagstat_somatic_out = ch_inputs.map { meta -> [meta, []] } - 
ch_flagstat_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_flagstat_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_flagstat_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -591,7 +587,7 @@ workflow WGTS { if (run_config.stages.bamtools) { BAMTOOLS_METRICS( - ch_inputs, + ch_dna_processed_out, ref_data.genome_fasta, ref_data.genome_version, ) @@ -603,8 +599,8 @@ workflow WGTS { } else { - ch_bamtools_somatic_out = ch_inputs.map { meta -> [meta, []] } - ch_bamtools_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_bamtools_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_bamtools_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -616,7 +612,7 @@ workflow WGTS { if (run_config.stages.sigs) { SIGS_FITTING( - ch_inputs, + ch_dna_processed_out, ch_purple_out, hmf_data.sigs_signatures, ) @@ -627,7 +623,7 @@ workflow WGTS { } else { - ch_sigs_out = ch_inputs.map { meta -> [meta, []] } + ch_sigs_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -639,7 +635,7 @@ workflow WGTS { if (run_config.stages.chord) { CHORD_PREDICTION( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_version, ) @@ -650,7 +646,7 @@ workflow WGTS { } else { - ch_chord_out = ch_inputs.map { meta -> [meta, []] } + ch_chord_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -665,7 +661,7 @@ workflow WGTS { ref_data_hla_slice_bed = params.containsKey('ref_data_hla_slice_bed') ? params.ref_data_hla_slice_bed : [] LILAC_CALLING( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -680,7 +676,7 @@ workflow WGTS { } else { - ch_lilac_out = ch_inputs.map { meta -> [meta, []] } + ch_lilac_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -692,7 +688,7 @@ workflow WGTS { if (run_config.stages.virusinterpreter) { VIRUSBREAKEND_CALLING( - ch_inputs, + ch_dna_processed_out, ch_purple_out, ch_bamtools_somatic_out, ref_data.genome_fasta, @@ -713,7 +709,7 @@ workflow WGTS { } else { - ch_virusinterpreter_out = ch_inputs.map { meta -> [meta, []] } + ch_virusinterpreter_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -725,7 +721,7 @@ workflow WGTS { if (run_config.stages.cuppa) { CUPPA_PREDICTION( - ch_inputs, + ch_dna_processed_out, ch_isofox_out, ch_purple_out, ch_linx_somatic_out, @@ -740,7 +736,7 @@ workflow WGTS { } else { - ch_cuppa_out = ch_inputs.map { meta -> [meta, []] } + ch_cuppa_out = ch_dna_processed_out.map { meta -> [meta, []] } } @@ -750,7 +746,7 @@ workflow WGTS { if (run_config.stages.orange) { ORANGE_REPORTING( - ch_inputs, + ch_dna_processed_out, ch_bamtools_somatic_out, ch_bamtools_germline_out, ch_flagstat_somatic_out, @@ -783,13 +779,13 @@ workflow WGTS { ch_versions = ch_versions.mix(ORANGE_REPORTING.out.versions) } - // - // MODULE: Pipeline reporting - // - CUSTOM_DUMPSOFTWAREVERSIONS( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - */ + // TODO(MC): This is failing. + // // + // // MODULE: Pipeline reporting + // // + // CUSTOM_DUMPSOFTWAREVERSIONS( + // ch_versions.unique().collectFile(name: 'collated_versions.yml') + // ) } /* From e3f8d4576521f956b86068fadaf6647b6536bcba Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 15 Feb 2024 09:39:18 +1100 Subject: [PATCH 14/86] Minor fixes and style improvements. 
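Editor's note: the `.toString()` calls added below make the FASTQ-list parsing independent of the value's runtime type. The samplesheet field is not guaranteed to arrive as a plain String (it may, for example, be a GString or a path-like object), so it is normalised before `tokenize` splits it on the ';' separator. Expected behaviour, with hypothetical file names:

    assert 'a_R1.fastq.gz;a_R2.fastq.gz'.tokenize(';') == ['a_R1.fastq.gz', 'a_R2.fastq.gz']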
--- modules/local/markdups/main.nf | 2 +- subworkflows/local/read_alignment.nf | 6 +++--- workflows/targeted.nf | 3 ++- workflows/wgts.nf | 13 ++++++------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index aaa6e301..9900288a 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -52,7 +52,7 @@ process MARKDUPS { "${task.process}": sambamba: 1.0 samtools: 1.17 - openjdk: >=8 + openjdk: 8 mark-dups: 1.1 END_VERSIONS """ diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index 47259fd1..4865ef10 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -53,7 +53,7 @@ workflow READ_ALIGNMENT { sample_key['sample_id'] = value.sample_id sample_key.remove(key) - fastq_pair_count = value[Constants.FileType.FASTQ].tokenize(';').size() / 2 + fastq_pair_count = value[Constants.FileType.FASTQ].toString().tokenize(';').size() / 2 } } @@ -71,12 +71,11 @@ workflow READ_ALIGNMENT { } def sample_id = meta[sample_key]['sample_id'] - def fastq_files = meta[sample_key][Constants.FileType.FASTQ].tokenize(';') + def fastq_files = meta[sample_key][Constants.FileType.FASTQ].toString().tokenize(';') def meta_fastq_common = [:] meta.each { key, value -> - if (key === sample_key) { return } @@ -251,6 +250,7 @@ workflow READ_ALIGNMENT { meta_bam[sample_key][Constants.FileType.BAI_MARKDUPS] = bai_files bams.each { bam -> + bam_files.add(bam[1]) bai_files.add(bam[2]) } diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 4fffbf0c..a40240af 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -102,6 +102,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft // Get absolute file paths samplesheet = Utils.getFileObject(params.input) +// TODO(MC): -panel targeted workflow TARGETED { // Create channel for versions @@ -123,7 +124,7 @@ workflow TARGETED { // Set GRIDSS config gridss_config = params.containsKey('gridss_config') ? file(params.gridss_config) : hmf_data.gridss_config - // + // // SUBWORKFLOW: Align reads // // channel: [ meta ] diff --git a/workflows/wgts.nf b/workflows/wgts.nf index ada38f53..d76774c1 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -779,13 +779,12 @@ workflow WGTS { ch_versions = ch_versions.mix(ORANGE_REPORTING.out.versions) } - // TODO(MC): This is failing. - // // - // // MODULE: Pipeline reporting - // // - // CUSTOM_DUMPSOFTWAREVERSIONS( - // ch_versions.unique().collectFile(name: 'collated_versions.yml') - // ) + // + // MODULE: Pipeline reporting + // + CUSTOM_DUMPSOFTWAREVERSIONS( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) } /* From 58ed71cee585837c053da9980b5cef4ab1d21661 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 15 Feb 2024 15:37:07 +1100 Subject: [PATCH 15/86] Adding a TODO. --- modules/local/markdups/main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 9900288a..97da2c6d 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -26,6 +26,8 @@ process MARKDUPS { // # -umi_duplex_delim _ \\ // # -umi_base_diff_stats \\ + // TODO(MC): Ref genome version. 
+ """ java \\ -Xmx${Math.round(task.memory.bytes * 0.95)} \\ From de77de8b678ea3078dcd4c5cb7fd1cca897ea410 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 05:11:34 +1100 Subject: [PATCH 16/86] Add has_umis switch to markdups. --- modules/local/markdups/main.nf | 20 +++++++++----------- subworkflows/local/read_processing.nf | 1 + 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 97da2c6d..56da8954 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -9,6 +9,7 @@ process MARKDUPS { path genome_fai path genome_dict path unmap_regions + val has_umis output: tuple val(meta_bam), path('*bam'), path('*bai'), emit: bam @@ -19,12 +20,7 @@ process MARKDUPS { task.ext.when == null || task.ext.when script: - // # TODO(MC): Umi flags - // # -multi_bam \\ - // # -umi_enabled \\ - // # -umi_duplex \\ - // # -umi_duplex_delim _ \\ - // # -umi_base_diff_stats \\ + def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim _ -umi_base_diff_stats' : '' // TODO(MC): Ref genome version. @@ -40,6 +36,8 @@ process MARKDUPS { -input_bam ${bams.join(',')} \\ \\ -form_consensus \\ + -multi_bam \\ + ${umi_flags} \\ \\ -unmap_regions ${unmap_regions} \\ -ref_genome ${genome_fasta} \\ @@ -60,16 +58,16 @@ process MARKDUPS { """ stub: + def umi_output_files = has_umis ? 'touch ${meta_bam.sample_id}.umi_coord_freq.tsv;' + + ' touch ${meta_bam.sample_id}.umi_edit_distance.tsv;' + + ' touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv' : '' + """ touch ${meta_bam.sample_id}.mark_dups.bam touch ${meta_bam.sample_id}.mark_dups.bam.bai touch ${meta_bam.sample_id}.duplicate_freq.tsv + ${umi_output_files} echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ - - // # TODO(MC): UMIs. - // # touch ${meta_bam.sample_id}.umi_coord_freq.tsv - // # touch ${meta_bam.sample_id}.umi_edit_distance.tsv - // # touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv } diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index 53815d86..060d0e45 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -67,6 +67,7 @@ workflow READ_PROCESSING { genome_fai, genome_dict, unmap_regions, + false, ) ch_versions = ch_versions.mix(MARKDUPS.out.versions) From 01ad68d713e2da1801bc0d1c650a0e4e718c012e Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 08:00:40 +1100 Subject: [PATCH 17/86] Force symlink overwrite so process does not fail on resume. --- modules/local/bwa/mem2/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index b77606f8..f56da635 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -25,7 +25,7 @@ process BWA_MEM2 { def read_group_tag = "@RG\t${meta.read_group}" """ - ln -s \$(find -L ${genome_bwa_index} -type f) ./ + ln -fs \$(find -L ${genome_bwa_index} -type f) ./ bwa-mem2 mem \\ -Y \\ From df6e65a378fd1fc3721a5ad99ff0a071a3bee290 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 08:18:44 +1100 Subject: [PATCH 18/86] Change name of output bam from markdups. 
--- modules/local/markdups/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 56da8954..1c2ceafd 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -46,7 +46,7 @@ process MARKDUPS { -write_stats \\ -threads 16 \\ \\ - -output_bam ${meta_bam.sample_id}.mark_dups.bam + -output_bam ${meta_bam.sample_id}.markdups.bam cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -63,8 +63,8 @@ process MARKDUPS { ' touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv' : '' """ - touch ${meta_bam.sample_id}.mark_dups.bam - touch ${meta_bam.sample_id}.mark_dups.bam.bai + touch ${meta_bam.sample_id}.markdups.bam + touch ${meta_bam.sample_id}.markdups.bam.bai touch ${meta_bam.sample_id}.duplicate_freq.tsv ${umi_output_files} From a4a95b1526db56549d70659b34573c176ecd68c9 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 11:24:33 +1100 Subject: [PATCH 19/86] Fix read group arg to bwa mem2. --- modules/local/bwa/mem2/main.nf | 3 ++- modules/local/fastp/main.nf | 7 +------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index f56da635..28da8562 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -22,7 +22,8 @@ process BWA_MEM2 { task.ext.when == null || task.ext.when script: - def read_group_tag = "@RG\t${meta.read_group}" + // TODO(MC): Double check this with Charles. + def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" """ ln -fs \$(find -L ${genome_bwa_index} -type f) ./ diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 0205452c..fdbbb2d6 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -15,14 +15,9 @@ process FASTP { task.ext.when == null || task.ext.when script: - // TODO(MC): UMI flags - // --umi \\ - // --umi_loc per_read \\ - // --umi_len 7 \\ - // --umi_skip 1 \\ - """ # * do not apply trimming/clipping, already done in BCL convert + # * do not process umis, already done for us fastp \\ --in1 ${reads_fwd} \\ From 61aa1b18da08dbaf8e3334b03daa395c68aebd5b Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 11:43:12 +1100 Subject: [PATCH 20/86] Add TODO. --- workflows/wgts.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index d76774c1..fa3ae8a4 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -117,6 +117,8 @@ samplesheet = Utils.getFileObject(params.input) // - Ignore this warning: params.schema_ignore_params = "max_fastq_records,refdata_unmap_regions" // TODO(MC): get error logs for amber, cobalt, and gripss // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. +// TODO(MC): Drop commit 'WIP: Turn FASTP back on by default'. +// TODO(MC): Check on bwa vs bwa-mem2 differences. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From a1dde8e9052a2e56ee1dc19aaf903a01ee5e5c86 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 16:33:19 +1100 Subject: [PATCH 21/86] Fix markdups umi flags for TSO500 panel samples. 
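Note: the read-group fix in patch 19 above is largely an escaping detail. Inside the Groovy GString, \\t yields the two literal characters backslash-t, which bwa-mem2's -R option itself expands to a tab when writing the SAM header, while the ${...} interpolations fill in the ID and SM fields. With illustrative values:

    def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}"
    // shell receives:  -R '@RG\tID:HWNNNDSXX_L001_001\tSM:PATIENT1-T'
    // SAM header line: @RG <TAB> ID:HWNNNDSXX_L001_001 <TAB> SM:PATIENT1-T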
--- modules/local/markdups/main.nf | 2 +- workflows/wgts.nf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 1c2ceafd..a2727f81 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -20,7 +20,7 @@ process MARKDUPS { task.ext.when == null || task.ext.when script: - def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim _ -umi_base_diff_stats' : '' + def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim +' : '' // TODO(MC): Ref genome version. diff --git a/workflows/wgts.nf b/workflows/wgts.nf index fa3ae8a4..5d2ffef3 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -119,6 +119,7 @@ samplesheet = Utils.getFileObject(params.input) // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. // TODO(MC): Drop commit 'WIP: Turn FASTP back on by default'. // TODO(MC): Check on bwa vs bwa-mem2 differences. +// TODO(MC): Run all TSO500 test. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From 8f06a9fd8a4b95bbcab39bc68a647877ed86601d Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 17:29:01 +1100 Subject: [PATCH 22/86] Add TODO. --- workflows/wgts.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 5d2ffef3..1d5ea3c7 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -120,6 +120,7 @@ samplesheet = Utils.getFileObject(params.input) // TODO(MC): Drop commit 'WIP: Turn FASTP back on by default'. // TODO(MC): Check on bwa vs bwa-mem2 differences. // TODO(MC): Run all TSO500 test. +// TODO(MC): Go from .fastq.gz. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From a2330b2dd73b59f7c59943e2baddc16a7ea4a651 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Mon, 19 Feb 2024 19:27:09 +1100 Subject: [PATCH 23/86] Fix read group extraction from fastq filenames and a bug in the markdups stub. --- lib/Utils.groovy | 11 +++++++---- modules/local/markdups/main.nf | 6 +++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 58c412ca..e40711e6 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -488,10 +488,13 @@ class Utils { static public readGroupFromFastqPath(fastq_path) { def base_name = fastq_path.split('/')[-1] - def pattern = /^(.+)_\d+\.fastq$/ - def matcher = base_name =~ pattern - assert matcher.find() - return matcher[0][1] + def components = base_name.split('_') + def read_group_components = [] + for (def i = 0; i < components.size() - 2; ++i) { + read_group_components.add(components[i]) + } + + return read_group_components.join('_') } } diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index a2727f81..e060d18c 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -58,9 +58,9 @@ process MARKDUPS { """ stub: - def umi_output_files = has_umis ? 'touch ${meta_bam.sample_id}.umi_coord_freq.tsv;' + - ' touch ${meta_bam.sample_id}.umi_edit_distance.tsv;' + - ' touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv' : '' + def umi_output_files = has_umis ? 
"touch ${meta_bam.sample_id}.umi_coord_freq.tsv;" + + " touch ${meta_bam.sample_id}.umi_edit_distance.tsv;" + + " touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv" : '' """ touch ${meta_bam.sample_id}.markdups.bam From 36c96fcc244c4247fad03ed1b87d2d18b36ff7f6 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 08:56:54 +1100 Subject: [PATCH 24/86] Running with umis for targeted and without for wgts. --- subworkflows/local/read_processing.nf | 3 ++- workflows/targeted.nf | 1 + workflows/wgts.nf | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index 060d0e45..11017a28 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -10,6 +10,7 @@ workflow READ_PROCESSING { genome_fai genome_dict unmap_regions + has_umis main: // Channel for version.yml files @@ -67,7 +68,7 @@ workflow READ_PROCESSING { genome_fai, genome_dict, unmap_regions, - false, + has_umis, ) ch_versions = ch_versions.mix(MARKDUPS.out.versions) diff --git a/workflows/targeted.nf b/workflows/targeted.nf index a40240af..2234f23a 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -172,6 +172,7 @@ workflow TARGETED { ref_data.genome_fai, ref_data.genome_dict, file(params.refdata_unmap_regions), + true, ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 1d5ea3c7..5e984da4 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -188,6 +188,7 @@ workflow WGTS { ref_data.genome_fai, ref_data.genome_dict, file(params.refdata_unmap_regions), + false, ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) From c8cec7532028dc7fa936a988de9f86143f8d83d4 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 09:13:52 +1100 Subject: [PATCH 25/86] Fix includes in targeted.nf. --- workflows/targeted.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 2234f23a..84d1a770 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -79,6 +79,8 @@ include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting' include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation' include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference' include { PURPLE_CALLING } from '../subworkflows/local/purple_calling' +include { READ_ALIGNMENT } from '../subworkflows/local/read_alignment' +include { READ_PROCESSING } from '../subworkflows/local/read_processing' include { SAGE_APPEND } from '../subworkflows/local/sage_append' include { SAGE_CALLING } from '../subworkflows/local/sage_calling' From b45b43a0dce703fc50fe949027c7b10e0bf4dbac Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 09:25:05 +1100 Subject: [PATCH 26/86] Only run markdups with UMIs when tso500 panel is selected. 
--- workflows/targeted.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 84d1a770..0eb935ed 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -104,7 +104,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft // Get absolute file paths samplesheet = Utils.getFileObject(params.input) -// TODO(MC): -panel targeted workflow TARGETED { // Create channel for versions @@ -166,6 +165,8 @@ workflow TARGETED { // TODO(SW): set up correctly if (true || run_config.stages.markdups) { + has_umis = params.panel.equalsIgnoreCase('tso500') + READ_PROCESSING( ch_inputs, ch_dna_alignment_out, @@ -174,7 +175,7 @@ workflow TARGETED { ref_data.genome_fai, ref_data.genome_dict, file(params.refdata_unmap_regions), - true, + has_umis, ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) From 247922958c2604941ae4fadcbfe51937b4ff46ea Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 09:40:04 +1100 Subject: [PATCH 27/86] Create switch between bwa mem and bwa mem2 for debugging purposes. --- modules/local/bwa/mem/Dockerfile | 9 ++++ modules/local/bwa/mem/main.nf | 60 +++++++++++++++++++++++++ modules/local/bwa/{ => mem2}/Dockerfile | 0 subworkflows/local/read_alignment.nf | 32 +++++++++---- temp/genomes_GRCh37_hmf.config | 3 +- workflows/targeted.nf | 1 + workflows/wgts.nf | 1 + 7 files changed, 97 insertions(+), 9 deletions(-) create mode 100644 modules/local/bwa/mem/Dockerfile create mode 100644 modules/local/bwa/mem/main.nf rename modules/local/bwa/{ => mem2}/Dockerfile (100%) diff --git a/modules/local/bwa/mem/Dockerfile b/modules/local/bwa/mem/Dockerfile new file mode 100644 index 00000000..2172ebc4 --- /dev/null +++ b/modules/local/bwa/mem/Dockerfile @@ -0,0 +1,9 @@ +FROM docker.io/continuumio/miniconda3:23.10.0-1 + +RUN \ + conda install -y -n base conda-libmamba-solver && \ + conda config --set solver libmamba && \ + conda install -y -c bioconda -c conda-forge -c conda \ + 'bwa==0.7.17' \ + 'sambamba==1.0' && \ + conda clean -yaf diff --git a/modules/local/bwa/mem/main.nf b/modules/local/bwa/mem/main.nf new file mode 100644 index 00000000..554251ca --- /dev/null +++ b/modules/local/bwa/mem/main.nf @@ -0,0 +1,60 @@ +process BWA_MEM { + tag "${meta.subject_id}__${meta.sample_id}" + label 'process_high' + + container 'docker.io/scwatts/bwa:0.7.17-sambamba' + + input: + tuple val(meta), path(reads_fwd), path(reads_rev) + path genome_fasta + path genome_bwa_index + + output: + tuple val(meta), path('*.bam'), emit: bam + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // TODO(MC): Double check this with Charles. + def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" + + // TODO(MC): Fix versions. 
+ """ + ln -fs \$(find -L ${genome_bwa_index} -type f) ./ + + bwa mem \\ + -Y \\ + -R '${read_group_tag}' \\ + -t ${task.cpus} \\ + ${genome_fasta} \\ + ${reads_fwd} \\ + ${reads_rev} | \\ + \\ + sambamba view \\ + --sam-input \\ + --format bam \\ + --compression-level 0 \\ + --nthreads ${task.cpus} \\ + /dev/stdin | \\ + \\ + sambamba sort \\ + --nthreads ${task.cpus} \\ + --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ + /dev/stdin + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: 2.2.1 + sambamba: 1.0 + END_VERSIONS + """ + + stub: + """ + touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam + + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/modules/local/bwa/Dockerfile b/modules/local/bwa/mem2/Dockerfile similarity index 100% rename from modules/local/bwa/Dockerfile rename to modules/local/bwa/mem2/Dockerfile diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index 4865ef10..3e2281bb 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -1,3 +1,4 @@ +include { BWA_MEM } from '../../modules/local/bwa/mem/main' include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' include { FASTP } from '../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' @@ -10,6 +11,7 @@ workflow READ_ALIGNMENT { genome_fasta genome_bwa_index max_fastq_records + use_mem2 main: // Channel for version.yml files @@ -40,7 +42,7 @@ workflow READ_ALIGNMENT { // TODO(SW): implement outputs ch_star_outputs = Channel.empty() - // BWA MEM2 + // BWA MEM/MEM2 // channel: [ sample_key, fastq_pair_count ] ch_sample_fastq_pair_count = ch_meta_samples_sorted.runnable_fastq.map { meta_sample -> @@ -190,17 +192,31 @@ workflow READ_ALIGNMENT { } // channel: [ meta_fastq, bam ] - BWA_MEM2( - ch_bwa_mem_inputs, - genome_fasta, - genome_bwa_index, - ) + ch_alignment_output = Channel.empty() + if (use_mem2) { + BWA_MEM2( + ch_bwa_mem_inputs, + genome_fasta, + genome_bwa_index, + ) + + ch_alignment_output = BWA_MEM2.out.bam + ch_versions = ch_versions.mix(BWA_MEM2.out.versions) + } + else { + BWA_MEM( + ch_bwa_mem_inputs, + genome_fasta, + genome_bwa_index, + ) - ch_versions = ch_versions.mix(BWA_MEM2.out.versions) + ch_alignment_output = BWA_MEM.out.bam + ch_versions = ch_versions.mix(BWA_MEM.out.versions) + } // channel: [ meta_fastq, bam, bai ] SAMBAMBA_INDEX( - BWA_MEM2.out.bam, + ch_alignment_output, ) ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) diff --git a/temp/genomes_GRCh37_hmf.config b/temp/genomes_GRCh37_hmf.config index 480613a1..4249b1f0 100644 --- a/temp/genomes_GRCh37_hmf.config +++ b/temp/genomes_GRCh37_hmf.config @@ -13,5 +13,6 @@ params { ref_data_hmf_data_path = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/hmftools/5.34_37--0" ref_data_virusbreakenddb_path = "/Users/matthewcooper/projects/oncoanalyser/virusbreakend/virusbreakenddb_20210401" refdata_unmap_regions = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv" - max_fastq_records = 10000000 + max_fastq_records = 0 + use_mem2 = true } diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 0eb935ed..de85c31c 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -140,6 +140,7 @@ workflow TARGETED { ref_data.genome_fasta, ref_data.genome_bwa_index, params.max_fastq_records, + params.use_mem2, ) ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) 
diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 5e984da4..00da6520 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -155,6 +155,7 @@ workflow WGTS { ref_data.genome_fasta, ref_data.genome_bwa_index, params.max_fastq_records, + params.use_mem2, ) ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) From 3b832650f08e0553661a7e81f2a13bc8a469cdf6 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 16:19:16 +1100 Subject: [PATCH 28/86] Move new params into nextflow.config. --- nextflow.config | 4 ++++ nextflow_schema.json | 10 ++++++++++ temp/genomes_GRCh37_hmf.config | 2 -- workflows/targeted.nf | 2 +- workflows/wgts.nf | 4 ---- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/nextflow.config b/nextflow.config index 908ebb92..9bcf70e6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,10 @@ params { // Workflow mode mode = null + // Alignment options + max_fastq_records = 0 + use_mem2 = true + // Isofox user input files isofox_counts = null isofox_gc_ratios = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 835ff2e3..4c852122 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -82,6 +82,16 @@ "default": false, "fa_icon": "fas fa-diagram-project" }, + "max_fastq_records": { + "type": "integer", + "description": "When positive, will use fastp to split fastq files so that each resultant fastq file has no more than max_fastq_records records. When nonpositive, fastp is not used and the provided fastq files are passed as-is to the aligner.", + "fa_icon": "fas fa-cog" + }, + "use_mem2": { + "type": "boolean", + "description": "When set uses bwa-mem2 for alignment, otherwise uses bwa mem for alignment.", + "fa_icon": "fas fa-cog" + }, "gridss_config": { "type": "string", "description": "Path to GRIDSS configuration file.", diff --git a/temp/genomes_GRCh37_hmf.config b/temp/genomes_GRCh37_hmf.config index 4249b1f0..2eef3d1d 100644 --- a/temp/genomes_GRCh37_hmf.config +++ b/temp/genomes_GRCh37_hmf.config @@ -13,6 +13,4 @@ params { ref_data_hmf_data_path = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/hmftools/5.34_37--0" ref_data_virusbreakenddb_path = "/Users/matthewcooper/projects/oncoanalyser/virusbreakend/virusbreakenddb_20210401" refdata_unmap_regions = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv" - max_fastq_records = 0 - use_mem2 = true } diff --git a/workflows/targeted.nf b/workflows/targeted.nf index de85c31c..ba97bfdc 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -166,7 +166,7 @@ workflow TARGETED { // TODO(SW): set up correctly if (true || run_config.stages.markdups) { - has_umis = params.panel.equalsIgnoreCase('tso500') + has_umis = run_config.panel.equalsIgnoreCase('tso500') READ_PROCESSING( ch_inputs, diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 00da6520..69736dc6 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -111,10 +111,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft samplesheet = Utils.getFileObject(params.input) // TODO(MC): New params, and resource files, documentation and proper placement. 
-// TODO(MC): WARN: Found unexpected parameters: -// * --max_fastq_records: 10000000 -// * --refdata_unmap_regions: /Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv -// - Ignore this warning: params.schema_ignore_params = "max_fastq_records,refdata_unmap_regions" // TODO(MC): get error logs for amber, cobalt, and gripss // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. // TODO(MC): Drop commit 'WIP: Turn FASTP back on by default'. From 00cfe507210fae117daa57b53eec04ddaaca00b9 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 15:45:03 +1100 Subject: [PATCH 29/86] Add label to markdups process. --- modules/local/markdups/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index e060d18c..dfe8c3c8 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,5 +1,6 @@ process MARKDUPS { tag "${meta_bam.subject_id}__${meta_bam.sample_id}" + label 'process_medium' container 'docker.io/scwatts/markdups:1.1.rc1' From 00bf6eed36a6a11e0f027ff947e3b4c07bee9648 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 20 Feb 2024 15:34:34 +1100 Subject: [PATCH 30/86] Add TODO. --- workflows/wgts.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 69736dc6..76cda24b 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -117,6 +117,7 @@ samplesheet = Utils.getFileObject(params.input) // TODO(MC): Check on bwa vs bwa-mem2 differences. // TODO(MC): Run all TSO500 test. // TODO(MC): Go from .fastq.gz. +// TODO(MC): Unmap region file location. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From ba241d1daae37018d4270cac28693f1445448a7f Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 21 Feb 2024 14:22:39 +1100 Subject: [PATCH 31/86] Disable all filtering and poly-g trimming in fastp. --- modules/local/fastp/main.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index fdbbb2d6..09bd6e0d 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -17,12 +17,16 @@ process FASTP { script: """ # * do not apply trimming/clipping, already done in BCL convert + # * turn off all filtering # * do not process umis, already done for us fastp \\ --in1 ${reads_fwd} \\ --in2 ${reads_rev} \\ + --disable_quality_filtering \\ + --disable_length_filtering \\ --disable_adapter_trimming \\ + --disable_trim_poly_g \\ --split_by_lines ${4 * max_fastq_records} \\ --out1 ${meta.sample_id}_${meta.read_group}_R1.fastp.fastq \\ --out2 ${meta.sample_id}_${meta.read_group}_R2.fastp.fastq From 3c1815439af283b9cf4533708765516cbca62d1f Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 21 Feb 2024 14:04:59 +1100 Subject: [PATCH 32/86] Use fastp by default. --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 9bcf70e6..a8c892a6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,7 +16,7 @@ params { mode = null // Alignment options - max_fastq_records = 0 + max_fastq_records = 10000000 use_mem2 = true // Isofox user input files From 6b4f80512bfd2d01cafcbdad55e5d445d0c40a09 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 21 Feb 2024 16:05:18 +1100 Subject: [PATCH 33/86] Remove bwa mem process. 
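Note: the fastp invocation in patch 31 above splits on lines rather than records, and FASTQ stores exactly four lines per record, hence the --split_by_lines ${4 * max_fastq_records} arithmetic. With the default restored by patch 32, each emitted chunk holds at most forty million lines, i.e. ten million reads per R1/R2 file:

    def max_fastq_records = 10000000                // default set in nextflow.config
    def split_by_lines    = 4 * max_fastq_records   // 40,000,000 lines per chunk
    // each <sample>_<read_group>_R{1,2}.fastp.fastq then carries <= 10M records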
--- modules/local/bwa/{mem2 => }/Dockerfile | 0 modules/local/bwa/mem/Dockerfile | 9 ---- modules/local/bwa/mem/main.nf | 60 ------------------------- nextflow.config | 1 - nextflow_schema.json | 5 --- subworkflows/local/read_alignment.nf | 32 ++++--------- workflows/targeted.nf | 1 - workflows/wgts.nf | 1 - 8 files changed, 8 insertions(+), 101 deletions(-) rename modules/local/bwa/{mem2 => }/Dockerfile (100%) delete mode 100644 modules/local/bwa/mem/Dockerfile delete mode 100644 modules/local/bwa/mem/main.nf diff --git a/modules/local/bwa/mem2/Dockerfile b/modules/local/bwa/Dockerfile similarity index 100% rename from modules/local/bwa/mem2/Dockerfile rename to modules/local/bwa/Dockerfile diff --git a/modules/local/bwa/mem/Dockerfile b/modules/local/bwa/mem/Dockerfile deleted file mode 100644 index 2172ebc4..00000000 --- a/modules/local/bwa/mem/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM docker.io/continuumio/miniconda3:23.10.0-1 - -RUN \ - conda install -y -n base conda-libmamba-solver && \ - conda config --set solver libmamba && \ - conda install -y -c bioconda -c conda-forge -c conda \ - 'bwa==0.7.17' \ - 'sambamba==1.0' && \ - conda clean -yaf diff --git a/modules/local/bwa/mem/main.nf b/modules/local/bwa/mem/main.nf deleted file mode 100644 index 554251ca..00000000 --- a/modules/local/bwa/mem/main.nf +++ /dev/null @@ -1,60 +0,0 @@ -process BWA_MEM { - tag "${meta.subject_id}__${meta.sample_id}" - label 'process_high' - - container 'docker.io/scwatts/bwa:0.7.17-sambamba' - - input: - tuple val(meta), path(reads_fwd), path(reads_rev) - path genome_fasta - path genome_bwa_index - - output: - tuple val(meta), path('*.bam'), emit: bam - path 'versions.yml' , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - // TODO(MC): Double check this with Charles. - def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" - - // TODO(MC): Fix versions. - """ - ln -fs \$(find -L ${genome_bwa_index} -type f) ./ - - bwa mem \\ - -Y \\ - -R '${read_group_tag}' \\ - -t ${task.cpus} \\ - ${genome_fasta} \\ - ${reads_fwd} \\ - ${reads_rev} | \\ - \\ - sambamba view \\ - --sam-input \\ - --format bam \\ - --compression-level 0 \\ - --nthreads ${task.cpus} \\ - /dev/stdin | \\ - \\ - sambamba sort \\ - --nthreads ${task.cpus} \\ - --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ - /dev/stdin - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - bwamem2: 2.2.1 - sambamba: 1.0 - END_VERSIONS - """ - - stub: - """ - touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam - - echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml - """ -} diff --git a/nextflow.config b/nextflow.config index a8c892a6..8525b232 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,6 @@ params { // Alignment options max_fastq_records = 10000000 - use_mem2 = true // Isofox user input files isofox_counts = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 4c852122..32e470a8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -87,11 +87,6 @@ "description": "When positive, will use fastp to split fastq files so that each resultant fastq file has no more than max_fastq_records records. 
When nonpositive, fastp is not used and the provided fastq files are passed as-is to the aligner.", "fa_icon": "fas fa-cog" }, - "use_mem2": { - "type": "boolean", - "description": "When set uses bwa-mem2 for alignment, otherwise uses bwa mem for alignment.", - "fa_icon": "fas fa-cog" - }, "gridss_config": { "type": "string", "description": "Path to GRIDSS configuration file.", diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index 3e2281bb..4865ef10 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -1,4 +1,3 @@ -include { BWA_MEM } from '../../modules/local/bwa/mem/main' include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' include { FASTP } from '../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' @@ -11,7 +10,6 @@ workflow READ_ALIGNMENT { genome_fasta genome_bwa_index max_fastq_records - use_mem2 main: // Channel for version.yml files @@ -42,7 +40,7 @@ workflow READ_ALIGNMENT { // TODO(SW): implement outputs ch_star_outputs = Channel.empty() - // BWA MEM/MEM2 + // BWA MEM2 // channel: [ sample_key, fastq_pair_count ] ch_sample_fastq_pair_count = ch_meta_samples_sorted.runnable_fastq.map { meta_sample -> @@ -192,31 +190,17 @@ workflow READ_ALIGNMENT { } // channel: [ meta_fastq, bam ] - ch_alignment_output = Channel.empty() - if (use_mem2) { - BWA_MEM2( - ch_bwa_mem_inputs, - genome_fasta, - genome_bwa_index, - ) - - ch_alignment_output = BWA_MEM2.out.bam - ch_versions = ch_versions.mix(BWA_MEM2.out.versions) - } - else { - BWA_MEM( - ch_bwa_mem_inputs, - genome_fasta, - genome_bwa_index, - ) + BWA_MEM2( + ch_bwa_mem_inputs, + genome_fasta, + genome_bwa_index, + ) - ch_alignment_output = BWA_MEM.out.bam - ch_versions = ch_versions.mix(BWA_MEM.out.versions) - } + ch_versions = ch_versions.mix(BWA_MEM2.out.versions) // channel: [ meta_fastq, bam, bai ] SAMBAMBA_INDEX( - ch_alignment_output, + BWA_MEM2.out.bam, ) ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) diff --git a/workflows/targeted.nf b/workflows/targeted.nf index ba97bfdc..e6c60379 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -140,7 +140,6 @@ workflow TARGETED { ref_data.genome_fasta, ref_data.genome_bwa_index, params.max_fastq_records, - params.use_mem2, ) ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 76cda24b..0aea0f8d 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -152,7 +152,6 @@ workflow WGTS { ref_data.genome_fasta, ref_data.genome_bwa_index, params.max_fastq_records, - params.use_mem2, ) ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) From c132e2b29473c8d3a3344b406044fe25a8d12e5f Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Wed, 21 Feb 2024 16:42:49 +1100 Subject: [PATCH 34/86] Remove obsolete TODOs. --- workflows/wgts.nf | 5 ----- 1 file changed, 5 deletions(-) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 0aea0f8d..6588c0f8 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -110,12 +110,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft // Get absolute file paths samplesheet = Utils.getFileObject(params.input) -// TODO(MC): New params, and resource files, documentation and proper placement. -// TODO(MC): get error logs for amber, cobalt, and gripss // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. -// TODO(MC): Drop commit 'WIP: Turn FASTP back on by default'. 
-// TODO(MC): Check on bwa vs bwa-mem2 differences. -// TODO(MC): Run all TSO500 test. // TODO(MC): Go from .fastq.gz. // TODO(MC): Unmap region file location. workflow WGTS { From ec992f1d35bf8561521e6333fcaf5667eec42cd6 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 22 Feb 2024 10:07:24 +1100 Subject: [PATCH 35/86] Pass ref genome version to markdups. --- modules/local/markdups/main.nf | 5 ++--- subworkflows/local/read_processing.nf | 2 ++ workflows/targeted.nf | 1 + workflows/wgts.nf | 1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index dfe8c3c8..255da3db 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -6,6 +6,7 @@ process MARKDUPS { input: tuple val(meta_bam), path(bams), path(bais) + val genome_ver path genome_fasta path genome_fai path genome_dict @@ -23,8 +24,6 @@ process MARKDUPS { script: def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim +' : '' - // TODO(MC): Ref genome version. - """ java \\ -Xmx${Math.round(task.memory.bytes * 0.95)} \\ @@ -42,7 +41,7 @@ process MARKDUPS { \\ -unmap_regions ${unmap_regions} \\ -ref_genome ${genome_fasta} \\ - -ref_genome_version 37 \\ + -ref_genome_version ${genome_ver} \\ \\ -write_stats \\ -threads 16 \\ diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index 11017a28..969f89d5 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -6,6 +6,7 @@ workflow READ_PROCESSING { ch_inputs // channel: [mandatory] [ meta ] ch_dna_bams // channel: [mandatory] [ meta, bam_dna ] ch_rna_bams // channel: [mandatory] [ meta, bam_rna ] + genome_ver genome_fasta genome_fai genome_dict @@ -64,6 +65,7 @@ workflow READ_PROCESSING { // channel: [ meta_bam, bam, bai ] MARKDUPS( ch_markdups_inputs, + genome_ver, genome_fasta, genome_fai, genome_dict, diff --git a/workflows/targeted.nf b/workflows/targeted.nf index e6c60379..13170ba1 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -171,6 +171,7 @@ workflow TARGETED { ch_inputs, ch_dna_alignment_out, ch_rna_alignment_out, + ref_data.genome_version, ref_data.genome_fasta, ref_data.genome_fai, ref_data.genome_dict, diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 6588c0f8..f5f600ff 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -176,6 +176,7 @@ workflow WGTS { ch_inputs, ch_dna_alignment_out, ch_rna_alignment_out, + ref_data.genome_version, ref_data.genome_fasta, ref_data.genome_fai, ref_data.genome_dict, From f7d3e5cf23700f4c9f76cf10828d0ea17b50f959 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 22 Feb 2024 10:36:27 +1100 Subject: [PATCH 36/86] Add TODO. --- workflows/wgts.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index f5f600ff..41199858 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -113,6 +113,7 @@ samplesheet = Utils.getFileObject(params.input) // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. // TODO(MC): Go from .fastq.gz. // TODO(MC): Unmap region file location. +// TODO(MC): Fix warnings. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From a01df622f1a18b3f81697f2d215c9f7820909f12 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 22 Feb 2024 11:30:26 +1100 Subject: [PATCH 37/86] Get versions from CLIs. 
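Note: patch 35 above replaces the hardcoded -ref_genome_version 37 by threading the configured version down from the workflow; hmftools components conventionally accept 37 or 38 here. The value travels workflow -> subworkflow take: -> process val input. A trimmed sketch of the plumbing, with non-essential inputs elided (the real subworkflow passes the full set of reference files shown in the patch):

    workflow READ_PROCESSING {
        take:
        ch_dna_bams  // channel: [ meta, bam_dna ]
        genome_ver   // value: 37 or 38

        main:
        MARKDUPS(ch_dna_bams, genome_ver /* , ...reference files as above */)
    }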
--- modules/local/bwa/mem2/main.nf | 5 +++-- modules/local/fastp/main.nf | 2 +- modules/local/markdups/main.nf | 9 +++++---- modules/local/sambamba/index/main.nf | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index 28da8562..4072a54b 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -25,6 +25,7 @@ process BWA_MEM2 { // TODO(MC): Double check this with Charles. def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" + // NOTE(MC): Hardcoding bwa-mem2 version since the CLI does not have a --version flag. """ ln -fs \$(find -L ${genome_bwa_index} -type f) ./ @@ -50,8 +51,8 @@ process BWA_MEM2 { cat <<-END_VERSIONS > versions.yml "${task.process}": - bwamem2: 2.2.1 - sambamba: 1.0 + bwa-mem2: 2.2.1 + sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') END_VERSIONS """ diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 09bd6e0d..95ec9e39 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -33,7 +33,7 @@ process FASTP { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastp: 0.23.4 + fastp: \$(fastp --version 2>&1 | sed 's/^.* //') END_VERSIONS """ diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 255da3db..e7737d65 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -24,6 +24,7 @@ process MARKDUPS { script: def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim +' : '' + // TODO(MC): Update mark-dups version command, and update to bioconda container. """ java \\ -Xmx${Math.round(task.memory.bytes * 0.95)} \\ @@ -50,10 +51,10 @@ process MARKDUPS { cat <<-END_VERSIONS > versions.yml "${task.process}": - sambamba: 1.0 - samtools: 1.17 - openjdk: 8 - mark-dups: 1.1 + mark-dups: \$(java -jar /opt/markdups/markdups.jar -version | awk '{ print \$NF }') + openjdk: \$(java --version | egrep '^OpenJDK Runtime Environment ' | sed 's/^.*build //' | sed 's/.\$//') + sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') + samtools: \$(samtools --version 2>&1 | egrep '^samtools\\s' | head -n 1 | sed 's/^.* //') END_VERSIONS """ diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index cb205904..8d54aa07 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -21,7 +21,7 @@ process SAMBAMBA_INDEX { cat <<-END_VERSIONS > versions.yml "${task.process}": - sambamba: 1.0 + sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') END_VERSIONS """ From 2c298c4db8580923835bcc24b90c34535c43598c Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 22 Feb 2024 16:52:39 +1100 Subject: [PATCH 38/86] Delete Dockerfiles and update containers to biocontainer equivalents. 
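Note: two interpolation layers are at work in the version-capture heredocs added by patch 37 above. ${task.process} is a Groovy interpolation, resolved by Nextflow when the task script is generated, while the escaped \$(...) substitutions survive into the shell and query each tool at runtime; unescaped, Groovy would try to resolve them at script-generation time instead. For instance, in the fastp module:

    // Resolved by Groovy at script-generation time:
    //   "${task.process}"  ->  "WGTS:READ_ALIGNMENT:FASTP" (for example)
    // Deferred to bash at runtime via the backslash escape:
    //   \$(fastp --version 2>&1 | sed 's/^.* //')  ->  0.23.4
    """
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        fastp: \$(fastp --version 2>&1 | sed 's/^.* //')
    END_VERSIONS
    """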
--- modules/local/bwa/Dockerfile | 9 ------ modules/local/bwa/mem2/main.nf | 5 +-- modules/local/fastp/Dockerfile | 8 ----- modules/local/fastp/main.nf | 4 ++- modules/local/markdups/Dockerfile | 12 ------- modules/local/markdups/main.nf | 48 ++++++++++++++-------------- modules/local/sambamba/Dockerfile | 8 ----- modules/local/sambamba/index/main.nf | 4 ++- 8 files changed, 33 insertions(+), 65 deletions(-) delete mode 100644 modules/local/bwa/Dockerfile delete mode 100644 modules/local/fastp/Dockerfile delete mode 100644 modules/local/markdups/Dockerfile delete mode 100644 modules/local/sambamba/Dockerfile diff --git a/modules/local/bwa/Dockerfile b/modules/local/bwa/Dockerfile deleted file mode 100644 index 0f2641da..00000000 --- a/modules/local/bwa/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM docker.io/continuumio/miniconda3:23.10.0-1 - -RUN \ - conda install -y -n base conda-libmamba-solver && \ - conda config --set solver libmamba && \ - conda install -y -c bioconda -c conda-forge -c conda \ - 'bwa-mem2==2.2.1' \ - 'sambamba==1.0' && \ - conda clean -yaf diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index 4072a54b..f43b520b 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -2,8 +2,9 @@ process BWA_MEM2 { tag "${meta.subject_id}__${meta.sample_id}" label 'process_high' - // TODO(SW): Upload container. - container 'bwa-mem2:2.2.1-sambamba' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bwa-mem2:2.2.1--hd03093a_5' : + 'quay.io/biocontainers/bwa-mem2:2.2.1--hd03093a_5' }" input: tuple val(meta), path(reads_fwd), path(reads_rev) diff --git a/modules/local/fastp/Dockerfile b/modules/local/fastp/Dockerfile deleted file mode 100644 index 5d7cdbf9..00000000 --- a/modules/local/fastp/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM docker.io/continuumio/miniconda3:23.10.0-1 - -RUN \ - conda install -y -n base conda-libmamba-solver && \ - conda config --set solver libmamba && \ - conda install -y -c bioconda -c conda-forge -c conda \ - 'fastp==0.23.4' && \ - conda clean -yaf diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 95ec9e39..86f72f92 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -1,7 +1,9 @@ process FASTP { tag "${meta.subject_id}__${meta.sample_id}" - container 'docker.io/scwatts/fastp:0.23.4' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--hadf994f_2' : + 'quay.io/biocontainers/fastp:0.23.4--hadf994f_2' }" input: tuple val(meta), path(reads_fwd), path(reads_rev) diff --git a/modules/local/markdups/Dockerfile b/modules/local/markdups/Dockerfile deleted file mode 100644 index d7232f05..00000000 --- a/modules/local/markdups/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM docker.io/continuumio/miniconda3:23.10.0-1 - -RUN \ - conda install -y -n base conda-libmamba-solver && \ - conda config --set solver libmamba && \ - conda install -y -c bioconda -c conda-forge -c conda \ - 'sambamba==1.0' 'samtools==1.17' 'openjdk >=8' && \ - conda clean -yaf - -RUN \ - mkdir -p /opt/markdups/ && \ - wget -O /opt/markdups/markdups.jar 'https://github.com/hartwigmedical/hmftools/releases/download/mark-dups-v1.1/mark-dups_v1.1.rc1.jar' diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index e7737d65..2860ab05 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -2,7 +2,9 @@ process MARKDUPS { tag "${meta_bam.subject_id}__${meta_bam.sample_id}" label 'process_medium' - container 'docker.io/scwatts/markdups:1.1.rc1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1--hdfd78af_0' : + 'quay.io/biocontainers/hmftools-mark-dups:1.1--hdfd78af_0' }" input: tuple val(meta_bam), path(bams), path(bais) @@ -24,34 +26,32 @@ process MARKDUPS { script: def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim +' : '' - // TODO(MC): Update mark-dups version command, and update to bioconda container. """ - java \\ + markdups \\ -Xmx${Math.round(task.memory.bytes * 0.95)} \\ - -jar /opt/markdups/markdups.jar \\ - \\ - -samtools \$(which samtools) \\ - -sambamba \$(which sambamba) \\ - \\ - -sample ${meta_bam.sample_id} \\ - -input_bam ${bams.join(',')} \\ - \\ - -form_consensus \\ - -multi_bam \\ - ${umi_flags} \\ - \\ - -unmap_regions ${unmap_regions} \\ - -ref_genome ${genome_fasta} \\ - -ref_genome_version ${genome_ver} \\ - \\ - -write_stats \\ - -threads 16 \\ - \\ - -output_bam ${meta_bam.sample_id}.markdups.bam + \\ + -samtools \$(which samtools) \\ + -sambamba \$(which sambamba) \\ + \\ + -sample ${meta_bam.sample_id} \\ + -input_bam ${bams.join(',')} \\ + \\ + -form_consensus \\ + -multi_bam \\ + ${umi_flags} \\ + \\ + -unmap_regions ${unmap_regions} \\ + -ref_genome ${genome_fasta} \\ + -ref_genome_version ${genome_ver} \\ + \\ + -write_stats \\ + -threads 16 \\ + \\ + -output_bam ${meta_bam.sample_id}.markdups.bam cat <<-END_VERSIONS > versions.yml "${task.process}": - mark-dups: \$(java -jar /opt/markdups/markdups.jar -version | awk '{ print \$NF }') + markdups: \$(markdups -version | awk '{ print \$NF }') openjdk: \$(java --version | egrep '^OpenJDK Runtime Environment ' | sed 's/^.*build //' | sed 's/.\$//') sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') samtools: \$(samtools --version 2>&1 | egrep '^samtools\\s' | head -n 1 | sed 's/^.* //') diff --git a/modules/local/sambamba/Dockerfile b/modules/local/sambamba/Dockerfile deleted file mode 100644 index 8c2c38c5..00000000 --- a/modules/local/sambamba/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM docker.io/continuumio/miniconda3:23.10.0-1 - -RUN \ - conda install -y -n base conda-libmamba-solver && \ - conda config --set solver libmamba && \ - conda install -y -c bioconda -c conda-forge -c conda \ - 
'sambamba==1.0' && \ - conda clean -yaf diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index 8d54aa07..f39b0ea7 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -1,7 +1,9 @@ process SAMBAMBA_INDEX { tag "${meta.subject_id}__${meta.sample_id}" - container 'docker.io/scwatts/sambamba:1.0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sambamba:1.0--h98b6b92_0' : + 'quay.io/biocontainers/sambamba:1.0--h98b6b92_0' }" input: tuple val(meta), path(bam) From 33244d90067323477ab111e13c5c18310f4e9713 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Thu, 22 Feb 2024 16:14:12 +1100 Subject: [PATCH 39/86] Removing temp config. --- temp/genomes_GRCh37_hmf.config | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 temp/genomes_GRCh37_hmf.config diff --git a/temp/genomes_GRCh37_hmf.config b/temp/genomes_GRCh37_hmf.config deleted file mode 100644 index 2eef3d1d..00000000 --- a/temp/genomes_GRCh37_hmf.config +++ /dev/null @@ -1,16 +0,0 @@ -params { - genomes { - 'GRCh37_hmf' { - fasta = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/Homo_sapiens.GRCh37.GATK.illumina.fasta" - fai = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.fai" - dict = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.dict" - bwa_index = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/bwa_index/0.7.17-r1188" - bwa_index_image = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/bwa_index_image/0.7.17-r1188/Homo_sapiens.GRCh37.GATK.illumina.fasta.img" - gridss_index = "/Users/matthewcooper/projects/oncoanalyser/genomes/GRCh37_hmf/gridss_index/2.13.2/Homo_sapiens.GRCh37.GATK.illumina.fasta.gridsscache" - } - } - - ref_data_hmf_data_path = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/hmftools/5.34_37--0" - ref_data_virusbreakenddb_path = "/Users/matthewcooper/projects/oncoanalyser/virusbreakend/virusbreakenddb_20210401" - refdata_unmap_regions = "/Users/matthewcooper/projects/oncoanalyser/hmf_reference_data/markdups/unmap_regions_37.tsv" -} From 8d82bacfc5f69a8d4a98017fd840cdc3221129c2 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Fri, 23 Feb 2024 09:54:55 +1100 Subject: [PATCH 40/86] Fix process tags. --- modules/local/bwa/mem2/main.nf | 2 +- modules/local/fastp/main.nf | 2 +- modules/local/markdups/main.nf | 2 +- modules/local/sambamba/index/main.nf | 2 +- subworkflows/local/read_alignment.nf | 4 +++- subworkflows/local/read_processing.nf | 3 +++ 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index f43b520b..f9c0151f 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -1,5 +1,5 @@ process BWA_MEM2 { - tag "${meta.subject_id}__${meta.sample_id}" + tag "${meta.id}" label 'process_high' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 86f72f92..be254c16 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -1,5 +1,5 @@ process FASTP { - tag "${meta.subject_id}__${meta.sample_id}" + tag "${meta.id}" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--hadf994f_2' : diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 2860ab05..1a98731a 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,5 +1,5 @@ process MARKDUPS { - tag "${meta_bam.subject_id}__${meta_bam.sample_id}" + tag "${meta_bam.id}" label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index f39b0ea7..0f620584 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -1,5 +1,5 @@ process SAMBAMBA_INDEX { - tag "${meta.subject_id}__${meta.sample_id}" + tag "${meta.id}" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/sambamba:1.0--h98b6b92_0' : diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index 4865ef10..3724345a 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -73,7 +73,7 @@ workflow READ_ALIGNMENT { def sample_id = meta[sample_key]['sample_id'] def fastq_files = meta[sample_key][Constants.FileType.FASTQ].toString().tokenize(';') - def meta_fastq_common = [:] + def meta_fastq_common = [id: "${meta.group_id}__${sample_id}"] meta.each { key, value -> if (key === sample_key) { @@ -122,6 +122,7 @@ workflow READ_ALIGNMENT { def meta_sample = split_fastq_pairs[0] def sample_key = Utils.shallow_copy(meta_sample) + sample_key.remove('id') sample_key.remove('read_group') sample_key.remove(meta_sample.sample_key) @@ -214,6 +215,7 @@ workflow READ_ALIGNMENT { def meta_bam = bam[0] def sample_key = Utils.shallow_copy(meta_bam) + sample_key.remove('id') sample_key.remove(meta_bam.sample_key) sample_key.remove('read_group') sample_key.remove('split') diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index 969f89d5..bdd71a5a 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -51,6 +51,8 @@ workflow READ_PROCESSING { } } + meta_bam['id'] = "${meta_bam.group_id}__${meta_bam.sample_id}" + if (!(bams instanceof Collection)) { bams = [bams] } @@ -82,6 +84,7 @@ workflow READ_PROCESSING { def meta_bam = bam[0] def meta = Utils.shallow_copy(meta_bam) + meta.remove('id') meta.remove('sample_id') meta.remove('sample_key') From be271bbe3c3c4f1df7d90313a1f30923e958f62f Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Fri, 23 Feb 2024 10:15:11 +1100 Subject: [PATCH 41/86] Reassign TODO. --- workflows/wgts.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 41199858..42fc0ebb 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -112,7 +112,7 @@ samplesheet = Utils.getFileObject(params.input) // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. // TODO(MC): Go from .fastq.gz. -// TODO(MC): Unmap region file location. +// TODO(SW): Unmap region resouce files. 
// TODO(MC): Fix warnings. workflow WGTS { // Create channel for versions From 1364c8b27ce3e7502cb1d22bbd90c1c440798cb7 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Fri, 23 Feb 2024 10:50:35 +1100 Subject: [PATCH 42/86] Add TODO. --- workflows/wgts.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 42fc0ebb..4d2a74fc 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -111,7 +111,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft samplesheet = Utils.getFileObject(params.input) // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. -// TODO(MC): Go from .fastq.gz. +// TODO(MC): Run full tests for going from .fastq.gz. // TODO(SW): Unmap region resouce files. // TODO(MC): Fix warnings. workflow WGTS { From e35e4f35d64cd5be1bc286d81c9c8d785b68f996 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Fri, 23 Feb 2024 11:51:09 +1100 Subject: [PATCH 43/86] Align read group construction with HMF pipeline5. --- lib/Utils.groovy | 43 ++++++++++++++++++++++++---- modules/local/bwa/mem2/main.nf | 1 - subworkflows/local/read_alignment.nf | 2 +- 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/lib/Utils.groovy b/lib/Utils.groovy index e40711e6..387b207b 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -486,15 +486,46 @@ class Utils { return splitGroupIntoSamples(meta_group).size() } - static public readGroupFromFastqPath(fastq_path) { + static public fastqBasenameWithoutExtension(fastq_path) { def base_name = fastq_path.split('/')[-1] - def components = base_name.split('_') - def read_group_components = [] - for (def i = 0; i < components.size() - 2; ++i) { - read_group_components.add(components[i]) + def matcher = base_name =~ /^(.*)\.fastq(\.gz|)$/ + assert matcher.find() + return matcher[0][1] + } + + static public readGroupFromFastqPath(fwd_fastq_path, rev_fastq_path) { + def fwd_fastq_no_extension = fastqBasenameWithoutExtension(fwd_fastq_path) + def rev_fastq_no_extension = fastqBasenameWithoutExtension(rev_fastq_path) + + def lane_fastq_pattern = /^(.*)_(.*_L[0-9]{3})_(R[12])_([0-9]{3})$/ + def fwd_matcher = fwd_fastq_no_extension =~ lane_fastq_pattern + def rev_matcher = rev_fastq_no_extension =~ lane_fastq_pattern + + if (fwd_matcher.find()) { + assert rev_matcher.find() + assert fwd_matcher[0][3].equals("R1") + assert rev_matcher[0][3].equals("R2") + + def fwd_read_group = "${fwd_matcher[0][2]}_${fwd_matcher[0][4]}" + def rev_read_group = "${rev_matcher[0][2]}_${rev_matcher[0][4]}" + assert fwd_read_group.equals(rev_read_group) + return fwd_read_group } - return read_group_components.join('_') + // Case for when lane fastq files are from picard SamToFastq. 
diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index f9c0151f..062a24ae 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -23,7 +23,6 @@ process BWA_MEM2 { task.ext.when == null || task.ext.when script: - // TODO(MC): Double check this with Charles. def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" // NOTE(MC): Hardcoding bwa-mem2 version since the CLI does not have a --version flag. diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index 3724345a..bee3cdbd 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -91,7 +91,7 @@ workflow READ_ALIGNMENT { def reads_rev = fastq_files[i + 1] def meta_fastq = Utils.shallow_copy(meta_fastq_common) - meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd) + meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd, reads_rev) fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) } From aa77b223348cb19ccf4d0b279d102b5671be9850 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Sat, 24 Feb 2024 08:15:49 +1100 Subject: [PATCH 44/86] Bump MarkDups to 1.1.1 --- modules/local/markdups/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 1a98731a..0f6f25d7 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -3,8 +3,8 @@ process MARKDUPS { label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1--hdfd78af_0' : - 'quay.io/biocontainers/hmftools-mark-dups:1.1--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1.1--hdfd78af_0' : + 'quay.io/biocontainers/hmftools-mark-dups:1.1.1--hdfd78af_0' }" input: tuple val(meta_bam), path(bams), path(bais) From 130193266d66a3de19e82a3e30e06737d4519085 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Wed, 28 Feb 2024 12:26:30 +1100 Subject: [PATCH 45/86] Patch --- conf/hmf_data.config | 2 + conf/hmf_genomes.config | 4 + lib/Constants.groovy | 96 ++++- lib/Utils.groovy | 273 +++++++------ main.nf | 2 + modules/local/bwa/mem2/Dockerfile | 19 + modules/local/bwa/mem2/main.nf | 14 +- modules/local/fastp/main.nf | 12 +- modules/local/markdups/main.nf | 36 +- modules/local/sambamba/index/main.nf | 8 +- nextflow_schema.json | 14 + subworkflows/local/amber_profiling.nf | 42 +- subworkflows/local/bamtools_metrics.nf | 119 +++--- subworkflows/local/chord_prediction.nf | 2 +- subworkflows/local/cobalt_profiling.nf | 48 +-- subworkflows/local/cuppa_prediction.nf | 6 +- subworkflows/local/flagstat_metrics.nf | 132 +++--- subworkflows/local/gridss_svprep_calling.nf | 127 +++--- subworkflows/local/gripss_filtering.nf | 6 +- subworkflows/local/lilac_calling.nf | 67 ++-- subworkflows/local/linx_annotation.nf | 4 +- subworkflows/local/pave_annotation.nf | 4 +- subworkflows/local/prepare_reference.nf | 11 + subworkflows/local/purple_calling.nf | 2 +- subworkflows/local/read_alignment.nf | 421 ++++++++------------ subworkflows/local/read_processing.nf | 215 +++++----- subworkflows/local/sage_append.nf | 2 +- subworkflows/local/sage_calling.nf | 89 ++--- subworkflows/local/sigs_fitting.nf | 2 +- subworkflows/local/virusbreakend_calling.nf | 23 +- workflows/targeted.nf | 148 +++---- workflows/wgts.nf | 160 ++++---- 32 files changed, 1065 insertions(+), 1045 deletions(-) create mode 100644 modules/local/bwa/mem2/Dockerfile diff --git a/conf/hmf_data.config b/conf/hmf_data.config index fc6cf635..6fdcba0b 100644 --- a/conf/hmf_data.config +++ b/conf/hmf_data.config @@ -50,6 +50,7 @@ params { known_fusions = 'dna_pipeline/sv/known_fusions.37.bedpe' purple_germline_del = 'dna_pipeline/copy_number/cohort_germline_del_freq.37.csv' segment_mappability = 'dna_pipeline/variants/mappability_150.37.bed.gz' + unmap_regions = 'dna_pipeline/common/unmap_regions.37.tsv' } '38' { // AMBER @@ -101,6 +102,7 @@ params { known_fusions = 'dna_pipeline/sv/known_fusions.38.bedpe' purple_germline_del = 'dna_pipeline/copy_number/cohort_germline_del_freq.38.csv' segment_mappability = 'dna_pipeline/variants/mappability_150.38.bed.gz' + unmap_regions = 'dna_pipeline/common/unmap_regions.38.tsv' } } } diff --git a/conf/hmf_genomes.config b/conf/hmf_genomes.config index 147d4274..5e38201b 100644 --- a/conf/hmf_genomes.config +++ b/conf/hmf_genomes.config @@ -14,6 +14,8 @@ params { fai = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.fai" dict = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.dict" bwa_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/0.7.17-r1188.tar.gz" + bwa_index_bseq = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/Homo_sapiens.GRCh37.GATK.illumina.fasta.0123" + bwa_index_biidx = 
"https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/Homo_sapiens.GRCh37.GATK.illumina.fasta.bwt.2bit.64" bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index_image/0.7.17-r1188/Homo_sapiens.GRCh37.GATK.illumina.fasta.img" gridss_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/gridss_index/2.13.2/Homo_sapiens.GRCh37.GATK.illumina.fasta.gridsscache" } @@ -22,6 +24,8 @@ params { fai = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai" dict = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.dict" bwa_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/0.7.17-r1188.tar.gz" + bwa_index_bseq = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.0123" + bwa_index_biiseq= "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64" bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index_image/0.7.17-r1188/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.img" gridss_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/gridss_index/2.13.2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gridsscache" } diff --git a/lib/Constants.groovy b/lib/Constants.groovy index 91f1059c..10bdc0ea 100644 --- a/lib/Constants.groovy +++ b/lib/Constants.groovy @@ -11,8 +11,8 @@ class Constants { static List PANELS_DEFINED = ['tso500'] - static String HMF_DATA_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_37--0.tar.gz' - static String HMF_DATA_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_38--0.tar.gz' + static String HMF_DATA_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_37--2.tar.gz' + static String HMF_DATA_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_38--2.tar.gz' static String TSO500_PANEL_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_37--0.tar.gz' @@ -34,13 +34,12 @@ class Constants { } static enum Process { - BWAMEM2, + ALIGNMENT, AMBER, BAMTOOLS, CHORD, COBALT, CUPPA, - FASTP, FLAGSTAT, GRIDSS, GRIPSS, @@ -52,7 +51,6 @@ class Constants { PAVE, PURPLE, SAGE, - SAMBAMBA_INDEX, SIGS, VIRUSINTERPRETER, } @@ -104,16 +102,96 @@ class Constants { DNA_RNA, } - static List DNA_SAMPLE_KEYS = [ - [Constants.SampleType.TUMOR, Constants.SequenceType.DNA], - [Constants.SampleType.NORMAL, Constants.SequenceType.DNA], - ] + static enum InfoField { + CANCER_TYPE, + LANE, + LIBRARY_ID, + } static Map PLACEHOLDER_META = [meta_placeholder: null] static List PLACEHOLDER_OPTIONAL_CHANNEL = [] static Map INPUT = [ + BAM_DNA_TUMOR: [ + FileType.BAM, + SampleType.TUMOR, + SequenceType.DNA, + ], + + BAM_MARKDUPS_DNA_TUMOR: [ + FileType.BAM_MARKDUPS, + SampleType.TUMOR, + SequenceType.DNA, + ], + + BAM_RNA_TUMOR : [ + FileType.BAM, + SampleType.TUMOR, + SequenceType.RNA, + ], + + BAM_DNA_NORMAL: [ + FileType.BAM, + SampleType.NORMAL, + SequenceType.DNA, + ], + + BAM_MARKDUPS_DNA_NORMAL: [ + FileType.BAM_MARKDUPS, + 
+ SampleType.NORMAL, + SequenceType.DNA, + ], + + BAM_RNA_NORMAL : [ + FileType.BAM, + SampleType.NORMAL, + SequenceType.RNA, + ], + + BAI_DNA_TUMOR: [ + FileType.BAI, + SampleType.TUMOR, + SequenceType.DNA, + ], + + BAI_MARKDUPS_DNA_TUMOR: [ + FileType.BAI_MARKDUPS, + SampleType.TUMOR, + SequenceType.DNA, + ], + + BAI_RNA_TUMOR : [ + FileType.BAI, + SampleType.TUMOR, + SequenceType.RNA, + ], + + BAI_DNA_NORMAL: [ + FileType.BAI, + SampleType.NORMAL, + SequenceType.DNA, + ], + + BAI_MARKDUPS_DNA_NORMAL: [ + FileType.BAI_MARKDUPS, + SampleType.NORMAL, + SequenceType.DNA, + ], + + BAI_RNA_NORMAL : [ + FileType.BAI, + SampleType.NORMAL, + SequenceType.RNA, + ], + + ISOFOX_DIR: [ FileType.ISOFOX_DIR, SampleType.TUMOR, diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 387b207b..2ab04a11 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -11,7 +11,7 @@ class Utils { public static parseInput(input_fp_str, stub_run, log) { - // NOTE(SW): using Nextflow .splitCsv channel operator, hence sould be easily interchangable + // NOTE(SW): using NF .splitCsv channel operator, hence should be easily interchangeable with NF syntax def input_fp = Utils.getFileObject(input_fp_str) def inputs = nextflow.splitter.SplitterEx.splitCsv(input_fp, [header: true]) @@ -31,7 +31,6 @@ class Utils { meta.subject_id = it.subject_id } - // Sample type def sample_type_enum = Utils.getEnumFromString(it.sample_type, Constants.SampleType) if (!sample_type_enum) { @@ -64,12 +63,75 @@ class Utils { System.exit(1) } - if (meta_sample.containsKey(filetype_enum)) { + if (meta_sample.containsKey(filetype_enum) && filetype_enum != Constants.FileType.FASTQ) { log.error "got duplicate file for ${group_id} ${sample_type_enum}/${sequence_type_enum}: ${filetype_enum}" System.exit(1) } - meta_sample[filetype_enum] = Utils.getFileObject(it.filepath) + // Info data + def info_data = [:] + if (it.containsKey('info')) { + // Parse + it.info + .tokenize(';') + .each { e -> + def (k, v) = e.tokenize(':') + def info_field_enum = Utils.getEnumFromString(k, Constants.InfoField) + + if (!info_field_enum) { + def info_field_str = Utils.getEnumNames(Constants.InfoField).join('\n - ') + log.error "received invalid info field: '${k}'. 
Valid options are:\n - ${info_field_str}" + System.exit(1) + } + + if (info_data.containsKey(info_field_enum)) { + log.error "got duplicate info field for ${group_id} ${sample_type_enum}/${sequence_type_enum}: ${info_field_enum}" + System.exit(1) + } + + info_data[info_field_enum] = v + } + + // Process + if (info_data.containsKey(Constants.InfoField.CANCER_TYPE)) { + meta[Constants.InfoField.CANCER_TYPE] = info_data[Constants.InfoField.CANCER_TYPE] + } + + } + + + // Handle inputs appropriately + if (filetype_enum === Constants.FileType.FASTQ) { + + if (!info_data.containsKey(Constants.InfoField.LIBRARY_ID)) { + log.error "missing 'library_id' info field for ${group_id} ${sample_type_enum}/${sequence_type_enum}" + System.exit(1) + } + + if (!info_data.containsKey(Constants.InfoField.LANE)) { + log.error "missing 'lane' info field for ${group_id} ${sample_type_enum}/${sequence_type_enum}" + System.exit(1) + } + + def (fwd, rev) = it.filepath.tokenize(';') + def fastq_key = [info_data[Constants.InfoField.LIBRARY_ID], info_data[Constants.InfoField.LANE]] + + if (meta_sample.containsKey(fastq_key)) { + log.error "got duplicate lane + library_id data for ${group_id} ${sample_type_enum}/${sequence_type_enum}: ${fastq_key}" + System.exit(1) + } + + if (!meta_sample.containsKey(filetype_enum)) { + meta_sample[filetype_enum] = [:] + } + + meta_sample[filetype_enum][fastq_key] = ['fwd': fwd, 'rev': rev] + + } else { + + meta_sample[filetype_enum] = Utils.getFileObject(it.filepath) + + } // Record sample key to simplify iteration later on sample_keys << sample_key
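For context, the new `info` samplesheet column parsed above is a semicolon-separated list of key:value pairs, and FASTQ rows must provide both `library_id` and `lane` so that each lane-level pair gets a unique key. A hypothetical row fragment (values illustrative only, other columns elided):

    filetype: fastq
    filepath: /data/P1-T_S1_L001_R1_001.fastq.gz;/data/P1-T_S1_L001_R2_001.fastq.gz
    info:     library_id:P1-T-lib1;lane:001

Given that row, info_data resolves to [(Constants.InfoField.LIBRARY_ID): 'P1-T-lib1', (Constants.InfoField.LANE): '001'] and the pair is stored under fastq_key == ['P1-T-lib1', '001'].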
@@ -205,14 +267,14 @@ class Utils { if (run_config.mode === Constants.RunMode.TARGETED) { // Do not allow normal DNA - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { log.error "targeted mode is not compatible with the normal DNA BAM provided for ${meta.group_id}\n\n" + "The targeted workflow supports only tumor DNA BAMs (and tumor RNA BAMs for TSO500)" System.exit(1) } // Do not allow only tumor RNA - if (Utils.hasTumorRnaBam(meta) && !Utils.hasTumorDnaBam(meta)) { + if (Utils.hasTumorRnaBam(meta) && !Utils.hasTumorDna(meta)) { log.error "targeted mode is not compatible with only tumor RNA provided for ${meta.group_id}\n\n" + "The targeted workflow requires tumor DNA and can optionally take tumor RNA, depending on " + "the configured panel." @@ -229,7 +291,7 @@ } // Do not allow normal DNA only - if (Utils.hasNormalDnaBam(meta) && !Utils.hasTumorDnaBam(meta)) { + if (Utils.hasNormalDna(meta) && !Utils.hasTumorDna(meta)) { log.error "germline only mode not supported, found only a normal DNA BAM for ${meta.group_id}\n" System.exit(1) } @@ -311,124 +373,134 @@ class Utils { return path ? nextflow.Nextflow.file(path) : [] } + static public getRunMode(run_mode, log) { + def run_mode_enum = Utils.getEnumFromString(run_mode, Constants.RunMode) + if (!run_mode_enum) { + def run_modes_str = Utils.getEnumNames(Constants.RunMode).join('\n - ') + log.error "received an invalid run mode: '${run_mode}'. Valid options are:\n - ${run_modes_str}" + System.exit(1) + } + return run_mode_enum + } + + + // Sample records + static public getTumorDnaSample(meta) { + return meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) + } + + static public getTumorRnaSample(meta) { + return meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.RNA], [:]) + } + + static public getNormalDnaSample(meta) { + return meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) + } + // Sample names static public getTumorDnaSampleName(meta) { - def meta_sample = meta[Constants.SampleType.TUMOR, Constants.SequenceType.DNA] - return meta_sample['sample_id'] + return getTumorDnaSample(meta)['sample_id'] } static public getTumorRnaSampleName(meta) { - def meta_sample = meta[Constants.SampleType.TUMOR, Constants.SequenceType.RNA] - return meta_sample['sample_id'] + return getTumorRnaSample(meta)['sample_id'] } static public getNormalDnaSampleName(meta) { - def meta_sample = meta[Constants.SampleType.NORMAL, Constants.SequenceType.DNA] - return meta_sample['sample_id'] + return getNormalDnaSample(meta)['sample_id'] } // Files static public getTumorDnaBam(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAM, null) + return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM, null) } static public getTumorDnaBai(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAI, null) + return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAI, null) + } + + static public getTumorDnaMarkdupsBam(meta) { + return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null) + } + + static public getTumorDnaFastq(meta) { + return getTumorDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) } static public hasTumorDnaBam(meta) { return getTumorDnaBam(meta) !== null } + static public hasTumorDnaMarkdupsBam(meta) { + return getTumorDnaMarkdupsBam(meta) !== null + } + + static public hasTumorDnaFastq(meta) { + return getTumorDnaFastq(meta) !== null + } + static public getTumorRnaBam(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.RNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAM, null) + return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAM, null) } static public getTumorRnaBai(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.RNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAI, null) + return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAI, null) } static public hasTumorRnaBam(meta) { return getTumorRnaBam(meta) !== null } - static public getNormalDnaBam(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAM, null) + return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM, null) } static public getNormalDnaBai(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAI, null) - } - - static public hasNormalDnaBam(meta) { - return getNormalDnaBam(meta) !== null + return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAI, null) }
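To make the new sample-record accessors concrete, here is a minimal sketch of the meta structure they read, as assembled by the parseInput() changes above (all identifiers and file names hypothetical):

    def meta = [
        group_id: 'PATIENT1',
        ([Constants.SampleType.TUMOR, Constants.SequenceType.DNA]): [
            sample_id: 'PATIENT1-T',
            (Constants.FileType.FASTQ): [
                (['PATIENT1-T-lib1', '001']): [
                    fwd: 'PATIENT1-T_S1_L001_R1_001.fastq.gz',
                    rev: 'PATIENT1-T_S1_L001_R2_001.fastq.gz',
                ],
            ],
        ],
    ]

    assert Utils.getTumorDnaSampleName(meta) == 'PATIENT1-T'
    assert Utils.hasTumorDnaFastq(meta) && !Utils.hasTumorDnaBam(meta)
    assert Utils.hasTumorDna(meta)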
static public getNormalDnaMarkdupsBam(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAM_MARKDUPS, null) + return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null) } - static public hasNormalDnaMarkdupsBam(meta) { - return getNormalDnaMarkdupsBam(meta) !== null - } - - static public getTumorDnaMarkdupsBam(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.BAM_MARKDUPS, null) - } - - static public hasTumorDnaMarkdupsBam(meta) { - return getTumorDnaMarkdupsBam(meta) !== null + static public getNormalDnaFastq(meta) { + return getNormalDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) } - static public hasDnaMarkdupsBam(meta) { - return hasNormalDnaMarkdupsBam(meta) || hasTumorDnaMarkdupsBam(meta) + static public hasNormalDnaBam(meta) { + return getNormalDnaBam(meta) !== null } - static public getNormalDnaFastq(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.FASTQ, null) + static public hasNormalDnaMarkdupsBam(meta) { + return getNormalDnaMarkdupsBam(meta) !== null } static public hasNormalDnaFastq(meta) { return getNormalDnaFastq(meta) !== null } - static public getTumorDnaFastq(meta) { - def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:]) - return meta_sample.getOrDefault(Constants.FileType.FASTQ, null) - } - - static public hasTumorDnaFastq(meta) { - return getTumorDnaFastq(meta) !== null + static public hasDnaMarkdupsBam(meta) { + return hasNormalDnaMarkdupsBam(meta) || hasTumorDnaMarkdupsBam(meta) } static public hasDnaFastq(meta) { return hasNormalDnaFastq(meta) || hasTumorDnaFastq(meta) } - static public getRunMode(run_mode, log) { - def run_mode_enum = Utils.getEnumFromString(run_mode, Constants.RunMode) - if (!run_mode_enum) { - def run_modes_str = Utils.getEnumNames(Constants.RunMode).join('\n - ') - log.error "recieved an invalid run mode: '${run_mode}'. Valid options are:\n - ${run_modes_str}" - System.exit(1) - } - return run_mode_enum + + // Status + static public hasTumorDna(meta) { + return hasTumorDnaBam(meta) || hasTumorDnaMarkdupsBam(meta) || hasTumorDnaFastq(meta) } + static public hasNormalDna(meta) { + return hasNormalDnaBam(meta) || hasNormalDnaMarkdupsBam(meta) || hasNormalDnaFastq(meta) + } + // Misc public static getInput(meta, key) { def result @@ -453,79 +525,4 @@ class Utils { } } - public static shallow_copy(obj) { - - return obj.getClass().newInstance(obj) - } - - // Alignment utils. 
- static public splitGroupIntoSamples(meta_group) { - def sample_entries = [:] - def common_entries = [:] - meta_group.each { key, value -> - - if ((value instanceof java.util.Map) && value.containsKey('sample_id')) { - sample_entries[key] = value - } else { - common_entries[key] = value - } - } - - def meta_samples = [] - sample_entries.each { key, value -> - - def meta_sample = common_entries.getClass().newInstance(common_entries) - meta_sample[key] = value - meta_samples.add(meta_sample) - } - - return meta_samples - } - - static public groupSampleCounts(meta_group) { - return splitGroupIntoSamples(meta_group).size() - } - - static public fastqBasenameWithoutExtension(fastq_path) { - def base_name = fastq_path.split('/')[-1] - def matcher = base_name =~ /^(.*)\.fastq(\.gz|)$/ - assert matcher.find() - return matcher[0][1] - } - - static public readGroupFromFastqPath(fwd_fastq_path, rev_fastq_path) { - def fwd_fastq_no_extension = fastqBasenameWithoutExtension(fwd_fastq_path) - def rev_fastq_no_extension = fastqBasenameWithoutExtension(rev_fastq_path) - - def lane_fastq_pattern = /^(.*)_(.*_L[0-9]{3})_(R[12])_([0-9]{3})$/ - def fwd_matcher = fwd_fastq_no_extension =~ lane_fastq_pattern - def rev_matcher = rev_fastq_no_extension =~ lane_fastq_pattern - - if (fwd_matcher.find()) { - assert rev_matcher.find() - assert fwd_matcher[0][3].equals("R1") - assert rev_matcher[0][3].equals("R2") - - def fwd_read_group = "${fwd_matcher[0][2]}_${fwd_matcher[0][4]}" - def rev_read_group = "${rev_matcher[0][2]}_${rev_matcher[0][4]}" - assert fwd_read_group.equals(rev_read_group) - return fwd_read_group - } - - // Case for when lane fastq files are from picard SamToFastq. - lane_fastq_pattern = /^(.*)_(.*_L[0-9]{3})_([0-9]{3})_([12])$/ - fwd_matcher = fwd_fastq_no_extension =~ lane_fastq_pattern - rev_matcher = rev_fastq_no_extension =~ lane_fastq_pattern - - assert fwd_matcher.find() - assert rev_matcher.find() - assert fwd_matcher[0][4].equals("1") - assert rev_matcher[0][4].equals("2") - - def fwd_read_group = "${fwd_matcher[0][1]}_${fwd_matcher[0][2]}" - def rev_read_group = "${rev_matcher[0][1]}_${rev_matcher[0][2]}" - assert fwd_read_group.equals(rev_read_group) - return fwd_read_group - } - } diff --git a/main.nf b/main.nf index caa57956..65841c42 100644 --- a/main.nf +++ b/main.nf @@ -25,6 +25,8 @@ params.ref_data_genome_fasta = WorkflowMain.getGenomeAttribute(params, params.ref_data_genome_fai = WorkflowMain.getGenomeAttribute(params, 'fai') params.ref_data_genome_dict = WorkflowMain.getGenomeAttribute(params, 'dict') params.ref_data_genome_bwa_index = WorkflowMain.getGenomeAttribute(params, 'bwa_index') +params.ref_data_genome_bwa_index_bseq = WorkflowMain.getGenomeAttribute(params, 'bwa_index_bseq') +params.ref_data_genome_bwa_index_biidx = WorkflowMain.getGenomeAttribute(params, 'bwa_index_biidx') params.ref_data_genome_bwa_index_image = WorkflowMain.getGenomeAttribute(params, 'bwa_index_image') params.ref_data_genome_gridss_index = WorkflowMain.getGenomeAttribute(params, 'gridss_index') diff --git a/modules/local/bwa/mem2/Dockerfile b/modules/local/bwa/mem2/Dockerfile new file mode 100644 index 00000000..0a0bc7ea --- /dev/null +++ b/modules/local/bwa/mem2/Dockerfile @@ -0,0 +1,19 @@ +FROM docker.io/mambaorg/micromamba:1.5.6 + +USER root + +RUN \ + apt-get update && \ + apt-get install -y procps && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +USER mambauser + +RUN \ + micromamba install -y -n base -c bioconda -c conda-forge \ + 'bwa-mem2 ==2.2.1' \ + 'sambamba ==1.0' && \ + 
micromamba clean --all --yes + +ENV PATH="/opt/conda/bin:/opt/conda/condabin:${PATH}" diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa/mem2/main.nf index 062a24ae..ee76a9cc 100644 --- a/modules/local/bwa/mem2/main.nf +++ b/modules/local/bwa/mem2/main.nf @@ -2,18 +2,15 @@ process BWA_MEM2 { tag "${meta.id}" label 'process_high' - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bwa-mem2:2.2.1--hd03093a_5' : - 'quay.io/biocontainers/bwa-mem2:2.2.1--hd03093a_5' }" + // TODO(SW): create BioContainers multi-package image when appropriate + container 'docker.io/scwatts/bwa-mem2:2.2.1' input: tuple val(meta), path(reads_fwd), path(reads_rev) path genome_fasta - // TODO(SW): The following resourse files are needed from gs://hmf-public/HMFtools-Resources/ref_genome/37: - // + Homo_sapiens.GRCh37.GATK.illumina.fasta.bwt.2bit.64 - // + Homo_sapiens.GRCh37.GATK.illumina.fasta.0123 - // Similarly for ref genome 38. path genome_bwa_index + path genome_bwa_index_bseq + path genome_bwa_index_biidx output: tuple val(meta), path('*.bam'), emit: bam @@ -25,7 +22,6 @@ process BWA_MEM2 { script: def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" - // NOTE(MC): Hardcoding bwa-mem2 version since the CLI does not have a --version flag. """ ln -fs \$(find -L ${genome_bwa_index} -type f) ./ @@ -51,7 +47,7 @@ process BWA_MEM2 { cat <<-END_VERSIONS > versions.yml "${task.process}": - bwa-mem2: 2.2.1 + bwa-mem2: \$(bwa-mem2 version 2>/dev/null) sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') END_VERSIONS """ diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index be254c16..9b1121a9 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -10,8 +10,8 @@ process FASTP { val(max_fastq_records) output: - tuple val(meta), path('*_R1.fastp.fastq'), path('*_R2.fastp.fastq'), emit: fastq - path 'versions.yml' , emit: versions + tuple val(meta), path('*_R1.fastp.fastq.gz'), path('*_R2.fastp.fastq.gz'), emit: fastq + path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when @@ -30,8 +30,8 @@ process FASTP { --disable_adapter_trimming \\ --disable_trim_poly_g \\ --split_by_lines ${4 * max_fastq_records} \\ - --out1 ${meta.sample_id}_${meta.read_group}_R1.fastp.fastq \\ - --out2 ${meta.sample_id}_${meta.read_group}_R2.fastp.fastq + --out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\ + --out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -41,8 +41,8 @@ process FASTP { stub: """ - touch 00{1..4}.${meta.sample_id}_${meta.read_group}_R1.fastp.fastq - touch 00{1..4}.${meta.sample_id}_${meta.read_group}_R2.fastp.fastq + touch 00{1..4}.${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz + touch 00{1..4}.${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 0f6f25d7..a26b4111 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -1,22 +1,22 @@ process MARKDUPS { - tag "${meta_bam.id}" + tag "${meta.id}" label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1.1--hdfd78af_0' : - 'quay.io/biocontainers/hmftools-mark-dups:1.1.1--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1.2--hdfd78af_0' : + 'quay.io/biocontainers/hmftools-mark-dups:1.1.2--hdfd78af_0' }" input: - tuple val(meta_bam), path(bams), path(bais) - val genome_ver + tuple val(meta), path(bams), path(bais) path genome_fasta + val genome_ver path genome_fai path genome_dict path unmap_regions val has_umis output: - tuple val(meta_bam), path('*bam'), path('*bai'), emit: bam + tuple val(meta), path('*bam'), path('*bai'), emit: bam path 'versions.yml' , emit: versions path '*.tsv' @@ -33,7 +33,7 @@ process MARKDUPS { -samtools \$(which samtools) \\ -sambamba \$(which sambamba) \\ \\ - -sample ${meta_bam.sample_id} \\ + -sample ${meta.sample_id} \\ -input_bam ${bams.join(',')} \\ \\ -form_consensus \\ @@ -45,29 +45,29 @@ process MARKDUPS { -ref_genome_version ${genome_ver} \\ \\ -write_stats \\ - -threads 16 \\ + -threads ${task.cpus} \\ \\ - -output_bam ${meta_bam.sample_id}.markdups.bam + -output_bam ${meta.sample_id}.markdups.bam cat <<-END_VERSIONS > versions.yml "${task.process}": markdups: \$(markdups -version | awk '{ print \$NF }') - openjdk: \$(java --version | egrep '^OpenJDK Runtime Environment ' | sed 's/^.*build //' | sed 's/.\$//') sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') samtools: \$(samtools --version 2>&1 | egrep '^samtools\\s' | head -n 1 | sed 's/^.* //') END_VERSIONS """ stub: - def umi_output_files = has_umis ? "touch ${meta_bam.sample_id}.umi_coord_freq.tsv;" + - " touch ${meta_bam.sample_id}.umi_edit_distance.tsv;" + - " touch ${meta_bam.sample_id}.umi_nucleotide_freq.tsv" : '' - """ - touch ${meta_bam.sample_id}.markdups.bam - touch ${meta_bam.sample_id}.markdups.bam.bai - touch ${meta_bam.sample_id}.duplicate_freq.tsv - ${umi_output_files} + touch ${meta.sample_id}.markdups.bam + touch ${meta.sample_id}.markdups.bam.bai + touch ${meta.sample_id}.duplicate_freq.tsv + + if [[ -n "${has_umis}" ]]; then + touch ${meta.sample_id}.umi_coord_freq.tsv + touch ${meta.sample_id}.umi_edit_distance.tsv + touch ${meta.sample_id}.umi_nucleotide_freq.tsv + fi; echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index 0f620584..f6b238e3 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -9,8 +9,8 @@ process SAMBAMBA_INDEX { tuple val(meta), path(bam) output: - tuple val(meta), path(bam), path('*bai'), emit: bam - path 'versions.yml' , emit: versions + tuple val(meta), path('*bai'), emit: bai + path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when @@ -19,7 +19,7 @@ process SAMBAMBA_INDEX { """ sambamba index \\ --nthreads ${task.cpus} \\ - ${meta.split}.${meta.sample_id}.${meta.read_group}.bam + ${bam} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -29,7 +29,7 @@ process SAMBAMBA_INDEX { stub: """ - touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam.bai + touch ${bam}.bai echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ diff --git a/nextflow_schema.json b/nextflow_schema.json index 32e470a8..d75b428d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -218,6 +218,20 @@ "fa_icon": "far fa-file-code", "hidden": true }, + "ref_data_genome_bwa_index_bseq": { + "type": "string", + "format": 
"directory-path", + "description": "Path to directory containing reference genome BWA MEM 2 binary sequence index file.", + "fa_icon": "far fa-file-code", + "hidden": true + }, + "ref_data_genome_bwa_index_biidx": { + "type": "string", + "format": "directory-path", + "description": "Path to directory containing reference genome BWA MEM 2 bi-index file.", + "fa_icon": "far fa-file-code", + "hidden": true + }, "ref_data_genome_bwa_index_image": { "type": "string", "format": "file-path", diff --git a/subworkflows/local/amber_profiling.nf b/subworkflows/local/amber_profiling.nf index f20a3d24..fb021e91 100644 --- a/subworkflows/local/amber_profiling.nf +++ b/subworkflows/local/amber_profiling.nf @@ -11,6 +11,8 @@ workflow AMBER_PROFILING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_version // channel: [mandatory] genome version @@ -22,19 +24,33 @@ workflow AMBER_PROFILING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // Select input sources and sort + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.AMBER_DIR) - runnable: Utils.hasTumorDnaBam(meta) && !has_existing + runnable: tumor_bam && !has_existing skip: true + meta } // Create process input channel // channel: [ meta_amber, tumor_bam, normal_bam, tumor_bai, normal_bai ] ch_amber_inputs = ch_inputs_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_amber = [ key: meta.group_id, @@ -42,20 +58,6 @@ workflow AMBER_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasNormalDnaBam(meta)) { - - meta_amber.normal_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - - } - [meta_amber, tumor_bam, normal_bam, tumor_bai, normal_bai] } diff --git a/subworkflows/local/bamtools_metrics.nf b/subworkflows/local/bamtools_metrics.nf index 92237b83..ba6aebee 100644 --- a/subworkflows/local/bamtools_metrics.nf +++ b/subworkflows/local/bamtools_metrics.nf @@ -11,6 +11,8 @@ workflow BAMTOOLS_METRICS { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -21,78 +23,55 @@ workflow BAMTOOLS_METRICS { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ 
diff --git a/subworkflows/local/amber_profiling.nf b/subworkflows/local/amber_profiling.nf index f20a3d24..fb021e91 100644 --- a/subworkflows/local/amber_profiling.nf +++ b/subworkflows/local/amber_profiling.nf @@ -11,6 +11,8 @@ workflow AMBER_PROFILING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_version // channel: [mandatory] genome version @@ -22,19 +24,33 @@ workflow AMBER_PROFILING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // Select input sources and sort + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.AMBER_DIR) - runnable: Utils.hasTumorDnaBam(meta) && !has_existing + runnable: tumor_bam && !has_existing skip: true + meta } // Create process input channel // channel: [ meta_amber, tumor_bam, normal_bam, tumor_bai, normal_bai ] ch_amber_inputs = ch_inputs_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_amber = [ key: meta.group_id, @@ -42,20 +58,6 @@ workflow AMBER_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasNormalDnaBam(meta)) { - - meta_amber.normal_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - - } - [meta_amber, tumor_bam, normal_bam, tumor_bai, normal_bai] } diff --git a/subworkflows/local/bamtools_metrics.nf b/subworkflows/local/bamtools_metrics.nf index 92237b83..ba6aebee 100644 --- a/subworkflows/local/bamtools_metrics.nf +++ b/subworkflows/local/bamtools_metrics.nf @@ -11,6 +11,8 @@ workflow BAMTOOLS_METRICS { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -21,78 +23,55 @@ workflow BAMTOOLS_METRICS { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) - - runnable: has_tumor_dna || has_normal_dna + // Sort inputs, separate by tumor and normal + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_tumor_sorted = ch_tumor_bam + .map { meta, bam, bai -> + return [ + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + ] + } + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAMTOOLS_TUMOR) + runnable: bam && !has_existing skip: true + meta } - // Flatten into BAM/BAI pairs, select inputs that are eligible to run - // channel: runnable: [ meta_extra, bam, bai ] - // channel: skip: [ meta_extra ] - ch_bams_bais_sorted = ch_inputs_sorted.runnable - .flatMap { meta -> - - def tumor_sample_id = [] - def tumor_bam = [] - def tumor_bai = [] - - def normal_sample_id = [] - def normal_bam = [] - def normal_bai = [] - - - if (Utils.hasTumorDnaBam(meta)) { - tumor_sample_id = Utils.getTumorDnaSampleName(meta) - tumor_bam = Utils.getTumorDnaBam(meta) - tumor_bai = Utils.getTumorDnaBai(meta) - } - - if (Utils.hasNormalDnaBam(meta)) { - normal_sample_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } - + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_normal_sorted = ch_normal_bam + .map { meta, bam, bai -> return [ - [[key: meta.group_id, *:meta, sample_id: tumor_sample_id, sample_type: 'tumor'], tumor_bam, tumor_bai], - [[key: meta.group_id, *:meta, sample_id: normal_sample_id, sample_type: 'normal'], normal_bam, normal_bai], + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), ] } - .branch { meta_extra, bam, bai -> - - def input_key - if (meta_extra.sample_type == 'tumor') { - input_key = Constants.INPUT.BAMTOOLS_TUMOR - } else if (meta_extra.sample_type == 'normal') { - input_key = Constants.INPUT.BAMTOOLS_NORMAL - } else { - assert false - } - - def has_existing = Utils.hasExistingInput(meta_extra, input_key) - - runnable: bam && bai && !has_existing + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAMTOOLS_NORMAL) + runnable: bam && !has_existing skip: true - return meta_extra + meta } // Create process input channel // channel: [ meta_bamtools, bam, bai ] - ch_bamtools_inputs = ch_bams_bais_sorted.runnable - .map { meta_extra, bam, bai -> + ch_bamtools_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bam, bai] }, + ch_inputs_normal_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getNormalDnaSample(meta), 'normal', bam, bai] }, + ) + .map { meta, meta_sample, sample_type, bam, bai -> def meta_bamtools = [ - key: meta_extra.group_id, - id: "${meta_extra.group_id}__${meta_extra.sample_id}", - sample_id: meta_extra.sample_id, - sample_type: meta_extra.sample_type, + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + sample_type: sample_type, ] return [meta_bamtools, bam, bai] @@ -107,29 +86,27 @@ workflow BAMTOOLS_METRICS { ch_versions = 
ch_versions.mix(BAMTOOLS.out.versions) - // Sort outputs into tumor and normal channels, adding partial skip entries - // channel: [ meta_bamtools, metrics ] - ch_outputs_sorted = Channel.empty() - .mix( - BAMTOOLS.out.metrics, - ch_bams_bais_sorted.skip.map { meta -> [meta, []] }, - ) + // Sort into a tumor and normal channel + ch_bamtools_out = BAMTOOLS.out.metrics .branch { meta_bamtools, metrics -> + assert ['tumor', 'normal'].contains(meta_bamtools.sample_type) tumor: meta_bamtools.sample_type == 'tumor' normal: meta_bamtools.sample_type == 'normal' + placeholder: true } - // Set outputs, restoring original meta, including full skip entries + // Set outputs, restoring original meta + // channel: [ meta, metrics ] ch_somatic_metrics = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.tumor, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_bamtools_out.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, []] }, ) ch_germline_metrics = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.normal, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_bamtools_out.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, []] }, ) emit: diff --git a/subworkflows/local/chord_prediction.nf b/subworkflows/local/chord_prediction.nf index d5f3c187..516917fe 100644 --- a/subworkflows/local/chord_prediction.nf +++ b/subworkflows/local/chord_prediction.nf @@ -34,7 +34,7 @@ workflow CHORD_PREDICTION { ch_inputs_sorted = ch_inputs_selected .branch { meta, purple_dir -> - def has_dna = Utils.hasTumorDnaBam(meta) + def has_dna = Utils.hasTumorDna(meta) def tumor_id def has_smlv_vcf diff --git a/subworkflows/local/cobalt_profiling.nf b/subworkflows/local/cobalt_profiling.nf index 6fb81e08..322575cc 100644 --- a/subworkflows/local/cobalt_profiling.nf +++ b/subworkflows/local/cobalt_profiling.nf @@ -11,6 +11,8 @@ workflow COBALT_PROFILING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data gc_profile // channel: [mandatory] /path/to/gc_profile @@ -22,23 +24,37 @@ workflow COBALT_PROFILING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs + // Select input sources and sort // NOTE(SW): germline mode is not currently supported - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.COBALT_DIR) - runnable_tn: Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) && !has_existing - runnable_to: Utils.hasTumorDnaBam(meta) && !has_existing + runnable_tn: tumor_bam && 
normal_bam && !has_existing + runnable_to: tumor_bam && !has_existing skip: true + meta } // First set diploid BED input for tumor/normal and tumor only samples // NOTE(SW): since the diploid BED is provided as a channel, I seem to be only able to include via channel ops - // channel: [ meta, diploid_bed ] + // channel: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai, diploid_bed ] ch_inputs_runnable = Channel.empty() .mix( - ch_inputs_sorted.runnable_tn.map { meta -> [meta, []] }, + ch_inputs_sorted.runnable_tn.map { [*it, []] }, ch_inputs_sorted.runnable_to.combine(diploid_bed), ) @@ -46,7 +62,7 @@ workflow COBALT_PROFILING { // channel: sample_data: [ meta_cobalt, tumor_bam, normal_bam, tumor_bai, normal_bai ] // channel: diploid_bed: [ diploid_bed ] ch_cobalt_inputs = ch_inputs_runnable - .multiMap { meta, diploid_bed -> + .multiMap { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, diploid_bed -> def meta_cobalt = [ key: meta.group_id, @@ -54,20 +70,6 @@ workflow COBALT_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasNormalDnaBam(meta)) { - - meta_cobalt.normal_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - - } - sample_data: [meta_cobalt, tumor_bam, normal_bam, tumor_bai, normal_bai] diploid_bed: diploid_bed } diff --git a/subworkflows/local/cuppa_prediction.nf b/subworkflows/local/cuppa_prediction.nf index bd9b5877..bc486322 100644 --- a/subworkflows/local/cuppa_prediction.nf +++ b/subworkflows/local/cuppa_prediction.nf @@ -62,7 +62,7 @@ workflow CUPPA_PREDICTION { // (run exclusions currently done basis for presence of normal DNA) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.CUPPA_DIR) - def has_normal_dna = Utils.hasNormalDnaBam(meta) + def has_normal_dna = Utils.hasNormalDna(meta) def has_runnable_inputs = isofox_dir || (purple_dir && linx_annotation_dir && has_normal_dna) @@ -82,8 +82,8 @@ workflow CUPPA_PREDICTION { id: meta.group_id, ] - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) + def has_tumor_dna = Utils.hasTumorDna(meta) + def has_normal_dna = Utils.hasNormalDna(meta) def has_tumor_rna = Utils.hasTumorRnaBam(meta) def has_dna_inputs = (purple_dir && linx_annotation_dir) diff --git a/subworkflows/local/flagstat_metrics.nf b/subworkflows/local/flagstat_metrics.nf index 256d063d..cd2344a9 100644 --- a/subworkflows/local/flagstat_metrics.nf +++ b/subworkflows/local/flagstat_metrics.nf @@ -10,84 +10,64 @@ include { SAMTOOLS_FLAGSTAT } from '../../modules/nf-core/samtools/flagstat/main workflow FLAGSTAT_METRICS { take: // Sample data - ch_inputs // channel: [mandatory] [ meta ] + ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] main: // Channel for version.yml files // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) - - runnable: has_tumor_dna || has_normal_dna + // Sort inputs, separate by tumor and normal + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_tumor_sorted = ch_tumor_bam + .map { meta, bam, bai -> + return 
[ + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + ] + } + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.FLAGSTAT_TUMOR) + runnable: bam && !has_existing skip: true + meta } - // Flatten into BAM/BAI pairs, select inputs that are eligible to run - // channel: runnable: [ meta_extra, bam, bai ] - // channel: skip: [ meta_extra ] - ch_bams_bais_sorted = ch_inputs_sorted.runnable - .flatMap { meta -> - - def tumor_sample_id = [] - def tumor_bam = [] - def tumor_bai = [] - - def normal_sample_id = [] - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasTumorDnaBam(meta)) { - tumor_sample_id = Utils.getTumorDnaSampleName(meta) - tumor_bam = Utils.getTumorDnaBam(meta) - tumor_bai = Utils.getTumorDnaBai(meta) - } - - if (Utils.hasNormalDnaBam(meta)) { - normal_sample_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } - + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_normal_sorted = ch_normal_bam + .map { meta, bam, bai -> return [ - [[key: meta.group_id, *:meta, sample_id: tumor_sample_id, sample_type: 'tumor'], tumor_bam, tumor_bai], - [[key: meta.group_id, *:meta, sample_id: normal_sample_id, sample_type: 'normal'], normal_bam, normal_bai], + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), ] } - .branch { meta_extra, bam, bai -> - - def input_key - if (meta_extra.sample_type == 'tumor') { - input_key = Constants.INPUT.BAMTOOLS_TUMOR - } else if (meta_extra.sample_type == 'normal') { - input_key = Constants.INPUT.BAMTOOLS_NORMAL - } else { - assert false - } - - def has_existing = Utils.hasExistingInput(meta_extra, input_key) - - runnable: bam && bai && !has_existing + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.FLAGSTAT_NORMAL) + runnable: bam && !has_existing skip: true - return meta_extra + meta } // Create process input channel // channel: [ meta_flagstat, bam, bai ] - ch_flagstat_inputs = ch_bams_bais_sorted.runnable - .map { meta_extra, bam, bai -> + ch_flagstat_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bam, bai] }, + ch_inputs_normal_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getNormalDnaSample(meta), 'normal', bam, bai] }, + ) + .map { meta, meta_sample, sample_type, bam, bai -> def meta_flagstat = [ - key: meta_extra.group_id, - id: "${meta_extra.group_id}__${meta_extra.sample_id}", - sample_id: meta_extra.sample_id, - sample_type: meta_extra.sample_type, + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + sample_type: sample_type, ] return [meta_flagstat, bam, bai] @@ -100,34 +80,32 @@ workflow FLAGSTAT_METRICS { ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) - // Sort outputs into tumor and normal channels, adding partial skip entries - // channel: [ meta_flagstat, metrics ] - ch_outputs_sorted = Channel.empty() - .mix( - SAMTOOLS_FLAGSTAT.out.flagstat, - ch_bams_bais_sorted.skip.map { meta -> [meta, []] }, - ) - .branch { meta_flagstat, metrics -> + // Sort into a tumor and normal channel + ch_flagstat_out = SAMTOOLS_FLAGSTAT.out.flagstat 
+ .branch { meta_flagstat, flagstat -> + assert ['tumor', 'normal'].contains(meta_flagstat.sample_type) tumor: meta_flagstat.sample_type == 'tumor' normal: meta_flagstat.sample_type == 'normal' + placeholder: true } - // Set outputs, restoring original meta, including full skip entries - ch_somatic_metrics = Channel.empty() + // Set outputs, restoring original meta + // channel: [ meta, flagstat ] + ch_somatic_flagstat = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.tumor, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_flagstat_out.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, []] }, ) - ch_germline_metrics = Channel.empty() + ch_germline_flagstat = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.normal, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_flagstat_out.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, []] }, ) emit: - somatic = ch_somatic_metrics // channel: [ meta, metrics ] - germline = ch_germline_metrics // channel: [ meta, metrics ] + somatic = ch_somatic_flagstat // channel: [ meta, flagstat ] + germline = ch_germline_flagstat // channel: [ meta, flagstat ] - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/gridss_svprep_calling.nf b/subworkflows/local/gridss_svprep_calling.nf index 9a58c34c..71aed28a 100644 --- a/subworkflows/local/gridss_svprep_calling.nf +++ b/subworkflows/local/gridss_svprep_calling.nf @@ -17,6 +17,8 @@ workflow GRIDSS_SVPREP_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -38,16 +40,32 @@ workflow GRIDSS_SVPREP_CALLING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // Select input sources and sort + // channel: runnable_tn: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: runnable_to: [ meta, tumor_bam, tumor_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.GRIDSS_VCF) - runnable_tn: Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) && !has_existing - runnable_to: Utils.hasTumorDnaBam(meta) && !has_existing + runnable_tn: tumor_bam && normal_bam && !has_existing + runnable_to: tumor_bam && !has_existing + [meta, tumor_bam, tumor_bai] skip: true + meta } // @@ -57,10 +75,10 @@ workflow GRIDSS_SVPREP_CALLING { // channel: [ meta_svprep, bam_tumor, bai_tumor, [] ] ch_svprep_tumor_inputs = Channel.empty() .mix( - ch_inputs_sorted.runnable_to, + 
ch_inputs_sorted.runnable_to.map { [*it, [], []] }, ch_inputs_sorted.runnable_tn, ) - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_svprep = [ key: meta.group_id, @@ -68,12 +86,9 @@ workflow GRIDSS_SVPREP_CALLING { sample_id: Utils.getTumorDnaSampleName(meta), sample_type: 'tumor', // NOTE(SW): slightly redundant since we have this information then lose it with .mix above - group_size: Utils.hasNormalDnaBam(meta) ? 2 : 1 + group_size: normal_bam ? 2 : 1 ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - return [meta_svprep, tumor_bam, tumor_bai, []] } @@ -103,10 +118,13 @@ workflow GRIDSS_SVPREP_CALLING { // MODULE: SV Prep (normal) // // Create process input channel - // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_tn // channel: [ meta_svprep, bam_normal, bai_normal, junctions_tumor ] - ch_svprep_normal_inputs = WorkflowOncoanalyser.restoreMeta(SVPREP_TUMOR.out.junctions, ch_inputs_sorted.runnable_tn) - .map { meta, junctions_tumor -> + ch_svprep_normal_inputs = WorkflowOncoanalyser.groupByMeta( + ch_inputs_sorted.runnable_tn, + // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_tn + WorkflowOncoanalyser.restoreMeta(SVPREP_TUMOR.out.junctions, ch_inputs_sorted.runnable_tn.map { it[0] }) + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, junctions_tumor -> def meta_svprep = [ key: meta.group_id, @@ -116,9 +134,6 @@ workflow GRIDSS_SVPREP_CALLING { group_size: 2, // Assumption holds since germline only is not supported and we source from runnable_tn ] - def normal_bam = Utils.getNormalDnaBam(meta) - def normal_bai = Utils.getNormalDnaBai(meta) - return [meta_svprep, normal_bam, normal_bai, junctions_tumor] } @@ -299,13 +314,49 @@ workflow GRIDSS_SVPREP_CALLING { // MODULE: SV Prep depth annotation // // Restore original meta, create process input channel - // channel: tumor/normal: [ meta_svprep, [bams], [bais], vcf, [labels] ] - // channel: tumor only: [ meta_svprep, bam, bai, vcf, label ] - ch_depth_inputs = WorkflowOncoanalyser.restoreMeta(CALL.out.vcf, ch_inputs) - .map { meta, vcf -> + // channel: [ meta, [bams], [bais], vcf, [labels] ] + ch_depth_inputs_tn = WorkflowOncoanalyser.groupByMeta( + ch_inputs_sorted.runnable_tn, + // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_tn + WorkflowOncoanalyser.restoreMeta(CALL.out.vcf, ch_inputs_sorted.runnable_tn.map { it[0] }) + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, vcf -> + return [ + meta, + [normal_bam, tumor_bam], + [normal_bai, tumor_bai], + vcf, + [Utils.getNormalDnaSampleName(meta), Utils.getTumorDnaSampleName(meta)], + ] + } + + // channel: [ meta, bam, bai, vcf, label ] + ch_depth_inputs_to = WorkflowOncoanalyser.groupByMeta( + ch_inputs_sorted.runnable_to, + // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_to + WorkflowOncoanalyser.restoreMeta(CALL.out.vcf, ch_inputs_sorted.runnable_to.map { it[0] }) + ) + .map { meta, tumor_bam, tumor_bai, vcf -> + return [ + meta, + tumor_bam, + tumor_bai, + vcf, + Utils.getTumorDnaSampleName(meta), + ] + } + + // channel: runnable_tn: [ meta_svprep, [bams], [bais], vcf, [labels] ] + // channel: runnable_to: [ meta_svprep, bam, bai, vcf, label ] + ch_depth_inputs = Channel.empty() + .mix( + ch_depth_inputs_tn, + ch_depth_inputs_to, + ) + .map { d -> - // NOTE(SW): germline only is not currently supported - assert 
Utils.hasTumorDnaBam(meta) + def meta = d[0] + def fps = d[1..-1] def meta_svprep = [ key: meta.group_id, @@ -313,33 +364,7 @@ workflow GRIDSS_SVPREP_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta) ] - def data = [] - - if (Utils.hasNormalDnaBam(meta)) { - - data = [ - meta_svprep, - [Utils.getNormalDnaBam(meta), Utils.getTumorDnaBam(meta)], - [Utils.getNormalDnaBai(meta), Utils.getTumorDnaBai(meta)], - vcf, - [Utils.getNormalDnaSampleName(meta), Utils.getTumorDnaSampleName(meta)], - ] - - } else if (Utils.hasTumorDnaBam(meta)) { - - data = [ - meta_svprep, - Utils.getTumorDnaBam(meta), - Utils.getTumorDnaBai(meta), - vcf, - Utils.getTumorDnaSampleName(meta), - ] - - } else { - assert false - } - - return data + return [meta_svprep, *fps] } // Add depth annotations to calls diff --git a/subworkflows/local/gripss_filtering.nf b/subworkflows/local/gripss_filtering.nf index 7913be75..90f0cbd2 100644 --- a/subworkflows/local/gripss_filtering.nf +++ b/subworkflows/local/gripss_filtering.nf @@ -53,7 +53,7 @@ workflow GRIPSS_FILTERING { // channel: skip: [ meta ] ch_inputs_germline_sorted = ch_inputs_sorted.runnable .branch { meta, gridss_vcf -> - def has_tumor_normal = Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) + def has_tumor_normal = Utils.hasTumorDna(meta) && Utils.hasNormalDna(meta) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.GRIPSS_VCF_NORMAL) runnable: has_tumor_normal && !has_existing @@ -98,7 +98,7 @@ workflow GRIPSS_FILTERING { // channel: skip: [ meta ] ch_inputs_somatic_sorted = ch_inputs_sorted.runnable .branch { meta, gridss_vcf -> - def has_tumor = Utils.hasTumorDnaBam(meta) + def has_tumor = Utils.hasTumorDna(meta) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.GRIPSS_VCF_TUMOR) runnable: has_tumor && !has_existing @@ -117,7 +117,7 @@ workflow GRIPSS_FILTERING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { meta_gripss.normal_id = Utils.getNormalDnaSampleName(meta) } diff --git a/subworkflows/local/lilac_calling.nf b/subworkflows/local/lilac_calling.nf index 37980d18..eb8ac01d 100644 --- a/subworkflows/local/lilac_calling.nf +++ b/subworkflows/local/lilac_calling.nf @@ -14,6 +14,8 @@ workflow LILAC_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] ch_purple // channel: [mandatory] [ meta, purple_dir ] // Reference data @@ -28,42 +30,29 @@ workflow LILAC_CALLING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) + // Select input sources and sort for DNA BAMs + // channel: runnable: [ meta, tumor_dna_bam, tumor_dna_bai, normal_dna_bam, normal_dna_bai ] + // channel: skip: [ meta ] + ch_dna_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + ] 
+ } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.LILAC_DIR) - runnable: (has_tumor_dna || has_normal_dna) && !has_existing + runnable: (tumor_bam || normal_bam) && !has_existing skip: true - } - - // Create channel for DNA BAMs - // channel: [ meta, tumor_dna_bam, tumor_dna_bai, normal_dna_bam, normal_dna_bai ] - ch_dna_inputs = ch_inputs_sorted.runnable - .map { meta -> - - def tumor_bam = [] - def tumor_bai = [] - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasTumorDnaBam(meta)) { - tumor_bam = Utils.getTumorDnaBam(meta) - tumor_bai = Utils.getTumorDnaBai(meta) - } - - if (Utils.hasNormalDnaBam(meta)) { - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } - - return [meta, tumor_bam, tumor_bai, normal_bam, normal_bai] + meta } // Realign reads mapping to HLA regions and homologus regions if using reference genome with ALT contigs @@ -75,11 +64,11 @@ workflow LILAC_CALLING { // Flatten into BAM/BAI pairs, select inputs that are eligible to run // channel: runnable: [ meta_extra, bam, bai ] // channel: skip: [ meta_extra ] - ch_realign_inputs_sorted = ch_dna_inputs + ch_realign_inputs_sorted = ch_dna_inputs_sorted.runnable .flatMap { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> - def tumor_sample_id = Utils.hasTumorDnaBam(meta) ? Utils.getTumorDnaSampleName(meta) : [] - def normal_sample_id = Utils.hasNormalDnaBam(meta) ? Utils.getNormalDnaSampleName(meta) : [] + def tumor_sample_id = Utils.hasTumorDna(meta) ? Utils.getTumorDnaSampleName(meta) : [] + def normal_sample_id = Utils.hasNormalDna(meta) ? Utils.getNormalDnaSampleName(meta) : [] return [ [[key: meta.group_id, *:meta, sample_id: tumor_sample_id, sample_type: 'tumor'], tumor_bam, tumor_bai], @@ -168,7 +157,7 @@ workflow LILAC_CALLING { } else { // channel: [ meta, tumor_dna_bam, tumor_dna_bai, normal_dna_bam, normal_dna_bai ] - ch_dna_inputs_ready = ch_dna_inputs + ch_dna_inputs_ready = ch_dna_inputs_sorted.runnable } @@ -205,11 +194,11 @@ workflow LILAC_CALLING { id: meta.group_id, ] - if (Utils.hasTumorDnaBam(meta)) { + if (Utils.hasTumorDna(meta)) { meta_lilac.tumor_id = Utils.getTumorDnaSampleName(meta) } - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { meta_lilac.normal_id = Utils.getNormalDnaSampleName(meta) } @@ -240,7 +229,7 @@ workflow LILAC_CALLING { ch_outputs = Channel.empty() .mix( WorkflowOncoanalyser.restoreMeta(LILAC.out.lilac_dir, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + ch_dna_inputs_sorted.skip.map { meta -> [meta, []] }, ) emit: diff --git a/subworkflows/local/linx_annotation.nf b/subworkflows/local/linx_annotation.nf index e5c9e26a..5ae97275 100644 --- a/subworkflows/local/linx_annotation.nf +++ b/subworkflows/local/linx_annotation.nf @@ -53,7 +53,7 @@ workflow LINX_ANNOTATION { def tumor_id = Utils.getTumorDnaSampleName(meta) - def has_tumor_normal = Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) + def has_tumor_normal = Utils.hasTumorDna(meta) && Utils.hasNormalDna(meta) def has_sv_germline_vcf = file(purple_dir).resolve("${tumor_id}.purple.sv.germline.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PURPLE_DIR) @@ -99,7 +99,7 @@ workflow LINX_ANNOTATION { ch_inputs_somatic_sorted = ch_inputs_sorted.runnable .branch { meta, purple_dir -> - def has_tumor = Utils.hasTumorDnaBam(meta) + def has_tumor = Utils.hasTumorDna(meta) def has_existing = Utils.hasExistingInput(meta, 
Constants.INPUT.PURPLE_DIR) runnable: has_tumor && !has_existing diff --git a/subworkflows/local/pave_annotation.nf b/subworkflows/local/pave_annotation.nf index 98bbd273..ea15ed27 100644 --- a/subworkflows/local/pave_annotation.nf +++ b/subworkflows/local/pave_annotation.nf @@ -51,7 +51,7 @@ workflow PAVE_ANNOTATION { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PAVE_VCF_NORMAL) - runnable: Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) && sage_vcf && !has_existing + runnable: Utils.hasTumorDna(meta) && Utils.hasNormalDna(meta) && sage_vcf && !has_existing skip: true return meta } @@ -105,7 +105,7 @@ workflow PAVE_ANNOTATION { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PAVE_VCF_TUMOR) - runnable: Utils.hasTumorDnaBam(meta) && sage_vcf && !has_existing + runnable: Utils.hasTumorDna(meta) && sage_vcf && !has_existing skip: true return meta } diff --git a/subworkflows/local/prepare_reference.nf b/subworkflows/local/prepare_reference.nf index b7df9251..fc7b724f 100644 --- a/subworkflows/local/prepare_reference.nf +++ b/subworkflows/local/prepare_reference.nf @@ -2,6 +2,11 @@ // Prepare reference data as required // + +// NOTE(SW): BWA MEM2 indexes are required and are not created +// TODO(SW): consider removing index creation since it's unlikely to be used, replace with documentation + + include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx/main' include { SAMTOOLS_DICT } from '../../modules/nf-core/samtools/dict/main' include { BWA_INDEX } from '../../modules/nf-core/bwa/index/main' @@ -94,6 +99,10 @@ workflow PREPARE_REFERENCE { } } + // Explicitly create value channels for BWA MEM2 index files + ch_genome_bwa_index_bseq = Channel.value(params.ref_data_genome_bwa_index_bseq) + ch_genome_bwa_index_biidx = Channel.value(params.ref_data_genome_bwa_index_biidx) + // // Set VIRUSBreakend database path / stage, unpack if required // @@ -168,6 +177,8 @@ workflow PREPARE_REFERENCE { genome_fai = ch_genome_fai // path: genome_fai genome_dict = ch_genome_dict // path: genome_dict genome_bwa_index = ch_genome_bwa_index // path: genome_bwa_index + genome_bwa_index_bseq = ch_genome_bwa_index_bseq // path: genome_bwa_index_bseq + genome_bwa_index_biidx = ch_genome_bwa_index_biidx // path: genome_bwa_index_biidx genome_bwa_index_image = ch_genome_bwa_index_image // path: genome_bwa_index_image genome_gridss_index = ch_genome_gridss_index // path: genome_gridss_index genome_version = params.ref_data_genome_version // val: genome_version diff --git a/subworkflows/local/purple_calling.nf b/subworkflows/local/purple_calling.nf index 3070cc1e..182b460a 100644 --- a/subworkflows/local/purple_calling.nf +++ b/subworkflows/local/purple_calling.nf @@ -102,7 +102,7 @@ workflow PURPLE_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { meta_purple.normal_id = Utils.getNormalDnaSampleName(meta) } diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index bee3cdbd..809e9982 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -1,301 +1,234 @@ -include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' +include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' include { FASTP } from '../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' -include { STAR } from '../../modules/local/star/main' workflow READ_ALIGNMENT { take: - // Sample data - 
ch_inputs // channel: [mandatory] [ meta ] - genome_fasta - genome_bwa_index - max_fastq_records + // Sample data + ch_inputs // channel: [mandatory] [ meta ] - main: - // Channel for version.yml files - // channel: [ versions.yml ] - ch_versions = Channel.empty() - - // channel: [ group_id, sample_count ] - ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } - - // channel: [ meta ] (One sample per record). - ch_meta_samples = ch_inputs.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } - - // Sort inputs - // channel: [ meta ] (One sample per record). - ch_meta_samples_sorted = ch_meta_samples - .branch { meta -> - runnable_fastq: Utils.hasDnaFastq(meta) - skip: true - } - - // STAR - // TODO(SW): implement inputs - // ch_star_inputs = Channel.of([[id: 'foo'], []]) - // STAR( - // ch_star_inputs, - // // TODO(SW): include reference files - // ) - // TODO(SW): implement outputs - ch_star_outputs = Channel.empty() - - // BWA MEM2 - // channel: [ sample_key, fastq_pair_count ] - ch_sample_fastq_pair_count = ch_meta_samples_sorted.runnable_fastq.map { meta_sample -> - - def sample_key = Utils.shallow_copy(meta_sample) - def fastq_pair_count = 0 - meta_sample.each { key, value -> - - if ((value instanceof java.util.Map) && value.containsKey('sample_id')) { - sample_key['sample_key'] = key - sample_key['sample_id'] = value.sample_id - sample_key.remove(key) - - fastq_pair_count = value[Constants.FileType.FASTQ].toString().tokenize(';').size() / 2 - } - } + // Reference data + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_bwa_index // channel: [mandatory] /path/to/genome_bwa_index_dir/ + genome_bwa_index_bseq // channel: [mandatory] /path/to/genome_bwa_index_binary_seq + genome_bwa_index_biidx // channel: [mandatory] /path/to/genome_bwa_index_bi-index - [sample_key, fastq_pair_count] - } + // Params + max_fastq_records // numeric: [mandatory] max number of FASTQ records per split - // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_fastq_pairs = ch_meta_samples_sorted.runnable_fastq - .flatMap { meta -> - - def sample_key = Constants.DNA_SAMPLE_KEYS.find { key -> meta.containsKey(key) } - if (sample_key === null) { - log.error "No DNA sample found" - System.exit(1) + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // Sort inputs, separate by tumor and normal + // channel: [ meta ] + ch_inputs_tumor_sorted = ch_inputs + .branch { meta -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_TUMOR) + runnable: Utils.hasTumorDnaFastq(meta) && !has_existing + skip: true } - def sample_id = meta[sample_key]['sample_id'] - def fastq_files = meta[sample_key][Constants.FileType.FASTQ].toString().tokenize(';') - - def meta_fastq_common = [id: "${meta.group_id}__${sample_id}"] - meta.each { key, value -> - - if (key === sample_key) { - return - } + ch_inputs_normal_sorted = ch_inputs + .branch { meta -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_NORMAL) + runnable: Utils.hasNormalDnaFastq(meta) && !has_existing + skip: true + } - meta_fastq_common[key] = meta[key] + // Create FASTQ input channel + // channel: [ meta_fastq, fastq_fwd, fastq_rev ] + ch_fastq_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta -> [meta, Utils.getTumorDnaSample(meta), 'tumor'] }, + ch_inputs_normal_sorted.runnable.map { meta -> [meta, Utils.getNormalDnaSample(meta), 'normal'] }, + ) + .flatMap { meta, 
meta_sample, sample_type -> + meta_sample + .getAt(Constants.FileType.FASTQ) + .collect { key, fps -> + def (library_id, lane) = key + + def meta_fastq = [ + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + library_id: library_id, + lane: lane, + sample_type: sample_type, + ] + + return [meta_fastq, fps['fwd'], fps['rev']] + } } - meta_fastq_common['sample_key'] = sample_key - meta_fastq_common['sample_id'] = sample_id - def fastq_pairs = [] - for (i = 0; i < fastq_files.size(); i += 2) { - def reads_fwd = fastq_files[i] - def reads_rev = fastq_files[i + 1] + // + // MODULE: fastp + // + // Split FASTQ into chunks if requested for distributed processing + // channel: [ meta_fastq_ready, fastq_fwd, fastq_fwd ] + ch_fastqs_ready = Channel.empty() + if (max_fastq_records > 0) { - def meta_fastq = Utils.shallow_copy(meta_fastq_common) - meta_fastq['read_group'] = Utils.readGroupFromFastqPath(reads_fwd, reads_rev) + // Run process + FASTP( + ch_fastq_inputs, + max_fastq_records, + ) - fastq_pairs.add([meta_fastq, reads_fwd, reads_rev]) - } + ch_versions = ch_versions.mix(FASTP.out.versions) - fastq_pairs - } + // Prepare outputs within conditional block + ch_fastqs_ready = FASTP.out.fastq + .flatMap { meta_fastq, reads_fwd, reads_rev -> - // Split fastq files using fastp. - // channel: [ meta_fastq, reads_fwd_fastqs, reads_rev_fastqs ] - ch_split_fastq_pairs = Channel.empty() - if (max_fastq_records > 0) { - FASTP( - ch_fastq_pairs, - max_fastq_records, - ) + def data = [reads_fwd, reads_rev] + .transpose() + .collect { fwd, rev -> - ch_versions = ch_versions.mix(FASTP.out.versions) + def split_fwd = fwd.name.replaceAll('\\..+$', '') + def split_rev = rev.name.replaceAll('\\..+$', '') - ch_split_fastq_pairs = FASTP.out.fastq - } else { - ch_split_fastq_pairs = ch_fastq_pairs.map { fastq_pair -> [fastq_pair[0], [fastq_pair[1]], [fastq_pair[2]]] } - } + assert split_fwd == split_rev - // channel: [ sample_key, fastq_pair_split_count ] - ch_sample_fastq_pair_split_count = ch_sample_fastq_pair_count - .cross( - ch_split_fastq_pairs.map { split_fastq_pairs -> + // NOTE(SW): split allows meta_fastq_ready to be unique, which is required during reunite below + def meta_fastq_ready = [ + *:meta_fastq, + id: "${meta_fastq.id}_${split_fwd}", + split: split_fwd, + ] - def meta_sample = split_fastq_pairs[0] - def sample_key = Utils.shallow_copy(meta_sample) - sample_key.remove('id') - sample_key.remove('read_group') - sample_key.remove(meta_sample.sample_key) + return [meta_fastq_ready, fwd, rev] + } - [sample_key, split_fastq_pairs[1].size()] - } - ) - .map { count_tuple, split_count_tuple -> - def sample_key = count_tuple[0] - def count = count_tuple[1].intValue() - def split_count = split_count_tuple[1] + return data + } - tuple(groupKey(sample_key, count), sample_key, split_count) - } - .groupTuple() - .map { group_key, sample_keys, split_counts -> + } else { - [sample_keys[0], split_counts.sum()] - } + ch_fastqs_ready = ch_fastq_inputs + .map { meta_fastq, fastq_fwd, fastq_rev -> - // Create inputs for bwa mem. - // channel: [ meta_fastq, reads_fwd_fastq, reads_rev_fastq ] - ch_bwa_mem_inputs = ch_split_fastq_pairs.flatMap { fastq -> - def meta = fastq[0] - def fwd_reads = fastq[1] - def rev_reads = fastq[2] - - // Pair up the reads. 
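The FASTP module invoked above is not part of this hunk; assuming it wraps fastp's output splitting (fastp counts lines, at four lines per FASTQ record), its script block would look roughly like the following sketch — the input/output variable names here are illustrative only:

    script:
    // max_fastq_records is a record count; fastp's flag takes lines (4 per record)
    """
    fastp \\
        --in1 ${fastq_fwd} \\
        --in2 ${fastq_rev} \\
        --disable_adapter_trimming \\
        --split_by_lines ${4 * max_fastq_records} \\
        --out1 ${meta.id}_R1.fastp.fastq.gz \\
        --out2 ${meta.id}_R2.fastp.fastq.gz
    """

fastp prefixes each split output with a zero-padded index (0001., 0002., ...), which is what the reunite logic relies on when it derives the split identifier from everything before the first dot of the file name.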
- def read_pairs = [:] - if (fwd_reads.size() == 1) { - read_pairs[""] = ["000", fwd_reads[0], rev_reads[0]] - } else { - fwd_reads.each { fastq_path -> - - def base_name = fastq_path.getFileName().toString() - def pattern = /^(\d+)\.(.+)_R[12]\.fastp\.fastq$/ - def matcher = base_name =~ pattern - assert matcher.find() - def split = matcher[0][1] - def key = "${split}.${matcher[0][2]}" - assert !read_pairs.containsKey(key) - read_pairs[key] = [split, fastq_path] - } + def meta_fastq_ready = [ + *:meta_fastq, + split: null, + ] - rev_reads.each { fastq_path -> + return [meta_fastq_ready, fastq_fwd, fastq_rev] + } - def base_name = fastq_path.getFileName().toString() - def pattern = /^(.+)_R[12]\.fastp\.fastq$/ - def matcher = base_name =~ pattern - assert matcher.find() - def key = matcher[0][1] - assert read_pairs.containsKey(key) - read_pairs[key].add(fastq_path) - } } - def fastqs = [] - read_pairs.values().each { split_fastq_pair -> + // + // MODULE: BWA-MEM2 + // + // Create process input channel + // channel: [ meta_bwa, fastq_fwd, fastq_rev ] + ch_bwa_inputs = ch_fastqs_ready + .map { meta_fastq_ready, fastq_fwd, fastq_rev -> - meta_fastq = Utils.shallow_copy(meta) - meta_fastq['split'] = split_fastq_pair[0] + def meta_bwa = [ + *:meta_fastq_ready, - fastqs.add([meta_fastq, split_fastq_pair[1], split_fastq_pair[2]]) - } - fastqs - } + // TODO(SW): understand target format + read_group: "${meta_fastq_ready.sample_id}.${meta_fastq_ready.library_id}.${meta_fastq_ready.lane}", - // channel: [ meta_fastq, bam ] - BWA_MEM2( - ch_bwa_mem_inputs, - genome_fasta, - genome_bwa_index, - ) - ch_versions = ch_versions.mix(BWA_MEM2.out.versions) + ] - // channel: [ meta_fastq, bam, bai ] - SAMBAMBA_INDEX( - BWA_MEM2.out.bam, - ) + return [meta_bwa, fastq_fwd, fastq_rev] - ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) + } - // Merge all bam records for a single sample into a singlke record. - // channel: [ meta ] (One sample per meta record). 
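On the read_group TODO above: bwa-mem2 accepts a full SAM @RG header line via -R, so the dotted sample.library.lane string built here would typically be expanded inside the BWA_MEM2 module along these lines. This is a sketch only — the SM field and the sort step are assumptions, and the series itself flags the target format as undecided:

    script:
    """
    bwa-mem2 mem \\
        -R '@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}' \\
        -t ${task.cpus} \\
        ${genome_fasta} \\
        ${fastq_fwd} ${fastq_rev} | \\
        samtools sort -o ${meta.id}.bam -
    """
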
- ch_merged_bam_samples = ch_sample_fastq_pair_split_count - .cross( - SAMBAMBA_INDEX.out.bam - .map { bam -> + // Run process + BWA_MEM2( + ch_bwa_inputs, + genome_fasta, + genome_bwa_index, + genome_bwa_index_bseq, + genome_bwa_index_biidx, + ) - def meta_bam = bam[0] - def sample_key = Utils.shallow_copy(meta_bam) - sample_key.remove('id') - sample_key.remove(meta_bam.sample_key) - sample_key.remove('read_group') - sample_key.remove('split') + ch_versions = ch_versions.mix(BWA_MEM2.out.versions) - [sample_key, bam] - } + // + // MODULE: Sambamba index + // + SAMBAMBA_INDEX( + BWA_MEM2.out.bam, ) - .map { count_tuple, bam_tuple -> - def sample_key = count_tuple[0] - def count = count_tuple[1] - def bam = bam_tuple[1] - - tuple(groupKey(sample_key, count), bam) - } - .groupTuple() - .map { group_key, bams -> + ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) - def first_meta_bam = bams[0][0] - def sample_key = first_meta_bam.sample_key + // Combine BAMs and BAIs + ch_bams_flat = Channel.empty() + .mix( + BWA_MEM2.out.bam, + SAMBAMBA_INDEX.out.bai + ) + .groupTuple(size: 2) + .map { it.flatten() } - def bam_files = [] - def bai_files = [] - def meta_bam = Utils.shallow_copy(first_meta_bam) - meta_bam.remove(sample_key) - meta_bam.remove('sample_key') - meta_bam.remove('sample_id') - meta_bam.remove('read_group') - meta_bam.remove('split') - meta_bam[sample_key] = [sample_id: first_meta_bam.sample_id] - meta_bam[sample_key][Constants.FileType.BAM_MARKDUPS] = bam_files - meta_bam[sample_key][Constants.FileType.BAI_MARKDUPS] = bai_files + // Reunite BAMs + // First, count expected BAMs per sample for non-blocking groupTuple op + ch_sample_fastq_counts = ch_bwa_inputs + .map { meta_bwa, reads_fwd, reads_rev -> - bams.each { bam -> + def meta_count = [ + key: meta_bwa.key, + sample_type: meta_bwa.sample_type, + ] - bam_files.add(bam[1]) - bai_files.add(bam[2]) + return [meta_count, meta_bwa] } + .groupTuple() + .map { meta_count, meta_bwas -> return [meta_count, meta_bwas.size()] } - meta_bam - } + // Now, group with expected size then sort into tumor and normal channels + ch_bams_united = ch_sample_fastq_counts + .cross( + // First element to match meta_count above for `cross` + ch_bams_flat.map { meta_bwa, bam, bai -> [[key: meta_bwa.key, sample_type: meta_bwa.sample_type], bam, bai] } + ) + .map { count_tuple, bam_tuple -> - // Merge back in skipped meta entries. - // channel: [ meta ] (One sample per meta record). - ch_all_samples = Channel.empty() - .mix( - ch_merged_bam_samples, - ch_meta_samples_sorted.skip, - ) + def group_size = count_tuple[1] + def (meta_bam, bam, bai) = bam_tuple - // Merge individual sample records back into group records without blocking for the whole channel to be processed. 
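For readers unfamiliar with the idiom referenced above: a bare groupTuple() is blocking because Nextflow cannot know when a group is complete, whereas wrapping the key with groupKey(key, n) declares the expected group size so each group is emitted as soon as its n-th element arrives. A self-contained toy example with hypothetical channels:

    ch_counts = Channel.of(['sampleA', 2])                              // [ key, expected item count ]
    ch_items  = Channel.of(['sampleA', 'x.bam'], ['sampleA', 'y.bam'])  // [ key, item ]

    ch_items
        .combine(ch_counts, by: 0)                         // [ key, item, n ]
        .map { key, item, n -> tuple(groupKey(key, n), item) }
        .groupTuple()                                      // emits once n items are seen
        .view()                                            // [ sampleA, [x.bam, y.bam] ]
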
- // channel: [ meta_bam ] - ch_bwa_outputs = ch_sample_counts - .cross( - ch_all_samples.map { meta -> [meta.group_id, meta] } - ) - .map { count_tuple, meta_tuple -> - def group_id = count_tuple[0] - def count = count_tuple[1] - def meta = meta_tuple[1] - - tuple(groupKey(group_id, count), meta) - } - .groupTuple() - .map { group_key, meta_samples -> + def meta_group = [ + *:meta_bam, + ] - def meta_group = [:] - meta_samples.each { meta_sample -> - - meta_sample.each { key, value -> meta_group[key] = value } + return tuple(groupKey(meta_group, group_size), bam, bai) + } + .groupTuple() + .branch { meta_group, bams, bais -> + assert ['tumor', 'normal'].contains(meta_group.sample_type) + tumor: meta_group.sample_type == 'tumor' + normal: meta_group.sample_type == 'normal' + placeholder: true } - meta_group - } + // Set outputs, restoring original meta + // channel: [ meta, [bam, ...], [bai, ...] ] + ch_bam_tumor_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_bams_united.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, [], []] }, + ) + + ch_bam_normal_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_bams_united.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, [], []] }, + ) emit: - dna = ch_bwa_outputs // channel: [ meta ] - // TODO(SW): RNA alignment. - rna = ch_star_outputs // channel: [ meta, bam_rna ] - versions = ch_versions // channel: [ versions.yml ] + dna_tumor = ch_bam_tumor_out // channel: [ meta, [bam, ...], [bai, ...] ] + dna_normal = ch_bam_normal_out // channel: [ meta, [bam, ...], [bai, ...] ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index bdd71a5a..d3cef837 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -1,140 +1,115 @@ -include { MARKDUPS } from '../../modules/local/markdups/main' +include { MARKDUPS } from '../../modules/local/markdups/main' workflow READ_PROCESSING { take: - // Sample data - ch_inputs // channel: [mandatory] [ meta ] - ch_dna_bams // channel: [mandatory] [ meta, bam_dna ] - ch_rna_bams // channel: [mandatory] [ meta, bam_rna ] - genome_ver - genome_fasta - genome_fai - genome_dict - unmap_regions - has_umis + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + ch_dna_tumor // channel: [mandatory] [ meta, [bam, ...], [bai, ...] ] + ch_dna_normal // channel: [mandatory] [ meta, [bam, ...], [bai, ...] ] - main: - // Channel for version.yml files - // channel: [ versions.yml ] - ch_versions = Channel.empty() - - // channel: [ group_id, sample_count ] - ch_sample_counts = ch_inputs.map { meta -> [meta.group_id, Utils.groupSampleCounts(meta)] } - - // channel: [ meta ] (One sample per record). - ch_meta_samples = ch_dna_bams.flatMap { meta -> Utils.splitGroupIntoSamples(meta) } + // Reference data + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_ver // channel: [mandatory] genome version + genome_fai // channel: [mandatory] /path/to/genome_fai + genome_dict // channel: [mandatory] /path/to/genome_dict + unmap_regions // channel: [mandatory] /path/to/unmap_regions - // Sort inputs - // channel: [ meta ] (One sample per record). - ch_meta_samples_sorted = ch_meta_samples - .branch { meta -> - runnable: Utils.hasDnaMarkdupsBam(meta) - skip: true - } + // Params + has_umis // boolean: [mandatory] UMI processing flag - // MarkDups - // Prepare input to markdups process. 
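Note the emitted shape here: each sample leaves alignment as a list of per-split BAMs plus matching indexes, with merging deferred to the duplicate-marking step. A later patch in this series adds a sambamba merge module (see the file list under patch 48); were a standalone merge needed, a minimal module would look roughly like this sketch (container and resource labels omitted):

    process SAMBAMBA_MERGE {
        input:
        tuple val(meta), path(bams)

        output:
        tuple val(meta), path('merged.bam'), emit: bam

        script:
        // a path collection interpolates as a space-separated list
        """
        sambamba merge -t ${task.cpus} merged.bam ${bams}
        """
    }
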
- // channel: [ meta_bam, bams, bais ] - ch_markdups_inputs = ch_meta_samples_sorted.runnable - .map { meta_sample -> - - def meta_bam = Utils.shallow_copy(meta_sample) - def bams = [] - def bais = [] - meta_sample.each { key, value -> - - if ((value instanceof java.util.Map) && value.containsKey('sample_id')) { - meta_bam['sample_id'] = value.sample_id - meta_bam['sample_key'] = key - bams = value[Constants.FileType.BAM_MARKDUPS] - bais = value[Constants.FileType.BAI_MARKDUPS] - } + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // Select and sort input sources, separating by tumor and normal + // channel: runnable: [ meta, bams, bais ] + // channel: skip: [ meta ] + ch_inputs_tumor_sorted = ch_dna_tumor + .map { meta, bams, bais -> + return [ + meta, + Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_TUMOR) ? [Utils.getInput(meta, Constants.INPUT.BAM_DNA_TUMOR)] : bams, + Utils.hasExistingInput(meta, Constants.INPUT.BAI_DNA_TUMOR) ? [Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR)] : bais, + ] }
 - - meta_bam['id'] = "${meta_bam.group_id}__${meta_bam.sample_id}" - - if (!(bams instanceof Collection)) { - bams = [bams] + .branch { meta, bams, bais -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR) + runnable: bams && !has_existing + skip: true + return meta } - if (!(bais instanceof Collection)) { - bais = [bais] + ch_inputs_normal_sorted = ch_dna_normal + .map { meta, bams, bais -> + return [ + meta, + Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_NORMAL) ? [Utils.getInput(meta, Constants.INPUT.BAM_DNA_NORMAL)] : bams, + Utils.hasExistingInput(meta, Constants.INPUT.BAI_DNA_NORMAL) ? [Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL)] : bais, + ] + } + .branch { meta, bams, bais -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL) + runnable: bams && !has_existing + skip: true + return meta } - [meta_bam, bams, bais] - } - - // channel: [ meta_bam, bam, bai ] - MARKDUPS( - ch_markdups_inputs, - genome_ver, - genome_fasta, - genome_fai, - genome_dict, - unmap_regions, - has_umis, - ) - - ch_versions = ch_versions.mix(MARKDUPS.out.versions) - - // Update sample information. - // channel: [ meta ] (One sample per meta record). - ch_bam_samples = MARKDUPS.out.bam.map { bam -> - - def meta_bam = bam[0] - - def meta = Utils.shallow_copy(meta_bam) - meta.remove('id') - meta.remove('sample_id') - meta.remove('sample_key') - - def sample = [sample_id: meta_bam.sample_id] - sample[Constants.FileType.BAM] = bam[1] - sample[Constants.FileType.BAI] = bam[2] - meta[meta_bam.sample_key] = sample - - meta - } - - // Merge back in skipped meta entries. - // channel: [ meta ] (One sample per meta record).
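Both reworked subworkflows now finish the same way: process outputs are keyed by meta.key (the group id), rejoined to their full original meta via WorkflowOncoanalyser.restoreMeta, and mixed with empty placeholders for skipped samples. restoreMeta itself is not shown in this series; one plausible implementation of that keyed rejoin, labeled hypothetical:

    // Hypothetical sketch only — rejoin process outputs to their original input meta
    static def restoreMeta(ch_process, ch_inputs) {
        ch_inputs
            .map { meta -> [meta.group_id, meta] }
            .cross(ch_process.map { d -> [d[0].key, d[1..-1]] })   // match on group id
            .map { input_tuple, process_tuple -> [input_tuple[1], *process_tuple[1]] }
    }
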
- ch_all_samples = Channel.empty() - .mix( - ch_bam_samples, - ch_meta_samples_sorted.skip, - ) + // Create process input channel + // channel: [ meta_markdups, bam, bai ] + ch_markdups_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bam, bai] }, + ch_inputs_normal_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getNormalDnaSample(meta), 'normal', bam, bai] }, + ) + .map { meta, meta_sample, sample_type, bam, bai -> + + def meta_markdups = [ + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + sample_type: sample_type, + ] + + return [meta_markdups, bam, bai] + } - // Merge individual sample records back into group records without blocking for the whole channel to be processed. - // channel: [ meta_bam ] - ch_markduplicates_dna_out = ch_sample_counts - .cross( - ch_all_samples.map { meta -> [meta.group_id, meta] } + // Run process + MARKDUPS( + ch_markdups_inputs, + genome_fasta, + genome_ver, + genome_fai, + genome_dict, + unmap_regions, + has_umis, ) - .map { count_tuple, meta_tuple -> - - def group_id = count_tuple[0] - def count = count_tuple[1] - def meta = meta_tuple[1] - - tuple(groupKey(group_id, count), meta) - } - .groupTuple() - .map { group_key, meta_samples -> - - def meta_group = [:] - meta_samples.each { meta_sample -> - meta_sample.each { key, value -> meta_group[key] = value } + // Sort into a tumor and normal channel + ch_markdups_out = MARKDUPS.out.bam + .branch { meta_markdups, bam, bai -> + assert ['tumor', 'normal'].contains(meta_markdups.sample_type) + tumor: meta_markdups.sample_type == 'tumor' + normal: meta_markdups.sample_type == 'normal' + placeholder: true } - meta_group - } + // Set outputs, restoring original meta + // channel: [ meta, bam, bai ] + ch_bam_tumor_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_markdups_out.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, [], []] }, + ) - // TODO(SW): implement outputs - ch_markduplicates_rna_out = Channel.empty() + ch_bam_normal_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_markdups_out.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, [], []] }, + ) emit: - dna = ch_markduplicates_dna_out // channel: [ meta ] - rna = ch_markduplicates_rna_out // channel: [ meta, bam_rna ] - versions = ch_versions // channel: [ versions.yml ] + dna_tumor = ch_bam_tumor_out // channel: [ meta, bam, bai ] + dna_normal = ch_bam_normal_out // channel: [ meta, bam, bai ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/sage_append.nf b/subworkflows/local/sage_append.nf index 342069b1..29a9c438 100644 --- a/subworkflows/local/sage_append.nf +++ b/subworkflows/local/sage_append.nf @@ -104,7 +104,7 @@ workflow SAGE_APPEND { .branch { meta, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) - def has_tumor_dna = Utils.hasTumorDnaBam(meta) + def has_tumor_dna = Utils.hasTumorDna(meta) def has_tumor_rna = Utils.hasTumorRnaBam(meta) def has_smlv_somatic = file(purple_dir).resolve("${tumor_dna_id}.purple.somatic.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_APPEND_VCF_TUMOR) diff --git a/subworkflows/local/sage_calling.nf b/subworkflows/local/sage_calling.nf index 5c05a511..cdec723b 100644 --- a/subworkflows/local/sage_calling.nf +++ b/subworkflows/local/sage_calling.nf @@ -12,6 +12,8 @@ workflow SAGE_CALLING { take: // 
Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -33,31 +35,47 @@ workflow SAGE_CALLING { ch_versions = Channel.empty() // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - runnable: Utils.hasTumorDnaBam(meta) + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + runnable: tumor_bam skip: true + meta } // // MODULE: SAGE germline // // Select inputs that are eligible to run - // channel: [ meta ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] ch_inputs_germline_sorted = ch_inputs_sorted.runnable - .branch { meta -> - def has_tumor_normal = Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + def has_tumor_normal = tumor_bam && normal_bam def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_VCF_NORMAL) runnable: has_tumor_normal && !has_existing skip: true + meta } // Create process input channel - // channel: [ meta_sage, tbam, nbam, tbai, nbai ] + // channel: [ meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai ] ch_sage_germline_inputs = ch_inputs_germline_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_sage = [ key: meta.group_id, @@ -66,16 +84,7 @@ workflow SAGE_CALLING { normal_id: Utils.getNormalDnaSampleName(meta), ] - data = [ - meta_sage, - Utils.getTumorDnaBam(meta), - Utils.getNormalDnaBam(meta), - Utils.getTumorDnaBai(meta), - Utils.getNormalDnaBai(meta), - ] - - return data - + return [meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai] } // Run process @@ -98,21 +107,23 @@ workflow SAGE_CALLING { // MODULE: SAGE somatic // // Select inputs that are eligible to run - // channel: [ meta ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] ch_inputs_somatic_sorted = ch_inputs_sorted.runnable - .branch { meta -> - def has_tumor = Utils.hasTumorDnaBam(meta) + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + def has_tumor = tumor_bam def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_VCF_TUMOR) runnable: has_tumor && !has_existing skip: true + meta } // Create process input channel - // channel: tumor/normal: [ meta_sage, tbam, nbam, tbai, nbai ] - // channel: tumor only: [ meta_sage, tbam, [], tbai, [] ] + // channel: tumor/normal: [ meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai ] + // channel: tumor only: [ meta_sage, tumor_bam, [], tumor_bai, [] ] ch_sage_somatic_inputs = ch_inputs_somatic_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, 
normal_bai -> def meta_sage = [ key: meta.group_id, @@ -120,33 +131,7 @@ workflow SAGE_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def data = [] - if (Utils.hasNormalDnaBam(meta)) { - - meta_sage.normal_id = Utils.getNormalDnaSampleName(meta) - - data = [ - meta_sage, - Utils.getTumorDnaBam(meta), - Utils.getNormalDnaBam(meta), - Utils.getTumorDnaBai(meta), - Utils.getNormalDnaBai(meta), - ] - - } else { - - data = [ - meta_sage, - Utils.getTumorDnaBam(meta), - [], - Utils.getTumorDnaBai(meta), - [], - ] - - } - - return data - + return [meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai] } // Run process diff --git a/subworkflows/local/sigs_fitting.nf b/subworkflows/local/sigs_fitting.nf index 4a2f159b..9d47f8f7 100644 --- a/subworkflows/local/sigs_fitting.nf +++ b/subworkflows/local/sigs_fitting.nf @@ -34,7 +34,7 @@ workflow SIGS_FITTING { ch_inputs_sorted = ch_inputs_selected .branch { meta, purple_dir -> - def has_dna = Utils.hasTumorDnaBam(meta) + def has_dna = Utils.hasTumorDna(meta) def tumor_id def has_smlv_vcf diff --git a/subworkflows/local/virusbreakend_calling.nf b/subworkflows/local/virusbreakend_calling.nf index c32389da..87e16488 100644 --- a/subworkflows/local/virusbreakend_calling.nf +++ b/subworkflows/local/virusbreakend_calling.nf @@ -12,6 +12,7 @@ workflow VIRUSBREAKEND_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] ch_purple // channel: [mandatory] [ meta, purple_dir ] ch_bamtools_somatic // channel: [mandatory] [ meta, metrics ] @@ -36,14 +37,20 @@ workflow VIRUSBREAKEND_CALLING { // Sort inputs // NOTE(SW): VIRUSBreakend inputs are not allowed in the samplesheet, so aren't considered - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - + // channel: [ meta, tumor_bam, tumor_bai ] + ch_inputs_sorted = ch_tumor_bam + .map { meta, tumor_bam, tumor_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + ] + } + .branch { meta, tumor_bam, tumor_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.VIRUSINTERPRETER_DIR) - - runnable: Utils.hasTumorDnaBam(meta) && !has_existing + runnable: tumor_bam && !has_existing skip: true + meta } // @@ -52,7 +59,7 @@ workflow VIRUSBREAKEND_CALLING { // Create process input channel // channel: [ meta_virus, tumor_bam ] ch_virusbreakend_inputs = ch_inputs_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai -> def meta_virus = [ key: meta.group_id, @@ -60,7 +67,7 @@ workflow VIRUSBREAKEND_CALLING { sample_id: Utils.getTumorDnaSampleName(meta), ] - return [meta_virus, Utils.getTumorDnaBam(meta)] + return [meta_virus, tumor_bam] } // Run process diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 13170ba1..d8507996 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -128,66 +128,64 @@ workflow TARGETED { // // SUBWORKFLOW: Align reads // - // channel: [ meta ] - ch_dna_alignment_out = Channel.empty() - // channel: [ meta, bam_rna ] - ch_rna_alignment_out = Channel.empty() - // TODO(SW): set up correctly - if (true || run_config.stages.alignment) { + // channel: [ meta, [bam, ...], [bai, ...] 
] + ch_align_dna_tumor_out = Channel.empty() + ch_align_dna_normal_out = Channel.empty() + if (run_config.stages.alignment) { READ_ALIGNMENT( ch_inputs, ref_data.genome_fasta, ref_data.genome_bwa_index, + ref_data.genome_bwa_index_bseq, + ref_data.genome_bwa_index_biidx, params.max_fastq_records, ) ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) - ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) - ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) - + ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT.out.dna_tumor) + ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT.out.dna_normal) } else { - ch_dna_alignment_out = ch_inputs - ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + ch_align_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } + ch_align_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] } } // // SUBWORKFLOW: Process read alignments // - // channel: [ meta ] - ch_dna_processed_out = Channel.empty() - // channel: [ meta, bam_rna ] - ch_rna_processed_out = Channel.empty() - // TODO(SW): set up correctly - if (true || run_config.stages.markdups) { + // channel: [ meta, bam, bai ] + ch_process_dna_tumor_out = Channel.empty() + ch_process_dna_normal_out = Channel.empty() + if (run_config.stages.markdups) { + // NOTE(SW/MC): hardcoded for initial testing purposes has_umis = run_config.panel.equalsIgnoreCase('tso500') READ_PROCESSING( ch_inputs, - ch_dna_alignment_out, - ch_rna_alignment_out, - ref_data.genome_version, + ch_align_dna_tumor_out, + ch_align_dna_normal_out, ref_data.genome_fasta, + ref_data.genome_version, ref_data.genome_fai, ref_data.genome_dict, - file(params.refdata_unmap_regions), + hmf_data.unmap_regions, has_umis, ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) - ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) - ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) + ch_process_dna_tumor_out = ch_process_dna_tumor_out.mix(READ_PROCESSING.out.dna_tumor) + ch_process_dna_normal_out = ch_process_dna_normal_out.mix(READ_PROCESSING.out.dna_normal) } else { - ch_dna_processed_out = ch_inputs.map - ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + ch_process_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } + ch_process_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] } } @@ -205,7 +203,7 @@ workflow TARGETED { isofox_tpm_norm = params.isofox_tpm_norm ?
file(params.isofox_tpm_norm) : panel_data.isofox_tpm_norm ISOFOX_QUANTIFICATION( - ch_dna_processed_out, + ch_inputs, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -224,7 +222,7 @@ workflow TARGETED { } else { - ch_isofox_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_isofox_out = ch_inputs.map { meta -> [meta, []] } } @@ -236,7 +234,9 @@ workflow TARGETED { if (run_config.stages.amber) { AMBER_PROFILING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_version, hmf_data.heterozygous_sites, panel_data.target_region_bed, @@ -247,7 +247,7 @@ workflow TARGETED { } else { - ch_amber_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_amber_out = ch_inputs.map { meta -> [meta, []] } } @@ -259,7 +259,9 @@ workflow TARGETED { if (run_config.stages.cobalt) { COBALT_PROFILING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, hmf_data.gc_profile, hmf_data.diploid_bed, panel_data.target_region_normalisation, @@ -271,7 +273,7 @@ workflow TARGETED { } else { - ch_cobalt_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_cobalt_out = ch_inputs.map { meta -> [meta, []] } } @@ -283,7 +285,9 @@ workflow TARGETED { if (run_config.stages.gridss) { GRIDSS_SVPREP_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -303,7 +307,7 @@ workflow TARGETED { } else { - ch_gridss_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_gridss_out = ch_inputs.map { meta -> [meta, []] } } @@ -317,7 +321,7 @@ workflow TARGETED { if (run_config.stages.gripss) { GRIPSS_FILTERING( - ch_dna_processed_out, + ch_inputs, ch_gridss_out, ref_data.genome_fasta, ref_data.genome_version, @@ -337,9 +341,9 @@ workflow TARGETED { } else { - ch_gripss_somatic_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_gripss_germline_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_gripss_somatic_unfiltered_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_gripss_somatic_out = ch_inputs.map { meta -> [meta, [], []] } + ch_gripss_germline_out = ch_inputs.map { meta -> [meta, [], []] } + ch_gripss_somatic_unfiltered_out = ch_inputs.map { meta -> [meta, [], []] } } @@ -355,7 +359,9 @@ workflow TARGETED { if (run_config.stages.sage) { SAGE_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -379,10 +385,10 @@ workflow TARGETED { } else { - ch_sage_germline_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_sage_somatic_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_sage_germline_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_sage_somatic_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_germline_vcf_out = ch_inputs.map { meta -> [meta, [], []] } + ch_sage_somatic_vcf_out = ch_inputs.map { meta -> [meta, [], []] } + ch_sage_germline_dir_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_somatic_dir_out = ch_inputs.map { meta -> [meta, []] } } @@ -395,7 +401,7 @@ workflow TARGETED { if (run_config.stages.pave) { PAVE_ANNOTATION( - ch_dna_processed_out, + ch_inputs, ch_sage_germline_vcf_out, ch_sage_somatic_vcf_out, ref_data.genome_fasta, @@ -419,8 +425,8 @@ workflow TARGETED { } else { - ch_pave_germline_out = ch_dna_processed_out.map { meta -> 
[meta, []] } - ch_pave_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_pave_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_pave_somatic_out = ch_inputs.map { meta -> [meta, []] } } @@ -432,7 +438,7 @@ workflow TARGETED { if (run_config.stages.purple) { PURPLE_CALLING( - ch_dna_processed_out, + ch_inputs, ch_amber_out, ch_cobalt_out, ch_pave_somatic_out, @@ -461,7 +467,7 @@ workflow TARGETED { } else { - ch_purple_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_purple_out = ch_inputs.map { meta -> [meta, []] } } @@ -476,7 +482,7 @@ workflow TARGETED { // NOTE(SW): currently used only for ORANGE but will also be used for Neo once implemented SAGE_APPEND( - ch_dna_processed_out, + ch_inputs, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -491,8 +497,8 @@ workflow TARGETED { } else { - ch_sage_somatic_append_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_sage_germline_append_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_somatic_append_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_germline_append_out = ch_inputs.map { meta -> [meta, []] } } @@ -505,7 +511,7 @@ workflow TARGETED { if (run_config.stages.linx) { LINX_ANNOTATION( - ch_dna_processed_out, + ch_inputs, ch_purple_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -521,8 +527,8 @@ workflow TARGETED { } else { - ch_linx_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_linx_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_linx_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_linx_germline_out = ch_inputs.map { meta -> [meta, []] } } @@ -534,7 +540,7 @@ workflow TARGETED { if (run_config.stages.linx) { LINX_PLOTTING( - ch_dna_processed_out, + ch_inputs, ch_linx_somatic_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -546,7 +552,7 @@ workflow TARGETED { } else { - ch_linx_somatic_visualiser_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_linx_somatic_visualiser_dir_out = ch_inputs.map { meta -> [meta, []] } } @@ -559,7 +565,9 @@ workflow TARGETED { if (run_config.stages.orange && run_config.stages.flagstat) { FLAGSTAT_METRICS( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ) ch_versions = ch_versions.mix(FLAGSTAT_METRICS.out.versions) @@ -569,8 +577,8 @@ workflow TARGETED { } else { - ch_flagstat_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_flagstat_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_flagstat_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_flagstat_germline_out = ch_inputs.map { meta -> [meta, []] } } @@ -583,7 +591,9 @@ workflow TARGETED { if (run_config.stages.bamtools) { BAMTOOLS_METRICS( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_fasta, ref_data.genome_version, ) @@ -595,8 +605,8 @@ workflow TARGETED { } else { - ch_bamtools_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_bamtools_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_bamtools_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_bamtools_germline_out = ch_inputs.map { meta -> [meta, []] } } @@ -611,7 +621,9 @@ workflow TARGETED { ref_data_hla_slice_bed = params.containsKey('ref_data_hla_slice_bed') ? 
params.ref_data_hla_slice_bed : [] LILAC_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -626,7 +638,7 @@ workflow TARGETED { } else { - ch_lilac_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_lilac_out = ch_inputs.map { meta -> [meta, []] } } @@ -636,13 +648,13 @@ workflow TARGETED { if (run_config.stages.orange) { // Create placeholder channels for empty remaining channels - ch_chord_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_cuppa_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_sigs_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_virusinterpreter_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_chord_out = ch_inputs.map { meta -> [meta, []] } + ch_cuppa_out = ch_inputs.map { meta -> [meta, []] } + ch_sigs_out = ch_inputs.map { meta -> [meta, []] } + ch_virusinterpreter_out = ch_inputs.map { meta -> [meta, []] } ORANGE_REPORTING( - ch_dna_processed_out, + ch_inputs, ch_bamtools_somatic_out, ch_bamtools_germline_out, ch_flagstat_somatic_out, diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 4d2a74fc..66984000 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -112,7 +112,6 @@ samplesheet = Utils.getFileObject(params.input) // TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. // TODO(MC): Run full tests for going from .fastq.gz. -// TODO(SW): Unmap region resouce files. // TODO(MC): Fix warnings. workflow WGTS { // Create channel for versions @@ -136,67 +135,65 @@ workflow WGTS { // // SUBWORKFLOW: Align reads // - // channel: [ meta ] - ch_dna_alignment_out = Channel.empty() - // channel: [ meta, bam_rna ] - ch_rna_alignment_out = Channel.empty() - // TODO(SW): set up correctly - if (true || run_config.stages.alignment) { + // channel: [ meta, [bam, ...], [bai, ...] 
] + ch_align_dna_tumor_out = Channel.empty() + ch_align_dna_normal_out = Channel.empty() + if (run_config.stages.alignment) { READ_ALIGNMENT( ch_inputs, ref_data.genome_fasta, ref_data.genome_bwa_index, + ref_data.genome_bwa_index_bseq, + ref_data.genome_bwa_index_biidx, params.max_fastq_records, ) ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) - ch_dna_alignment_out = ch_dna_alignment_out.mix(READ_ALIGNMENT.out.dna) - ch_rna_alignment_out = ch_rna_alignment_out.mix(READ_ALIGNMENT.out.rna) - + ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT.out.dna_tumor) + ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT.out.dna_normal) } else { - ch_dna_alignment_out = ch_inputs - ch_rna_alignment_out = ch_inputs.map { meta -> [meta, []] } + ch_align_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } + ch_align_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] } } // // SUBWORKFLOW: Process read alignments // - // channel: [ meta ] - ch_dna_processed_out = Channel.empty() - // channel: [ meta, bam_rna ] - ch_rna_processed_out = Channel.empty() - // TODO(SW): set up correctly - if (true || run_config.stages.markdups) { + // channel: [ meta, bam, bai ] + ch_process_dna_tumor_out = Channel.empty() + ch_process_dna_normal_out = Channel.empty() + if (run_config.stages.markdups) { READ_PROCESSING( ch_inputs, - ch_dna_alignment_out, - ch_rna_alignment_out, - ref_data.genome_version, + ch_align_dna_tumor_out, + ch_align_dna_normal_out, ref_data.genome_fasta, + ref_data.genome_version, ref_data.genome_fai, ref_data.genome_dict, - file(params.refdata_unmap_regions), - false, + hmf_data.unmap_regions, + false, // has_umis ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) - ch_dna_processed_out = ch_dna_processed_out.mix(READ_PROCESSING.out.dna) - ch_rna_processed_out = ch_rna_processed_out.mix(READ_PROCESSING.out.rna) + ch_process_dna_tumor_out = ch_process_dna_tumor_out.mix(READ_PROCESSING.out.dna_tumor) + ch_process_dna_normal_out = ch_process_dna_normal_out.mix(READ_PROCESSING.out.dna_normal) } else { - ch_dna_processed_out = ch_inputs.map - ch_rna_processed_out = ch_inputs.map { meta -> [meta, []] } + ch_process_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } + ch_process_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] } } + // // MODULE: Run Isofox to analyse RNA data // @@ -208,7 +205,7 @@ workflow WGTS { isofox_gc_ratios = params.isofox_gc_ratios ?
file(params.isofox_gc_ratios) : hmf_data.isofox_gc_ratios ISOFOX_QUANTIFICATION( - ch_dna_processed_out, + ch_inputs, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -227,7 +224,7 @@ workflow WGTS { } else { - ch_isofox_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_isofox_out = ch_inputs.map { meta -> [meta, []] } } @@ -239,7 +236,9 @@ workflow WGTS { if (run_config.stages.amber) { AMBER_PROFILING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_version, hmf_data.heterozygous_sites, [], // target_region_bed @@ -251,7 +250,7 @@ workflow WGTS { } else { - ch_amber_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_amber_out = ch_inputs.map { meta -> [meta, []] } } @@ -263,7 +262,9 @@ workflow WGTS { if (run_config.stages.cobalt) { COBALT_PROFILING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, hmf_data.gc_profile, hmf_data.diploid_bed, [], // panel_target_region_normalisation @@ -275,7 +276,7 @@ workflow WGTS { } else { - ch_cobalt_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_cobalt_out = ch_inputs.map { meta -> [meta, []] } } @@ -287,7 +288,9 @@ workflow WGTS { if (run_config.stages.gridss) { GRIDSS_SVPREP_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -307,7 +310,7 @@ workflow WGTS { } else { - ch_gridss_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_gridss_out = ch_inputs.map { meta -> [meta, []] } } @@ -321,7 +324,7 @@ workflow WGTS { if (run_config.stages.gripss) { GRIPSS_FILTERING( - ch_dna_processed_out, + ch_inputs, ch_gridss_out, ref_data.genome_fasta, ref_data.genome_version, @@ -341,9 +344,9 @@ workflow WGTS { } else { - ch_gripss_somatic_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_gripss_germline_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_gripss_somatic_unfiltered_out = ch_dna_processed_out.map { meta -> [meta, [], []] } + ch_gripss_somatic_out = ch_inputs.map { meta -> [meta, [], []] } + ch_gripss_germline_out = ch_inputs.map { meta -> [meta, [], []] } + ch_gripss_somatic_unfiltered_out = ch_inputs.map { meta -> [meta, [], []] } } @@ -359,7 +362,9 @@ workflow WGTS { if (run_config.stages.sage) { SAGE_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -383,10 +388,10 @@ workflow WGTS { } else { - ch_sage_germline_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_sage_somatic_vcf_out = ch_dna_processed_out.map { meta -> [meta, [], []] } - ch_sage_germline_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_sage_somatic_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_germline_vcf_out = ch_inputs.map { meta -> [meta, [], []] } + ch_sage_somatic_vcf_out = ch_inputs.map { meta -> [meta, [], []] } + ch_sage_germline_dir_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_somatic_dir_out = ch_inputs.map { meta -> [meta, []] } } @@ -399,7 +404,7 @@ workflow WGTS { if (run_config.stages.pave) { PAVE_ANNOTATION( - ch_dna_processed_out, + ch_inputs, ch_sage_germline_vcf_out, ch_sage_somatic_vcf_out, ref_data.genome_fasta, @@ -423,8 +428,8 @@ workflow WGTS { } else { - ch_pave_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_pave_somatic_out = 
ch_dna_processed_out.map { meta -> [meta, []] } + ch_pave_germline_out = ch_inputs.map { meta -> [meta, []] } + ch_pave_somatic_out = ch_inputs.map { meta -> [meta, []] } } @@ -436,7 +441,7 @@ workflow WGTS { if (run_config.stages.purple) { PURPLE_CALLING( - ch_dna_processed_out, + ch_inputs, ch_amber_out, ch_cobalt_out, ch_pave_somatic_out, @@ -465,7 +470,7 @@ workflow WGTS { } else { - ch_purple_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_purple_out = ch_inputs.map { meta -> [meta, []] } } @@ -480,7 +485,7 @@ workflow WGTS { // NOTE(SW): currently used only for ORANGE but will also be used for Neo once implemented SAGE_APPEND( - ch_dna_processed_out, + ch_inputs, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -494,8 +499,8 @@ workflow WGTS { } else { - ch_sage_somatic_append_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_sage_germline_append_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sage_somatic_append_out = ch_inputs.map { meta -> [meta, []] } + ch_sage_germline_append_out = ch_inputs.map { meta -> [meta, []] } } @@ -508,7 +513,7 @@ workflow WGTS { if (run_config.stages.linx) { LINX_ANNOTATION( - ch_dna_processed_out, + ch_inputs, ch_purple_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -524,8 +529,8 @@ workflow WGTS { } else { - ch_linx_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_linx_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_linx_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_linx_germline_out = ch_inputs.map { meta -> [meta, []] } } @@ -537,7 +542,7 @@ workflow WGTS { if (run_config.stages.linx) { LINX_PLOTTING( - ch_dna_processed_out, + ch_inputs, ch_linx_somatic_out, ref_data.genome_version, hmf_data.ensembl_data_resources, @@ -549,7 +554,7 @@ workflow WGTS { } else { - ch_linx_somatic_visualiser_dir_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_linx_somatic_visualiser_dir_out = ch_inputs.map { meta -> [meta, []] } } @@ -562,7 +567,9 @@ workflow WGTS { if (run_config.stages.orange && run_config.stages.flagstat) { FLAGSTAT_METRICS( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ) ch_versions = ch_versions.mix(FLAGSTAT_METRICS.out.versions) @@ -572,8 +579,8 @@ workflow WGTS { } else { - ch_flagstat_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_flagstat_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_flagstat_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_flagstat_germline_out = ch_inputs.map { meta -> [meta, []] } } @@ -586,7 +593,9 @@ workflow WGTS { if (run_config.stages.bamtools) { BAMTOOLS_METRICS( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ref_data.genome_fasta, ref_data.genome_version, ) @@ -598,8 +607,8 @@ workflow WGTS { } else { - ch_bamtools_somatic_out = ch_dna_processed_out.map { meta -> [meta, []] } - ch_bamtools_germline_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_bamtools_somatic_out = ch_inputs.map { meta -> [meta, []] } + ch_bamtools_germline_out = ch_inputs.map { meta -> [meta, []] } } @@ -611,7 +620,7 @@ workflow WGTS { if (run_config.stages.sigs) { SIGS_FITTING( - ch_dna_processed_out, + ch_inputs, ch_purple_out, hmf_data.sigs_signatures, ) @@ -622,7 +631,7 @@ workflow WGTS { } else { - ch_sigs_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_sigs_out = ch_inputs.map { meta -> [meta, []] } } @@ -634,7 +643,7 @@ workflow WGTS { if 
(run_config.stages.chord) { CHORD_PREDICTION( - ch_dna_processed_out, + ch_inputs, ch_purple_out, ref_data.genome_version, ) @@ -645,7 +654,7 @@ } else { - ch_chord_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_chord_out = ch_inputs.map { meta -> [meta, []] } } @@ -660,7 +669,9 @@ ref_data_hla_slice_bed = params.containsKey('ref_data_hla_slice_bed') ? params.ref_data_hla_slice_bed : [] LILAC_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, + ch_process_dna_normal_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -675,7 +686,7 @@ } else { - ch_lilac_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_lilac_out = ch_inputs.map { meta -> [meta, []] } } @@ -687,7 +698,8 @@ if (run_config.stages.virusinterpreter) { VIRUSBREAKEND_CALLING( - ch_dna_processed_out, + ch_inputs, + ch_process_dna_tumor_out, ch_purple_out, ch_bamtools_somatic_out, ref_data.genome_fasta, @@ -708,7 +720,7 @@ } else { - ch_virusinterpreter_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_virusinterpreter_out = ch_inputs.map { meta -> [meta, []] } } @@ -720,7 +732,7 @@ if (run_config.stages.cuppa) { CUPPA_PREDICTION( - ch_dna_processed_out, + ch_inputs, ch_isofox_out, ch_purple_out, ch_linx_somatic_out, @@ -735,7 +747,7 @@ } else { - ch_cuppa_out = ch_dna_processed_out.map { meta -> [meta, []] } + ch_cuppa_out = ch_inputs.map { meta -> [meta, []] } } @@ -745,7 +757,7 @@ if (run_config.stages.orange) { ORANGE_REPORTING( - ch_dna_processed_out, + ch_inputs, ch_bamtools_somatic_out, ch_bamtools_germline_out, ch_flagstat_somatic_out, From 0b27c4a089d68193045ab9ba950db7ef99640c6b Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 10:21:39 +1100 Subject: [PATCH 46/86] Fix join for alignment BAM and corresponding BAIs --- subworkflows/local/read_alignment.nf | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index 809e9982..cd9c65ce 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -163,15 +163,12 @@ workflow READ_ALIGNMENT { ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) // Combine BAMs and BAIs - ch_bams_flat = Channel.empty() - .mix( - BWA_MEM2.out.bam, - SAMBAMBA_INDEX.out.bai - ) - .groupTuple(size: 2) - .map { it.flatten() } - - + // channel: [ meta, sample_type, bam, bai ] + ch_bams_flat = WorkflowOncoanalyser.groupByMeta( + BWA_MEM2.out.bam.map { meta_bwa, bam -> [meta_bwa, meta_bwa.sample_type] }, + BWA_MEM2.out.bam, + SAMBAMBA_INDEX.out.bai, + ) // Reunite BAMs // First, count expected BAMs per sample for non-blocking groupTuple op @@ -188,11 +185,12 @@ workflow READ_ALIGNMENT { .groupTuple() .map { meta_count, meta_bwas -> return [meta_count, meta_bwas.size()] } + // Now, group with expected size then sort into tumor and normal channels ch_bams_united = ch_sample_fastq_counts .cross( // First element to match meta_count above for `cross` - ch_bams_flat.map { meta_bwa, bam, bai -> [[key: meta_bwa.key, sample_type: meta_bwa.sample_type], bam, bai] } + ch_bams_flat.map { meta, sample_type, bam, bai -> [[key: meta.key, sample_type: sample_type], bam, bai] } ) .map { count_tuple, bam_tuple -> From 0ca87ec97de928b63fbff2882365f7b34eadb7cd Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 10:22:39 +1100 Subject: [PATCH
47/86] Improve comments, syntax, etc --- modules/local/markdups/main.nf | 2 +- subworkflows/local/read_alignment.nf | 1 - subworkflows/local/read_processing.nf | 12 ++++++------ workflows/wgts.nf | 1 - 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index a26b4111..a1805036 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -17,7 +17,7 @@ process MARKDUPS { output: tuple val(meta), path('*bam'), path('*bai'), emit: bam - path 'versions.yml' , emit: versions + path 'versions.yml' , emit: versions path '*.tsv' when: diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment.nf index cd9c65ce..a876a132 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment.nf @@ -139,7 +139,6 @@ workflow READ_ALIGNMENT { ] return [meta_bwa, fastq_fwd, fastq_rev] - } // Run process diff --git a/subworkflows/local/read_processing.nf b/subworkflows/local/read_processing.nf index d3cef837..638e197b 100644 --- a/subworkflows/local/read_processing.nf +++ b/subworkflows/local/read_processing.nf @@ -23,7 +23,7 @@ workflow READ_PROCESSING { ch_versions = Channel.empty() // Select and sort input sources, separating by tumor and normal - // channel: runnable: [ meta, bams, bais ] + // channel: runnable: [ meta, [bam, ...], [bai, ...] ] // channel: skip: [ meta ] ch_inputs_tumor_sorted = ch_dna_tumor .map { meta, bams, bais -> @@ -56,13 +56,13 @@ workflow READ_PROCESSING { } // Create process input channel - // channel: [ meta_markdups, bam, bai ] + // channel: [ meta_markdups, [bam, ...], [bai, ...] ] ch_markdups_inputs = Channel.empty() .mix( - ch_inputs_tumor_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bam, bai] }, - ch_inputs_normal_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getNormalDnaSample(meta), 'normal', bam, bai] }, + ch_inputs_tumor_sorted.runnable.map { meta, bams, bais -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bams, bais] }, + ch_inputs_normal_sorted.runnable.map { meta, bams, bais -> [meta, Utils.getNormalDnaSample(meta), 'normal', bams, bais] }, ) - .map { meta, meta_sample, sample_type, bam, bai -> + .map { meta, meta_sample, sample_type, bams, bais -> def meta_markdups = [ key: meta.group_id, @@ -71,7 +71,7 @@ workflow READ_PROCESSING { sample_type: sample_type, ] - return [meta_markdups, bam, bai] + return [meta_markdups, bams, bais] } // Run process diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 66984000..aa706180 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -193,7 +193,6 @@ workflow WGTS { } - // // MODULE: Run Isofox to analyse RNA data // From 51e21b509edb7ea1dff01a3840f5af6c2b84a1b1 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 13:33:02 +1100 Subject: [PATCH 48/86] Implement RNA alignment --- conf/hmf_genomes.config | 2 + lib/Constants.groovy | 42 +--- lib/Utils.groovy | 73 ++++--- lib/WorkflowMain.groovy | 7 +- main.nf | 1 + modules/local/sambamba/index/main.nf | 1 - modules/local/sambamba/merge/main.nf | 33 +++ modules/local/star/main.nf | 52 +++-- modules/nf-core/gatk4/markduplicates/main.nf | 65 ++++++ modules/nf-core/gatk4/markduplicates/meta.yml | 72 +++++++ modules/nf-core/samtools/sort/main.nf | 49 +++++ modules/nf-core/samtools/sort/meta.yml | 51 +++++ nextflow_schema.json | 7 +- subworkflows/local/amber_profiling.nf | 4 +- subworkflows/local/bamtools_metrics.nf | 4 +- subworkflows/local/cobalt_profiling.nf | 4 +-
subworkflows/local/cuppa_prediction.nf | 2 +- subworkflows/local/flagstat_metrics.nf | 4 +- subworkflows/local/gridss_svprep_calling.nf | 4 +- subworkflows/local/isofox_quantification.nf | 26 ++- subworkflows/local/lilac_calling.nf | 23 +- subworkflows/local/prepare_reference.nf | 15 ++ ...ead_alignment.nf => read_alignment_dna.nf} | 9 +- subworkflows/local/read_alignment_rna.nf | 197 ++++++++++++++++++ subworkflows/local/sage_append.nf | 58 +++--- subworkflows/local/sage_calling.nf | 4 +- subworkflows/local/virusbreakend_calling.nf | 2 +- workflows/targeted.nf | 30 ++- workflows/wgts.nf | 29 ++- 29 files changed, 697 insertions(+), 173 deletions(-) create mode 100644 modules/local/sambamba/merge/main.nf create mode 100644 modules/nf-core/gatk4/markduplicates/main.nf create mode 100644 modules/nf-core/gatk4/markduplicates/meta.yml create mode 100644 modules/nf-core/samtools/sort/main.nf create mode 100644 modules/nf-core/samtools/sort/meta.yml rename subworkflows/local/{read_alignment.nf => read_alignment_dna.nf} (96%) create mode 100644 subworkflows/local/read_alignment_rna.nf diff --git a/conf/hmf_genomes.config b/conf/hmf_genomes.config index 5e38201b..868315a9 100644 --- a/conf/hmf_genomes.config +++ b/conf/hmf_genomes.config @@ -18,6 +18,7 @@ params { bwa_index_biidx = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/Homo_sapiens.GRCh37.GATK.illumina.fasta.bwt.2bit.64" bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index_image/0.7.17-r1188/Homo_sapiens.GRCh37.GATK.illumina.fasta.img" gridss_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/gridss_index/2.13.2/Homo_sapiens.GRCh37.GATK.illumina.fasta.gridsscache" + star_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/star_index/gencode_19/2.7.3a.tar.gz" } 'GRCh38_hmf' { fasta = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" @@ -28,6 +29,7 @@ params { bwa_index_biiseq= "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64" bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index_image/0.7.17-r1188/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.img" gridss_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/gridss_index/2.13.2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gridsscache" + star_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/star_index/gencode_38/2.7.3a.tar.gz" } } } diff --git a/lib/Constants.groovy b/lib/Constants.groovy index 10bdc0ea..c7618ac3 100644 --- a/lib/Constants.groovy +++ b/lib/Constants.groovy @@ -58,9 +58,8 @@ class Constants { static enum FileType { // Generic BAM, - BAI, BAM_MARKDUPS, - BAI_MARKDUPS, + BAI, FASTQ, // Process AMBER_DIR, @@ -125,12 +124,6 @@ class Constants { SequenceType.DNA, ], - BAM_RNA_TUMOR : [ - FileType.BAM, - SampleType.TUMOR, - SequenceType.RNA, - ], - BAM_DNA_NORMAL: [ FileType.BAM, SampleType.NORMAL, @@ -143,55 +136,30 @@ class Constants { SequenceType.DNA, ], - BAM_RNA_NORMAL : [ + BAM_RNA_TUMOR: [ FileType.BAM, - SampleType.NORMAL, + SampleType.TUMOR, SequenceType.RNA, ], - - - - BAI_DNA_TUMOR: [ FileType.BAI, SampleType.TUMOR, SequenceType.DNA, ], - BAI_MARKDUPS_DNA_TUMOR: [ - FileType.BAI_MARKDUPS, - SampleType.TUMOR, - SequenceType.DNA, - ], - - BAI_RNA_TUMOR : [ 
- FileType.BAI, - SampleType.TUMOR, - SequenceType.RNA, - ], - BAI_DNA_NORMAL: [ FileType.BAI, SampleType.NORMAL, SequenceType.DNA, ], - BAI_MARKDUPS_DNA_NORMAL: [ - FileType.BAI_MARKDUPS, - SampleType.NORMAL, - SequenceType.DNA, - ], - - BAI_RNA_NORMAL : [ + BAI_RNA_TUMOR: [ FileType.BAI, - SampleType.NORMAL, + SampleType.TUMOR, SequenceType.RNA, ], - - - ISOFOX_DIR: [ FileType.ISOFOX_DIR, SampleType.TUMOR, diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 2ab04a11..6cfa638b 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -151,7 +151,7 @@ class Utils { index_enum = Constants.FileType.BAI index_str = 'bai' } else if (key === Constants.FileType.BAM_MARKDUPS) { - index_enum = Constants.FileType.BAI_MARKDUPS + index_enum = Constants.FileType.BAI index_str = 'bai' } else if (key === Constants.FileType.GRIDSS_VCF) { index_enum = Constants.FileType.GRIDSS_VCF_TBI @@ -413,20 +413,25 @@ class Utils { // Files + static public getTumorDnaFastq(meta) { + return getTumorDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) + } + static public getTumorDnaBam(meta) { return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM, null) } + static public getTumorDnaMarkdupsBam(meta) { + return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null) + } + static public getTumorDnaBai(meta) { return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAI, null) } - static public getTumorDnaMarkdupsBam(meta) { - return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null) - } - static public getTumorDnaFastq(meta) { - return getTumorDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) + static public hasTumorDnaFastq(meta) { + return getTumorDnaFastq(meta) !== null } static public hasTumorDnaBam(meta) { @@ -437,36 +442,25 @@ class Utils { return getTumorDnaMarkdupsBam(meta) !== null } - static public hasTumorDnaFastq(meta) { - return getTumorDnaFastq(meta) !== null - } - - static public getTumorRnaBam(meta) { - return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAM, null) - } - - static public getTumorRnaBai(meta) { - return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAI, null) - } - static public hasTumorRnaBam(meta) { - return getTumorRnaBam(meta) !== null + static public getNormalDnaFastq(meta) { + return getNormalDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) } static public getNormalDnaBam(meta) { return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM, null) } + static public getNormalDnaMarkdupsBam(meta) { + return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null) + } static public getNormalDnaBai(meta) { return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAI, null) } - static public getNormalDnaMarkdupsBam(meta) { - return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null) - } - static public getNormalDnaFastq(meta) { - return getNormalDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) + static public hasNormalDnaFastq(meta) { + return getNormalDnaFastq(meta) !== null } static public hasNormalDnaBam(meta) { @@ -477,16 +471,35 @@ class Utils { return getNormalDnaMarkdupsBam(meta) !== null } - static public hasNormalDnaFastq(meta) { - return getNormalDnaFastq(meta) !== null + + static public hasDnaFastq(meta) { + return hasNormalDnaFastq(meta) || hasTumorDnaFastq(meta) } static public hasDnaMarkdupsBam(meta) { return hasNormalDnaMarkdupsBam(meta) || hasTumorDnaMarkdupsBam(meta) } - static public 
hasDnaFastq(meta) { - return hasNormalDnaFastq(meta) || hasTumorDnaFastq(meta) + + static public getTumorRnaFastq(meta) { + return getTumorRnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null) + } + + static public getTumorRnaBam(meta) { + return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAM, null) + } + + static public getTumorRnaBai(meta) { + return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAI, null) + } + + + static public hasTumorRnaFastq(meta) { + return getTumorRnaFastq(meta) !== null + } + + static public hasTumorRnaBam(meta) { + return getTumorRnaBam(meta) !== null } @@ -499,6 +512,10 @@ class Utils { return hasNormalDnaBam(meta) || hasNormalDnaMarkdupsBam(meta) || hasNormalDnaFastq(meta) } + static public hasTumorRna(meta) { + return hasTumorRnaBam(meta) || hasTumorRnaFastq(meta) + } + // Misc public static getInput(meta, key) { diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 9bd6a46b..b38de223 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -1,6 +1,8 @@ // // This file holds several functions specific to the main.nf workflow in the nf-core/oncoanalyser pipeline // +import Utils + class WorkflowMain { @@ -314,8 +316,9 @@ class WorkflowMain { mode: run_mode, panel: run_mode === Constants.RunMode.TARGETED ? params.panel : null, stages: stages, - has_dna: inputs.any { it.containsKey([Constants.SampleType.TUMOR, Constants.SequenceType.DNA]) }, - has_rna: inputs.any { it.containsKey([Constants.SampleType.TUMOR, Constants.SequenceType.RNA]) }, + has_dna: inputs.any { Utils.hasTumorDna(it) }, + has_rna: inputs.any { Utils.hasTumorRna(it) }, + has_rna_fastq: inputs.any { Utils.hasTumorRnaFastq(it) }, ] } } diff --git a/main.nf b/main.nf index 65841c42..5e4c5768 100644 --- a/main.nf +++ b/main.nf @@ -29,6 +29,7 @@ params.ref_data_genome_bwa_index_bseq = WorkflowMain.getGenomeAttribute(params, params.ref_data_genome_bwa_index_biidx = WorkflowMain.getGenomeAttribute(params, 'bwa_index_biidx') params.ref_data_genome_bwa_index_image = WorkflowMain.getGenomeAttribute(params, 'bwa_index_image') params.ref_data_genome_gridss_index = WorkflowMain.getGenomeAttribute(params, 'gridss_index') +params.ref_data_genome_star_index = WorkflowMain.getGenomeAttribute(params, 'star_index') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index f6b238e3..d3e3a2d2 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -30,7 +30,6 @@ process SAMBAMBA_INDEX { stub: """ touch ${bam}.bai - echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ } diff --git a/modules/local/sambamba/merge/main.nf b/modules/local/sambamba/merge/main.nf new file mode 100644 index 00000000..2551ed2f --- /dev/null +++ b/modules/local/sambamba/merge/main.nf @@ -0,0 +1,33 @@ +process SAMBAMBA_MERGE { + tag "${meta.id}" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/sambamba:1.0--h98b6b92_0' : + 'quay.io/biocontainers/sambamba:1.0--h98b6b92_0' }" + + input: + tuple val(meta), path(bams), path(bais) + + output: + tuple val(meta), path('*bam'), emit: bam + path 'versions.yml' , emit: versions + + script: + """ + sambamba merge \\ + --nthreads ${task.cpus} \\ + ${meta.sample_id}.bam \\ + ${bams} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sambamba: \$(sambamba --version 2>&1 | grep -m1 sambamba | sed 's/^sambamba //') + END_VERSIONS + """ + + stub: + """ + touch ${meta.sample_id}.bam + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf index 780139b3..fc906880 100644 --- a/modules/local/star/main.nf +++ b/modules/local/star/main.nf @@ -1,40 +1,60 @@ process STAR { tag "${meta.id}" - label 'process_low' + label 'process_medium' - // TODO(SW): create container - //container 'foo' + container 'quay.io/biocontainers/star:2.7.3a--0' input: - // TODO(SW): decide input structure - tuple val(meta), path(fastqs) + tuple val(meta), path(fastq_fwd), path(fastq_rev) + path genome_star_index output: - // TODO(SW): set outputs - tuple val(meta), path('bar'), emit: bam - path 'versions.yml' , emit: versions + tuple val(meta), path('*bam'), emit: bam + path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - - // TODO(SW): implement process """ - echo bar + STAR \\ + --readFilesIn ${fastq_fwd} ${fastq_rev} \\ + --genomeDir ${genome_star_index} \\ + --runThreadN ${task.cpus} \\ + --readFilesCommand zcat \\ + --alignSJstitchMismatchNmax 5 -1 5 5 \\ + --alignSplicedMateMapLmin 35 \\ + --alignSplicedMateMapLminOverLmate 0.33 \\ + --chimJunctionOverhangMin 10 \\ + --chimOutType WithinBAM SoftClip \\ + --chimScoreDropMax 30 \\ + --chimScoreJunctionNonGTAG 0 \\ + --chimScoreMin 1 \\ + --chimScoreSeparation 1 \\ + --chimSegmentMin 10 \\ + --chimSegmentReadGapMax 3 \\ + --limitOutSJcollapsed 3000000 \\ + --outBAMcompression 0 \\ + --outFilterMatchNmin 35 \\ + --outFilterMatchNminOverLread 0.33 \\ + --outFilterMismatchNmax 3 \\ + --outFilterMultimapNmax 10 \\ + --outFilterScoreMinOverLread 0.33 \\ + --outSAMattributes All \\ + --outSAMattrRGline 'ID:${meta.read_group} SM:${meta.sample_id}' \\ + --outSAMtype BAM Unsorted \\ + --outSAMunmapped Within \\ + --runRNGseed 0 cat <<-END_VERSIONS > versions.yml "${task.process}": - star: foo + star: \$(STAR --version | sed -e "s/STAR_//g") END_VERSIONS """ stub: - // TODO(SW): implement stub """ - touch bar + touch Aligned.out.bam echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ } - diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf new file mode 100644 index 00000000..356cac0f --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -0,0 +1,65 @@ +process GATK4_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(bam) + path fasta + path fasta_fai + + output: + tuple val(meta), path("*cram"), emit: cram, optional: true + tuple val(meta), path("*bam"), emit: bam, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.metrics"), emit: metrics + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.sample_id}" + def input_list = bam.collect{"--INPUT $it"}.join(' ') + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M" MarkDuplicates \\ + $input_list \\ + --OUTPUT ${prefix}.md.bam \\ + --METRICS_FILE ${prefix}.md.metrics \\ + --TMP_DIR . \\ + --CREATE_INDEX \\ + ${reference} \\ + $args + + mv ${prefix}.md.bai ${prefix}.md.bam.bai + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.sample_id}" + + """ + touch ${prefix}.md.bam + touch ${prefix}.md.bam.bai + touch ${prefix}.md.metrics + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/modules/nf-core/gatk4/markduplicates/meta.yml b/modules/nf-core/gatk4/markduplicates/meta.yml new file mode 100644 index 00000000..ddf98d2f --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/meta.yml @@ -0,0 +1,72 @@ +name: gatk4_markduplicates +description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +keywords: + - markduplicates + - bam + - sort +tools: + - gatk4: + description: + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- + tool_dev_url: https://github.com/broadinstitute/gatk + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - fasta: + type: file + description: Fasta file + pattern: "*.{fasta}" + - fasta_fai: + type: file + description: Fasta index file + pattern: "*.{fai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Marked duplicates BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Marked duplicates CRAM file + pattern: "*.{cram}" + - bai: + type: file + description: BAM index file + pattern: "*.{bam.bai}" + - crai: + type: file + description: CRAM index file + pattern: "*.{cram.crai}" + - metrics: + type: file + description: Duplicate metrics file generated by GATK + pattern: "*.{metrics.txt}" + +authors: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf new file mode 100644 index 00000000..18a3e1dc --- /dev/null +++ b/modules/nf-core/samtools/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.prefix}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.prefix}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml new file mode 100644 index 00000000..2200de72 --- /dev/null +++ b/modules/nf-core/samtools/sort/meta.yml @@ -0,0 +1,51 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/nextflow_schema.json b/nextflow_schema.json index d75b428d..f52ffdb0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -213,7 +213,6 @@ }, "ref_data_genome_bwa_index": { "type": "string", - "format": "directory-path", "description": "Path to directory containing reference genome BWA indices.", "fa_icon": "far fa-file-code", "hidden": true @@ -247,6 +246,12 @@ "description": "Path to reference genome GRIDSS index file.", "fa_icon": "far fa-file-code", "hidden": true + }, + "ref_data_genome_star_index": { + "type": "string", + "description": "Path to reference genome STAR index file.", + "fa_icon": "far fa-file-code", + "hidden": true } } }, diff --git a/subworkflows/local/amber_profiling.nf b/subworkflows/local/amber_profiling.nf index fb021e91..ab156969 100644 --- a/subworkflows/local/amber_profiling.nf +++ b/subworkflows/local/amber_profiling.nf @@ -35,9 +35,9 @@ workflow AMBER_PROFILING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/bamtools_metrics.nf b/subworkflows/local/bamtools_metrics.nf index ba6aebee..ca446828 100644 --- a/subworkflows/local/bamtools_metrics.nf +++ b/subworkflows/local/bamtools_metrics.nf @@ -31,7 +31,7 @@ workflow BAMTOOLS_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_TUMOR), ] } .branch { meta, bam, bai -> @@ -48,7 +48,7 @@ workflow BAMTOOLS_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, bam, bai -> diff --git a/subworkflows/local/cobalt_profiling.nf b/subworkflows/local/cobalt_profiling.nf index 322575cc..c425cd55 100644 --- a/subworkflows/local/cobalt_profiling.nf +++ b/subworkflows/local/cobalt_profiling.nf @@ -36,9 +36,9 @@ workflow COBALT_PROFILING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + 
Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/cuppa_prediction.nf b/subworkflows/local/cuppa_prediction.nf index bc486322..66ea086f 100644 --- a/subworkflows/local/cuppa_prediction.nf +++ b/subworkflows/local/cuppa_prediction.nf @@ -84,7 +84,7 @@ workflow CUPPA_PREDICTION { def has_tumor_dna = Utils.hasTumorDna(meta) def has_normal_dna = Utils.hasNormalDna(meta) - def has_tumor_rna = Utils.hasTumorRnaBam(meta) + def has_tumor_rna = Utils.hasTumorRna(meta) def has_dna_inputs = (purple_dir && linx_annotation_dir) def has_rna_inputs = isofox_dir diff --git a/subworkflows/local/flagstat_metrics.nf b/subworkflows/local/flagstat_metrics.nf index cd2344a9..83be4cd8 100644 --- a/subworkflows/local/flagstat_metrics.nf +++ b/subworkflows/local/flagstat_metrics.nf @@ -27,7 +27,7 @@ workflow FLAGSTAT_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_TUMOR), ] } .branch { meta, bam, bai -> @@ -44,7 +44,7 @@ workflow FLAGSTAT_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, bam, bai -> diff --git a/subworkflows/local/gridss_svprep_calling.nf b/subworkflows/local/gridss_svprep_calling.nf index 71aed28a..7bbe3d79 100644 --- a/subworkflows/local/gridss_svprep_calling.nf +++ b/subworkflows/local/gridss_svprep_calling.nf @@ -52,9 +52,9 @@ workflow GRIDSS_SVPREP_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/isofox_quantification.nf b/subworkflows/local/isofox_quantification.nf index a82d7629..7e40c379 100644 --- a/subworkflows/local/isofox_quantification.nf +++ b/subworkflows/local/isofox_quantification.nf @@ -11,6 +11,7 @@ workflow ISOFOX_QUANTIFICATION { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_rna_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -31,19 +32,28 @@ workflow ISOFOX_QUANTIFICATION { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // Select input sources and sort + // channel: runnable: [ meta, tumor_bam, tumor_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = ch_tumor_rna_bam + .map { meta, tumor_bam, tumor_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_RNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_RNA_TUMOR), + ] + } + .branch { 
meta, tumor_bam, tumor_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.ISOFOX_DIR) - runnable: Utils.hasTumorRnaBam(meta) && !has_existing + runnable: tumor_bam && !has_existing skip: true + meta } // Create process input channel - // channel: [ meta_isofox, tumor_bam_rna ] + // channel: [ meta_isofox, tumor_bam, tumor_bai ] ch_isofox_inputs = ch_inputs_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai -> def meta_isofox = [ key: meta.group_id, @@ -51,7 +61,7 @@ workflow ISOFOX_QUANTIFICATION { sample_id: Utils.getTumorRnaSampleName(meta), ] - return [meta_isofox, Utils.getTumorRnaBam(meta), Utils.getTumorRnaBai(meta)] + return [meta_isofox, tumor_bam, tumor_bai] } // Run process diff --git a/subworkflows/local/lilac_calling.nf b/subworkflows/local/lilac_calling.nf index eb8ac01d..be91e6d8 100644 --- a/subworkflows/local/lilac_calling.nf +++ b/subworkflows/local/lilac_calling.nf @@ -16,6 +16,7 @@ workflow LILAC_CALLING { ch_inputs // channel: [mandatory] [ meta ] ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] + ch_tumor_rna_bam // channel: [mandatory] [ meta, bam, bai ] ch_purple // channel: [mandatory] [ meta, purple_dir ] // Reference data @@ -41,9 +42,9 @@ workflow LILAC_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> @@ -161,22 +162,6 @@ workflow LILAC_CALLING { } - // Create channel for RNA BAMs - // channel: [ meta, tumor_rna_bam, tumor_rna_bai ] - ch_rna_inputs_ready = ch_inputs - .map { meta -> - - def bam = [] - def bai = [] - - if (Utils.hasTumorRnaBam(meta)) { - bam = Utils.getTumorRnaBam(meta) - bai = Utils.getTumorRnaBai(meta) - } - - return [meta, bam, bai] - } - // // MODULE: LILAC // @@ -184,7 +169,7 @@ workflow LILAC_CALLING { // channel: [ meta_lilac, normal_dna_bam, normal_dna_bai, tumor_dna_bam, tumor_dna_bai, tumor_rna_bam, tumor_rna_bai, purple_dir ] ch_lilac_inputs = WorkflowOncoanalyser.groupByMeta( ch_dna_inputs_ready, - ch_rna_inputs_ready, + ch_tumor_rna_bam, ch_purple, ) .map { meta, tbam_dna, tbai_dna, nbam_dna, nbai_dna, tbam_rna, tbai_rna, purple_dir -> diff --git a/subworkflows/local/prepare_reference.nf b/subworkflows/local/prepare_reference.nf index fc7b724f..c04a9540 100644 --- a/subworkflows/local/prepare_reference.nf +++ b/subworkflows/local/prepare_reference.nf @@ -14,6 +14,7 @@ include { BWA_INDEX } from '../../modules/nf-core/bwa/index/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_BWA_INDEX } from '../../modules/local/custom/extract_tarball/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_HMF_DATA } from '../../modules/local/custom/extract_tarball/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_PANEL_DATA } from '../../modules/local/custom/extract_tarball/main' +include { CUSTOM_EXTRACTTARBALL as DECOMP_STAR_INDEX } from '../../modules/local/custom/extract_tarball/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_VIRUSBREAKEND_DB } from '../../modules/local/custom/extract_tarball/main' 
include { GRIDSS_INDEX as GRIDSS_BWA_INDEX_IMAGE } from '../../modules/local/gridss/index/main' include { GRIDSS_INDEX as GRIDSS_INDEX } from '../../modules/local/gridss/index/main' @@ -103,6 +104,19 @@ workflow PREPARE_REFERENCE { ch_genome_bwa_index_bseq = Channel.value(params.ref_data_genome_bwa_index_bseq) ch_genome_bwa_index_biidx = Channel.value(params.ref_data_genome_bwa_index_biidx) + // + // Decompress STAR index + // + ch_genome_star_index = params.ref_data_genome_star_index ? file(params.ref_data_genome_star_index) : [] + if (run_config.has_rna_fastq && run_config.stages.alignment && params.ref_data_genome_star_index.endsWith('.tar.gz')) { + ch_genome_star_index_inputs = [ + [id: 'star_index'], + file(params.ref_data_genome_star_index), + ] + DECOMP_STAR_INDEX(ch_genome_star_index_inputs) + ch_genome_star_index = DECOMP_STAR_INDEX.out.dir + } + // // Set VIRUSBreakend database path / stage, unpack if required // @@ -181,6 +195,7 @@ workflow PREPARE_REFERENCE { genome_bwa_index_biidx = ch_genome_bwa_index_biidx // path: genome_bwa_index_biidx genome_bwa_index_image = ch_genome_bwa_index_image // path: genome_bwa_index_image genome_gridss_index = ch_genome_gridss_index // path: genome_gridss_index + genome_star_index = ch_genome_star_index // path: genome_star_index genome_version = params.ref_data_genome_version // val: genome_version virusbreakenddb = ch_virusbreakenddb // path: VIRUSBreakend database diff --git a/subworkflows/local/read_alignment.nf b/subworkflows/local/read_alignment_dna.nf similarity index 96% rename from subworkflows/local/read_alignment.nf rename to subworkflows/local/read_alignment_dna.nf index a876a132..a9529e57 100644 --- a/subworkflows/local/read_alignment.nf +++ b/subworkflows/local/read_alignment_dna.nf @@ -2,7 +2,7 @@ include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' include { FASTP } from '../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' -workflow READ_ALIGNMENT { +workflow READ_ALIGNMENT_DNA { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] @@ -162,7 +162,7 @@ workflow READ_ALIGNMENT { ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) // Combine BAMs and BAIs - // channel: [ meta, sample_type, bam, bai ] + // channel: [ meta_bwa, sample_type, bam, bai ] ch_bams_flat = WorkflowOncoanalyser.groupByMeta( BWA_MEM2.out.bam.map { meta_bwa, bam -> [meta_bwa, meta_bwa.sample_type] }, BWA_MEM2.out.bam, @@ -171,6 +171,7 @@ workflow READ_ALIGNMENT { // Reunite BAMs // First, count expected BAMs per sample for non-blocking groupTuple op + // channel: [ meta_count, group_size ] ch_sample_fastq_counts = ch_bwa_inputs .map { meta_bwa, reads_fwd, reads_rev -> @@ -184,12 +185,12 @@ workflow READ_ALIGNMENT { .groupTuple() .map { meta_count, meta_bwas -> return [meta_count, meta_bwas.size()] } - // Now, group with expected size then sort into tumor and normal channels + // channel: [ meta_group, [bam, ...], [bai, ...] 
] ch_bams_united = ch_sample_fastq_counts .cross( // First element to match meta_count above for `cross` - ch_bams_flat.map { meta, sample_type, bam, bai -> [[key: meta.key, sample_type: sample_type], bam, bai] } + ch_bams_flat.map { meta_bwa, sample_type, bam, bai -> [[key: meta_bwa.key, sample_type: sample_type], bam, bai] } ) .map { count_tuple, bam_tuple -> diff --git a/subworkflows/local/read_alignment_rna.nf b/subworkflows/local/read_alignment_rna.nf new file mode 100644 index 00000000..61736170 --- /dev/null +++ b/subworkflows/local/read_alignment_rna.nf @@ -0,0 +1,197 @@ +include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/gatk4/markduplicates/main' +include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' +include { SAMBAMBA_MERGE } from '../../modules/local/sambamba/merge/main' +include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' +include { STAR } from '../../modules/local/star/main' + +workflow READ_ALIGNMENT_RNA { + take: + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + + // Reference data + genome_star_index // channel: [mandatory] /path/to/genome_star_index/ + + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // Sort inputs + // channel: [ meta ] + ch_inputs_sorted = ch_inputs + .branch { meta -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_RNA_TUMOR) + runnable: Utils.hasTumorRnaFastq(meta) && !has_existing + skip: true + } + + // Create FASTQ input channel + // channel: [ meta_fastq, fastq_fwd, fastq_rev ] + ch_fastq_inputs = ch_inputs_sorted.runnable + .flatMap { meta -> + def meta_sample = Utils.getTumorRnaSample(meta) + meta_sample + .getAt(Constants.FileType.FASTQ) + .collect { key, fps -> + def (library_id, lane) = key + + def meta_fastq = [ + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + library_id: library_id, + lane: lane, + ] + + return [meta_fastq, fps['fwd'], fps['rev']] + } + } + + // + // MODULE: STAR + // + // Create process input channel + // channel: [ meta_star, fastq_fwd, fastq_rev ] + ch_star_inputs = ch_fastq_inputs + .map { meta_fastq, fastq_fwd, fastq_rev -> + def meta_star = [ + *:meta_fastq, + + + // TODO(SW): understand target format + read_group: "${meta_fastq.sample_id}.${meta_fastq.library_id}.${meta_fastq.lane}", + + + ] + + return [meta_star, fastq_fwd, fastq_rev] + } + + // Run process + STAR( + ch_star_inputs, + genome_star_index, + ) + + ch_versions = ch_versions.mix(STAR.out.versions) + + // + // MODULE: SAMtools sort + // + // Create process input channel + // channel: [ meta_sort, bam ] + ch_sort_inputs = STAR.out.bam + .map { meta_star, bam -> + def meta_sort = [ + *:meta_star, + prefix: meta_star.read_group, + ] + + return [meta_sort, bam] + } + + // Run process + SAMTOOLS_SORT( + ch_sort_inputs, + ) + + ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions) + + // + // MODULE: Sambamba index + // + SAMBAMBA_INDEX( + SAMTOOLS_SORT.out.bam, + ) + + ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) + + // + // MODULE: Sambamba merge + // + // Combine BAMs and BAIs + // channel: [ meta_star, bam, bai ] + ch_bams_flat = WorkflowOncoanalyser.groupByMeta( + SAMTOOLS_SORT.out.bam, + SAMBAMBA_INDEX.out.bai, + ) + + // Reunite BAMs + // First, count expected BAMs per sample for non-blocking groupTuple op + ch_sample_fastq_counts = ch_star_inputs + .map { meta_star, reads_fwd, reads_rev -> + def meta_count = [key: 
meta_star.key] + return [meta_count, meta_star] + } + .groupTuple() + .map { meta_count, meta_stars -> return [meta_count, meta_stars.size()] } + + // Now, group with expected size for merging + ch_bams_united = ch_sample_fastq_counts + .cross( + // First element to match meta_count above for `cross` + ch_bams_flat.map { meta_star, bam, bai -> [[key: meta_star.key], bam, bai] } + ) + .map { count_tuple, bam_tuple -> + + def group_size = count_tuple[1] + def (meta_bam, bam, bai) = bam_tuple + + def meta_group = [ + *:meta_bam, + ] + + return tuple(groupKey(meta_group, group_size), bam, bai) + } + .groupTuple() + + // Create process input channel + // channel: [ meta_merge, [bams, ...], [bais, ...] ] + ch_merge_inputs = WorkflowOncoanalyser.restoreMeta(ch_bams_united, ch_inputs) + .map { meta, bams, bais -> + def meta_merge = [ + key: meta.group_id, + id: meta.group_id, + sample_id: Utils.getTumorRnaSampleName(meta), + ] + return [meta_merge, bams, bais] + } + + // Run process + SAMBAMBA_MERGE( + ch_merge_inputs, + ) + + ch_versions = ch_versions.mix(SAMBAMBA_MERGE.out.versions) + + // + // MODULE: GATK4 markduplicates + // + GATK4_MARKDUPLICATES( + SAMBAMBA_MERGE.out.bam, + [], + [], + ) + + ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES.out.versions) + + // Combine BAMs and BAIs + // channel: [ meta, bam, bai ] + ch_bams_ready = WorkflowOncoanalyser.groupByMeta( + WorkflowOncoanalyser.restoreMeta(GATK4_MARKDUPLICATES.out.bam, ch_inputs), + WorkflowOncoanalyser.restoreMeta(GATK4_MARKDUPLICATES.out.bai, ch_inputs), + ) + + // Set outputs + // channel: [ meta, bam, bai ] + ch_bam_out = Channel.empty() + .mix( + ch_bams_ready, + ch_inputs_sorted.skip.map { meta -> [meta, [], []] }, + ) + + emit: + rna_tumor = ch_bam_out // channel: [ meta, bam, bai ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/sage_append.nf b/subworkflows/local/sage_append.nf index 29a9c438..ad2e892b 100644 --- a/subworkflows/local/sage_append.nf +++ b/subworkflows/local/sage_append.nf @@ -11,14 +11,15 @@ include { SAGE_APPEND as GERMLINE } from '../../modules/local/sage/append/main' workflow SAGE_APPEND { take: // Sample data - ch_inputs // channel: [mandatory] [ meta ] - ch_purple_dir // channel: [mandatory] [ meta, purple_dir ] + ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_rna_bam // channel: [mandatory] [ meta, bam, bai ] + ch_purple_dir // channel: [mandatory] [ meta, purple_dir ] // Reference data - genome_fasta // channel: [mandatory] /path/to/genome_fasta - genome_version // channel: [mandatory] genome version - genome_fai // channel: [mandatory] /path/to/genome_fai - genome_dict // channel: [mandatory] /path/to/genome_dict + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_version // channel: [mandatory] genome version + genome_fai // channel: [mandatory] /path/to/genome_fai + genome_dict // channel: [mandatory] /path/to/genome_dict main: // Channel for version.yml files @@ -26,17 +27,22 @@ workflow SAGE_APPEND { ch_versions = Channel.empty() // Select input sources and sort - // channel: runnable: [ meta, purple_dir ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, purple_dir ] // channel: skip: [ meta ] - ch_inputs_sorted = ch_purple_dir - .map { meta, purple_dir -> + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_rna_bam, + ch_purple_dir, + ) + .map { meta, tumor_bam, tumor_bai, purple_dir -> return [ meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_RNA_TUMOR),
+ Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_RNA_TUMOR), Utils.selectCurrentOrExisting(purple_dir, meta, Constants.INPUT.PURPLE_DIR), ] } - .branch { meta, purple_dir -> - runnable: purple_dir + .branch { meta, tumor_bam, tumor_bai, purple_dir -> + runnable: tumor_bam && purple_dir skip: true return meta } @@ -45,15 +51,15 @@ workflow SAGE_APPEND { // MODULE: SAGE append germline // // Select inputs that are eligible to run - // channel: runnable: [ meta, purple_dir ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, purple_dir ] // channel: skip: [ meta ] ch_inputs_germline_sorted = ch_inputs_sorted.runnable - .branch { meta, purple_dir -> + .branch { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) - def has_tumor_rna = Utils.hasTumorRnaBam(meta) + def has_normal_dna = Utils.hasNormalDna(meta) + def has_tumor_rna = Utils.hasTumorRna(meta) def has_smlv_germline = file(purple_dir).resolve("${tumor_dna_id}.purple.germline.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_APPEND_VCF_NORMAL) @@ -63,9 +69,9 @@ workflow SAGE_APPEND { } // Create process input channel - // channel: [ meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai ] + // channel: [ meta_append, purple_smlv_vcf, tumor_bam, tumor_bai ] ch_sage_append_germline_inputs = ch_inputs_germline_sorted.runnable - .map { meta, purple_dir -> + .map { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) @@ -76,11 +82,9 @@ workflow SAGE_APPEND { dna_id: Utils.getNormalDnaSampleName(meta), ] - def tumor_rna_bam = Utils.getTumorRnaBam(meta) - def tumor_rna_bai = Utils.getTumorRnaBai(meta) def purple_smlv_vcf = file(purple_dir).resolve("${tumor_dna_id}.purple.germline.vcf.gz") - return [meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai] + return [meta_append, purple_smlv_vcf, tumor_bam, tumor_bai] } // Run process @@ -98,14 +102,14 @@ workflow SAGE_APPEND { // MODULE: SAGE append somatic // // Select inputs that are eligible to run - // channel: runnable: [ meta, purple_dir ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, purple_dir ] // channel: skip: [ meta ] ch_inputs_somatic_sorted = ch_inputs_sorted.runnable - .branch { meta, purple_dir -> + .branch { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) def has_tumor_dna = Utils.hasTumorDna(meta) - def has_tumor_rna = Utils.hasTumorRnaBam(meta) + def has_tumor_rna = Utils.hasTumorRna(meta) def has_smlv_somatic = file(purple_dir).resolve("${tumor_dna_id}.purple.somatic.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_APPEND_VCF_TUMOR) @@ -115,9 +119,9 @@ workflow SAGE_APPEND { } // Create process input channel - // channel: [ meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai ] + // channel: [ meta_append, purple_smlv_vcf, tumor_bam, tumor_bai ] ch_sage_append_somatic_inputs = ch_inputs_somatic_sorted.runnable - .map { meta, purple_dir -> + .map { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) @@ -128,11 +132,9 @@ workflow SAGE_APPEND { dna_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_rna_bam = Utils.getTumorRnaBam(meta) - def tumor_rna_bai = Utils.getTumorRnaBai(meta) def purple_smlv_vcf = file(purple_dir).resolve("${tumor_dna_id}.purple.somatic.vcf.gz") - return [meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai] + return 
[meta_append, purple_smlv_vcf, tumor_bam, tumor_bai] } // Run process diff --git a/subworkflows/local/sage_calling.nf b/subworkflows/local/sage_calling.nf index cdec723b..92342463 100644 --- a/subworkflows/local/sage_calling.nf +++ b/subworkflows/local/sage_calling.nf @@ -45,9 +45,9 @@ workflow SAGE_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_NORMAL), + Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/virusbreakend_calling.nf b/subworkflows/local/virusbreakend_calling.nf index 87e16488..c13c940b 100644 --- a/subworkflows/local/virusbreakend_calling.nf +++ b/subworkflows/local/virusbreakend_calling.nf @@ -43,7 +43,7 @@ workflow VIRUSBREAKEND_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), ] } .branch { meta, tumor_bam, tumor_bai -> diff --git a/workflows/targeted.nf b/workflows/targeted.nf index d8507996..41008cf1 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -79,7 +79,8 @@ include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting' include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation' include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference' include { PURPLE_CALLING } from '../subworkflows/local/purple_calling' -include { READ_ALIGNMENT } from '../subworkflows/local/read_alignment' +include { READ_ALIGNMENT_DNA } from '../subworkflows/local/read_alignment_dna' +include { READ_ALIGNMENT_RNA } from '../subworkflows/local/read_alignment_rna' include { READ_PROCESSING } from '../subworkflows/local/read_processing' include { SAGE_APPEND } from '../subworkflows/local/sage_append' include { SAGE_CALLING } from '../subworkflows/local/sage_calling' @@ -105,7 +106,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft samplesheet = Utils.getFileObject(params.input) workflow TARGETED { - // Create channel for versions // channel: [ versions.yml ] ch_versions = Channel.empty() @@ -126,14 +126,15 @@ workflow TARGETED { gridss_config = params.containsKey('gridss_config') ? file(params.gridss_config) : hmf_data.gridss_config // - // SUBWORKFLOW: Align reads + // SUBWORKFLOW: Run read alignment to generate BAMs // // channel: [ meta, [bam, ...], [bai, ...] 
] ch_align_dna_tumor_out = Channel.empty() ch_align_dna_normal_out = Channel.empty() + ch_align_rna_tumor_out = Channel.empty() if (run_config.stages.alignment) { - READ_ALIGNMENT( + READ_ALIGNMENT_DNA( ch_inputs, ref_data.genome_fasta, ref_data.genome_bwa_index, @@ -142,20 +143,30 @@ workflow TARGETED { params.max_fastq_records, ) - ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + READ_ALIGNMENT_RNA( + ch_inputs, + ref_data.genome_star_index, + ) + + ch_versions = ch_versions.mix( + READ_ALIGNMENT_DNA.out.versions, + READ_ALIGNMENT_RNA.out.versions, + ) - ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT.out.dna_tumor) - ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT.out.dna_normal) + ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT_DNA.out.dna_tumor) + ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT_DNA.out.dna_normal) + ch_align_rna_tumor_out = ch_align_rna_tumor_out.mix(READ_ALIGNMENT_RNA.out.rna_tumor) } else { ch_align_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } ch_align_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] } + ch_align_rna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } } // - // SUBWORKFLOW: Process read alignments + // SUBWORKFLOW: Run MarkDups for DNA BAMs // // channel: [ meta, bam, bai ] ch_process_dna_tumor_out = Channel.empty() @@ -204,6 +215,7 @@ workflow TARGETED { ISOFOX_QUANTIFICATION( ch_inputs, + ch_align_rna_tumor_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -483,6 +495,7 @@ workflow TARGETED { SAGE_APPEND( ch_inputs, + ch_align_rna_tumor_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -624,6 +637,7 @@ workflow TARGETED { ch_inputs, ch_process_dna_tumor_out, ch_process_dna_normal_out, + ch_align_rna_tumor_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, diff --git a/workflows/wgts.nf b/workflows/wgts.nf index aa706180..8405a5fa 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -83,7 +83,8 @@ include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting' include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation' include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference' include { PURPLE_CALLING } from '../subworkflows/local/purple_calling' -include { READ_ALIGNMENT } from '../subworkflows/local/read_alignment' +include { READ_ALIGNMENT_DNA } from '../subworkflows/local/read_alignment_dna' +include { READ_ALIGNMENT_RNA } from '../subworkflows/local/read_alignment_rna' include { READ_PROCESSING } from '../subworkflows/local/read_processing' include { SAGE_APPEND } from '../subworkflows/local/sage_append' include { SAGE_CALLING } from '../subworkflows/local/sage_calling' @@ -133,14 +134,15 @@ workflow WGTS { gridss_config = params.containsKey('gridss_config') ? file(params.gridss_config) : hmf_data.gridss_config // - // SUBWORKFLOW: Align reads + // SUBWORKFLOW: Run read alignment to generate BAMs // // channel: [ meta, [bam, ...], [bai, ...] 
] ch_align_dna_tumor_out = Channel.empty() ch_align_dna_normal_out = Channel.empty() + ch_align_rna_tumor_out = Channel.empty() if (run_config.stages.alignment) { - READ_ALIGNMENT( + READ_ALIGNMENT_DNA( ch_inputs, ref_data.genome_fasta, ref_data.genome_bwa_index, @@ -149,20 +151,30 @@ workflow WGTS { params.max_fastq_records, ) - ch_versions = ch_versions.mix(READ_ALIGNMENT.out.versions) + READ_ALIGNMENT_RNA( + ch_inputs, + ref_data.genome_star_index, + ) + + ch_versions = ch_versions.mix( + READ_ALIGNMENT_DNA.out.versions, + READ_ALIGNMENT_RNA.out.versions, + ) - ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT.out.dna_tumor) - ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT.out.dna_normal) + ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT_DNA.out.dna_tumor) + ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT_DNA.out.dna_normal) + ch_align_rna_tumor_out = ch_align_rna_tumor_out.mix(READ_ALIGNMENT_RNA.out.rna_tumor) } else { ch_align_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } ch_align_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] } + ch_align_rna_tumor_out = ch_inputs.map { meta -> [meta, [], []] } } // - // SUBWORKFLOW: Process read alignments + // SUBWORKFLOW: Run MarkDups for DNA BAMs // // channel: [ meta, bam, bai ] ch_process_dna_tumor_out = Channel.empty() @@ -205,6 +217,7 @@ workflow WGTS { ISOFOX_QUANTIFICATION( ch_inputs, + ch_align_rna_tumor_out, ref_data.genome_fasta, ref_data.genome_version, ref_data.genome_fai, @@ -485,6 +498,7 @@ workflow WGTS { SAGE_APPEND( ch_inputs, + ch_align_rna_tumor_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, @@ -671,6 +685,7 @@ workflow WGTS { ch_inputs, ch_process_dna_tumor_out, ch_process_dna_normal_out, + ch_align_rna_tumor_out, ch_purple_out, ref_data.genome_fasta, ref_data.genome_version, From 3220f7bd4f5653ff0836b76b1c4ddb9d9b7974e4 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 15:14:34 +1100 Subject: [PATCH 49/86] Move, rename bwa-mem2 module --- modules/local/{bwa/mem2 => bwa-mem2/mem}/Dockerfile | 0 modules/local/{bwa/mem2 => bwa-mem2/mem}/main.nf | 0 subworkflows/local/read_alignment_dna.nf | 12 ++++++------ 3 files changed, 6 insertions(+), 6 deletions(-) rename modules/local/{bwa/mem2 => bwa-mem2/mem}/Dockerfile (100%) rename modules/local/{bwa/mem2 => bwa-mem2/mem}/main.nf (100%) diff --git a/modules/local/bwa/mem2/Dockerfile b/modules/local/bwa-mem2/mem/Dockerfile similarity index 100% rename from modules/local/bwa/mem2/Dockerfile rename to modules/local/bwa-mem2/mem/Dockerfile diff --git a/modules/local/bwa/mem2/main.nf b/modules/local/bwa-mem2/mem/main.nf similarity index 100% rename from modules/local/bwa/mem2/main.nf rename to modules/local/bwa-mem2/mem/main.nf diff --git a/subworkflows/local/read_alignment_dna.nf b/subworkflows/local/read_alignment_dna.nf index a9529e57..bd4bea84 100644 --- a/subworkflows/local/read_alignment_dna.nf +++ b/subworkflows/local/read_alignment_dna.nf @@ -1,4 +1,4 @@ -include { BWA_MEM2 } from '../../modules/local/bwa/mem2/main' +include { BWAMEM2_ALIGN } from '../../modules/local/bwa-mem2/mem/main' include { FASTP } from '../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' @@ -142,7 +142,7 @@ workflow READ_ALIGNMENT_DNA { } // Run process - BWA_MEM2( + BWAMEM2_ALIGN( ch_bwa_inputs, genome_fasta, genome_bwa_index, @@ -150,13 +150,13 @@ workflow READ_ALIGNMENT_DNA { genome_bwa_index_biidx, ) - ch_versions = 
ch_versions.mix(BWA_MEM2.out.versions) + ch_versions = ch_versions.mix(BWAMEM2_ALIGN.out.versions) // // MODULE: Sambamba index // SAMBAMBA_INDEX( - BWA_MEM2.out.bam, + BWAMEM2_ALIGN.out.bam, ) ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) @@ -164,8 +164,8 @@ workflow READ_ALIGNMENT_DNA { // Combine BAMs and BAIs // channel: [ meta_bwa, sample_type, bam, bai ] ch_bams_flat = WorkflowOncoanalyser.groupByMeta( - BWA_MEM2.out.bam.map { meta_bwa, bam -> [meta_bwa, meta_bwa.sample_type] }, - BWA_MEM2.out.bam, + BWAMEM2_ALIGN.out.bam.map { meta_bwa, bam -> [meta_bwa, meta_bwa.sample_type] }, + BWAMEM2_ALIGN.out.bam, SAMBAMBA_INDEX.out.bai, ) From b9e88fe5fd2edef31a552587b7be8a6656b4ae48 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 15:34:29 +1100 Subject: [PATCH 50/86] Fix bwa-mem2 process name --- modules/local/bwa-mem2/mem/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf index ee76a9cc..280bb829 100644 --- a/modules/local/bwa-mem2/mem/main.nf +++ b/modules/local/bwa-mem2/mem/main.nf @@ -1,4 +1,4 @@ -process BWA_MEM2 { +process BWAMEM2_ALIGN { tag "${meta.id}" label 'process_high' From 781d883a80a1757587d1026584e07c3b1cd0b298 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 15:38:51 +1100 Subject: [PATCH 51/86] Set STAR process label to 'process_high' --- modules/local/star/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf index fc906880..2213bf47 100644 --- a/modules/local/star/main.nf +++ b/modules/local/star/main.nf @@ -1,6 +1,6 @@ process STAR { tag "${meta.id}" - label 'process_medium' + label 'process_high' container 'quay.io/biocontainers/star:2.7.3a--0' From 063aa3a08ccd0d13156c6296480cb18852b6f225 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 15:58:46 +1100 Subject: [PATCH 52/86] Fix incomplete container URL for SAMtools sort --- modules/nf-core/samtools/sort/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf index 18a3e1dc..8aaf9a5b 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -5,7 +5,7 @@ process SAMTOOLS_SORT { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : - 'biocontainers/samtools:1.18--h50ea8bc_1' }" + 'quay.io/biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam) From 914edaba3c20845c5298cfc90f78eb02c9a38c7f Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 16:11:14 +1100 Subject: [PATCH 53/86] Handle when read count less than max read split --- modules/local/sambamba/merge/main.nf | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/modules/local/sambamba/merge/main.nf b/modules/local/sambamba/merge/main.nf index 2551ed2f..f0590530 100644 --- a/modules/local/sambamba/merge/main.nf +++ b/modules/local/sambamba/merge/main.nf @@ -14,10 +14,19 @@ process SAMBAMBA_MERGE { script: """ - sambamba merge \\ - --nthreads ${task.cpus} \\ - ${meta.sample_id}.bam \\ - ${bams} + # NOTE(SW): single BAM when read count is less than max split count; merge expects at least two BAMs + if [[ \$(tr -cd ' ' <<< ${bams} | wc -c) -eq 0 ]]; then + + ln -s ${bams} ${meta.sample_id}.bam; + + else + + sambamba merge \\ + --nthreads ${task.cpus} \\ + ${meta.sample_id}.bam \\ + ${bams} + + fi; cat <<-END_VERSIONS > versions.yml "${task.process}": From 2d8c76cb798c6317491bb58a2b23f2568c696318 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 16:33:54 +1100 Subject: [PATCH 54/86] Correctly format RG arg for STAR --- modules/local/star/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf index 2213bf47..b58868ca 100644 --- a/modules/local/star/main.nf +++ b/modules/local/star/main.nf @@ -41,7 +41,7 @@ process STAR { --outFilterMultimapNmax 10 \\ --outFilterScoreMinOverLread 0.33 \\ --outSAMattributes All \\ - --outSAMattrRGline 'ID:${meta.read_group} SM:${meta.sample_id}' \\ + --outSAMattrRGline ID:${meta.read_group} SM:${meta.sample_id} \\ --outSAMtype BAM Unsorted \\ --outSAMunmapped Within \\ --runRNGseed 0 From 8749d4524d18e5ce6cae28c7871f2c7c14f52a11 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 16:50:28 +1100 Subject: [PATCH 55/86] Fix bwa-mem2 bi-index variable name typo --- conf/hmf_genomes.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/hmf_genomes.config b/conf/hmf_genomes.config index 868315a9..42b4fba7 100644 --- a/conf/hmf_genomes.config +++ b/conf/hmf_genomes.config @@ -26,7 +26,7 @@ params { dict = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.dict" bwa_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/0.7.17-r1188.tar.gz" bwa_index_bseq = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.0123" - bwa_index_biiseq= "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64" + bwa_index_biidx = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64" bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index_image/0.7.17-r1188/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.img" gridss_index = 
"https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/gridss_index/2.13.2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gridsscache" star_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/star_index/gencode_38/2.7.3a.tar.gz" From de1aad043cdfef72832035635b4c499ed0ebd0a4 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 17:03:23 +1100 Subject: [PATCH 56/86] Adjust, fix bwa-mem2 index handling --- conf/hmf_genomes.config | 4 ++-- subworkflows/local/prepare_reference.nf | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/hmf_genomes.config b/conf/hmf_genomes.config index 42b4fba7..e067a3a0 100644 --- a/conf/hmf_genomes.config +++ b/conf/hmf_genomes.config @@ -25,8 +25,8 @@ params { fai = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai" dict = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.dict" bwa_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/0.7.17-r1188.tar.gz" - bwa_index_bseq = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.0123" - bwa_index_biidx = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64" + bwa_index_bseq = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.0123" + bwa_index_biidx = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64" bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index_image/0.7.17-r1188/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.img" gridss_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/gridss_index/2.13.2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gridsscache" star_index = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/star_index/gencode_38/2.7.3a.tar.gz" diff --git a/subworkflows/local/prepare_reference.nf b/subworkflows/local/prepare_reference.nf index c04a9540..453d4ec9 100644 --- a/subworkflows/local/prepare_reference.nf +++ b/subworkflows/local/prepare_reference.nf @@ -100,9 +100,9 @@ workflow PREPARE_REFERENCE { } } - // Explicitly create value channels for BWA MEM2 index files - ch_genome_bwa_index_bseq = Channel.value(params.ref_data_genome_bwa_index_bseq) - ch_genome_bwa_index_biidx = Channel.value(params.ref_data_genome_bwa_index_biidx) + // Explicitly set BWA MEM2 index file inputs + ch_genome_bwa_index_bseq = file(params.ref_data_genome_bwa_index_bseq) + ch_genome_bwa_index_biidx = file(params.ref_data_genome_bwa_index_biidx) // // Decompress STAR index From 7e0fb7f018ec5489dd3531010fc02ff3eeb8ce65 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 29 Feb 2024 17:46:08 +1100 Subject: [PATCH 57/86] Remove deprecated MarkDups `-multi_bam` argument --- modules/local/markdups/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index a1805036..18dbd4eb 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -37,7 +37,6 @@ process MARKDUPS { -input_bam 
${bams.join(',')} \\ \\ -form_consensus \\ - -multi_bam \\ ${umi_flags} \\ \\ -unmap_regions ${unmap_regions} \\ From f47f33ea30d49854b9bef931532d548f215d6722 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:23:33 +1100 Subject: [PATCH 58/86] Fix AMBER subworkflow TN mode --- subworkflows/local/amber_profiling.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/amber_profiling.nf b/subworkflows/local/amber_profiling.nf index ab156969..2a21d6b4 100644 --- a/subworkflows/local/amber_profiling.nf +++ b/subworkflows/local/amber_profiling.nf @@ -58,6 +58,10 @@ workflow AMBER_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] + if (normal_bam) { + meta_amber.normal_id = Utils.getNormalDnaSampleName(meta) + } + [meta_amber, tumor_bam, normal_bam, tumor_bai, normal_bai] } From 08a58c81acde51e45aa62d654144991bcb68ae1d Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:23:44 +1100 Subject: [PATCH 59/86] Fix COBALT subworkflow TN mode --- subworkflows/local/cobalt_profiling.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/cobalt_profiling.nf b/subworkflows/local/cobalt_profiling.nf index c425cd55..15c56355 100644 --- a/subworkflows/local/cobalt_profiling.nf +++ b/subworkflows/local/cobalt_profiling.nf @@ -70,6 +70,10 @@ workflow COBALT_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] + if (normal_bam) { + meta_cobalt.normal_id = Utils.getNormalDnaSampleName(meta) + } + sample_data: [meta_cobalt, tumor_bam, normal_bam, tumor_bai, normal_bai] diploid_bed: diploid_bed } From c53ac44cb5c1a61f2e75f52e06afe465e7b1388f Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:24:29 +1100 Subject: [PATCH 60/86] Adjust indentation --- modules/local/bwa-mem2/mem/main.nf | 36 +++++++++++------------ modules/local/fastp/main.nf | 18 ++++++------ modules/local/markdups/main.nf | 44 ++++++++++++++-------------- modules/local/sambamba/index/main.nf | 4 +-- 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf index 280bb829..2ae102b6 100644 --- a/modules/local/bwa-mem2/mem/main.nf +++ b/modules/local/bwa-mem2/mem/main.nf @@ -26,24 +26,24 @@ process BWAMEM2_ALIGN { ln -fs \$(find -L ${genome_bwa_index} -type f) ./ bwa-mem2 mem \\ - -Y \\ - -R '${read_group_tag}' \\ - -t ${task.cpus} \\ - ${genome_fasta} \\ - ${reads_fwd} \\ - ${reads_rev} | \\ - \\ - sambamba view \\ - --sam-input \\ - --format bam \\ - --compression-level 0 \\ - --nthreads ${task.cpus} \\ - /dev/stdin | \\ - \\ - sambamba sort \\ - --nthreads ${task.cpus} \\ - --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ - /dev/stdin + -Y \\ + -R '${read_group_tag}' \\ + -t ${task.cpus} \\ + ${genome_fasta} \\ + ${reads_fwd} \\ + ${reads_rev} | \\ + \\ + sambamba view \\ + --sam-input \\ + --format bam \\ + --compression-level 0 \\ + --nthreads ${task.cpus} \\ + /dev/stdin | \\ + \\ + sambamba sort \\ + --nthreads ${task.cpus} \\ + --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ + /dev/stdin cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 9b1121a9..8bd47159 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -23,15 +23,15 @@ process FASTP { # * do not process umis, already done for us fastp \\ - --in1 ${reads_fwd} \\ - --in2 ${reads_rev} \\ - --disable_quality_filtering \\ - --disable_length_filtering \\ - 
--disable_adapter_trimming \\ - --disable_trim_poly_g \\ - --split_by_lines ${4 * max_fastq_records} \\ - --out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\ - --out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz + --in1 ${reads_fwd} \\ + --in2 ${reads_rev} \\ + --disable_quality_filtering \\ + --disable_length_filtering \\ + --disable_adapter_trimming \\ + --disable_trim_poly_g \\ + --split_by_lines ${4 * max_fastq_records} \\ + --out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\ + --out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 18dbd4eb..e9f17f5d 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -28,25 +28,25 @@ process MARKDUPS { """ markdups \\ - -Xmx${Math.round(task.memory.bytes * 0.95)} \\ - \\ - -samtools \$(which samtools) \\ - -sambamba \$(which sambamba) \\ - \\ - -sample ${meta.sample_id} \\ - -input_bam ${bams.join(',')} \\ - \\ - -form_consensus \\ - ${umi_flags} \\ - \\ - -unmap_regions ${unmap_regions} \\ - -ref_genome ${genome_fasta} \\ - -ref_genome_version ${genome_ver} \\ - \\ - -write_stats \\ - -threads ${task.cpus} \\ - \\ - -output_bam ${meta.sample_id}.markdups.bam + -Xmx${Math.round(task.memory.bytes * 0.95)} \\ + \\ + -samtools \$(which samtools) \\ + -sambamba \$(which sambamba) \\ + \\ + -sample ${meta.sample_id} \\ + -input_bam ${bams.join(',')} \\ + \\ + -form_consensus \\ + ${umi_flags} \\ + \\ + -unmap_regions ${unmap_regions} \\ + -ref_genome ${genome_fasta} \\ + -ref_genome_version ${genome_ver} \\ + \\ + -write_stats \\ + -threads ${task.cpus} \\ + \\ + -output_bam ${meta.sample_id}.markdups.bam cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -63,9 +63,9 @@ process MARKDUPS { touch ${meta.sample_id}.duplicate_freq.tsv if [[ -n "${has_umis}" ]]; then - touch ${meta.sample_id}.umi_coord_freq.tsv - touch ${meta.sample_id}.umi_edit_distance.tsv - touch ${meta.sample_id}.umi_nucleotide_freq.tsv + touch ${meta.sample_id}.umi_coord_freq.tsv + touch ${meta.sample_id}.umi_edit_distance.tsv + touch ${meta.sample_id}.umi_nucleotide_freq.tsv fi; echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index d3e3a2d2..56bb38af 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -18,8 +18,8 @@ process SAMBAMBA_INDEX { script: """ sambamba index \\ - --nthreads ${task.cpus} \\ - ${bam} + --nthreads ${task.cpus} \\ + ${bam} cat <<-END_VERSIONS > versions.yml "${task.process}": From df1f5b78bc7c985c08719de8ad08dcfe5d1f4ed6 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:24:44 +1100 Subject: [PATCH 61/86] Fix SAGE calling subworkflow TN mode --- subworkflows/local/sage_calling.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/sage_calling.nf b/subworkflows/local/sage_calling.nf index 92342463..f3f18a15 100644 --- a/subworkflows/local/sage_calling.nf +++ b/subworkflows/local/sage_calling.nf @@ -131,6 +131,10 @@ workflow SAGE_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta), ] + if (normal_bam) { + meta_sage.normal_id = Utils.getNormalDnaSampleName(meta) + } + return [meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai] } From 752f57ac5158e0a035175f2a43e2396c123c3659 Mon Sep 17 00:00:00 2001 From: 
Stephen Watts Date: Fri, 1 Mar 2024 08:25:42 +1100 Subject: [PATCH 62/86] Improving handling of 'no merge' RNA BAM scenarios --- modules/local/sambamba/merge/main.nf | 11 +-------- subworkflows/local/read_alignment_rna.nf | 29 ++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/modules/local/sambamba/merge/main.nf b/modules/local/sambamba/merge/main.nf index f0590530..06993865 100644 --- a/modules/local/sambamba/merge/main.nf +++ b/modules/local/sambamba/merge/main.nf @@ -14,20 +14,11 @@ process SAMBAMBA_MERGE { script: """ - # NOTE(SW): single BAM when read count is less than max split count; merge expects at least two BAMs - if [[ \$(tr -cd ' ' <<< ${bams} | wc -c) -eq 0 ]]; then - - ln -s ${bams} ${meta.sample_id}.bam; - - else - - sambamba merge \\ + sambamba merge \\ --nthreads ${task.cpus} \\ ${meta.sample_id}.bam \\ ${bams} - fi; - cat <<-END_VERSIONS > versions.yml "${task.process}": sambamba: \$(sambamba --version 2>&1 | grep -m1 sambamba | sed 's/^sambamba //') diff --git a/subworkflows/local/read_alignment_rna.nf b/subworkflows/local/read_alignment_rna.nf index 61736170..7767e2a0 100644 --- a/subworkflows/local/read_alignment_rna.nf +++ b/subworkflows/local/read_alignment_rna.nf @@ -146,9 +146,17 @@ workflow READ_ALIGNMENT_RNA { } .groupTuple() + // Sort into merge-eligible BAMs (at least two BAMs required) + ch_bams_united_sorted = ch_bams_united + .branch { meta_group, bams, bais -> + runnable: bams.size() > 1 + skip: + return [meta_group, bams[0], bais[0]] + } + // Create process input channel // channel: [ meta_merge, [bams, ...], [bais, ...] ] - ch_merge_inputs = WorkflowOncoanalyser.restoreMeta(ch_bams_united, ch_inputs) + ch_merge_inputs = WorkflowOncoanalyser.restoreMeta(ch_bams_united_sorted.runnable, ch_inputs) .map { meta, bams, bais -> def meta_merge = [ key: meta.group_id, @@ -168,8 +176,25 @@ workflow READ_ALIGNMENT_RNA { // // MODULE: GATK4 markduplicates // + // Create process input channel + // channel: [ meta_markdups, bam, bai ] + ch_markdups_inputs = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(SAMBAMBA_MERGE.out.bam, ch_inputs), + WorkflowOncoanalyser.restoreMeta(ch_bams_united_sorted.skip, ch_inputs), + ) + .map { meta, bam, bai -> + def meta_markdups = [ + key: meta.group_id, + id: meta.group_id, + sample_id: Utils.getTumorRnaSampleName(meta), + ] + return [meta_markdups, bams, bais] + } + + // Run process GATK4_MARKDUPLICATES( - SAMBAMBA_MERGE.out.bam, + ch_markdups_inputs, [], [], ) From c648028e7ca07ce941dedcd41fc86bfcdb4b5c53 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:26:30 +1100 Subject: [PATCH 63/86] Set outputs for alignment workflows --- conf/modules.config | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index a0cd58f9..f6de9265 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,6 +12,22 @@ process { + withName: 'GATK4_MARKDUPLICATES' { + publishDir = [ + path: { "${params.outdir}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.key}/alignments/rna/${filename}" }, + ] + } + + withName: 'MARKDUPS' { + publishDir = [ + path: { "${params.outdir}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : "${meta.key}/alignments/dna/${filename}" }, + ] + } + withName: 'AMBER' { publishDir = [ path: { "${params.outdir}" }, From db6bbf27f2510360b92b78cc3655d582767df5ee Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:27:38 +1100 Subject: [PATCH 64/86] Add missing channel docs --- subworkflows/local/read_alignment_rna.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/local/read_alignment_rna.nf b/subworkflows/local/read_alignment_rna.nf index 7767e2a0..ea9ecc07 100644 --- a/subworkflows/local/read_alignment_rna.nf +++ b/subworkflows/local/read_alignment_rna.nf @@ -119,6 +119,7 @@ workflow READ_ALIGNMENT_RNA { // Reunite BAMs // First, count expected BAMs per sample for non-blocking groupTuple op + // channel: [ meta_count, group_size ] ch_sample_fastq_counts = ch_star_inputs .map { meta_star, reads_fwd, reads_rev -> def meta_count = [key: meta_star.key] @@ -128,6 +129,7 @@ workflow READ_ALIGNMENT_RNA { .map { meta_count, meta_stars -> return [meta_count, meta_stars.size()] } // Now, group with expected size then sort into tumor and normal channels + // channel: [ meta_group, [bam, ...], [bai, ...] ] ch_bams_united = ch_sample_fastq_counts .cross( // First element to match meta_count above for `cross` From c1f314225b0e06bd7e7704f2c44f652d1d1bef46 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:43:38 +1100 Subject: [PATCH 65/86] Further work on RNA BAM handling --- subworkflows/local/read_alignment_rna.nf | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/read_alignment_rna.nf b/subworkflows/local/read_alignment_rna.nf index ea9ecc07..eefa939d 100644 --- a/subworkflows/local/read_alignment_rna.nf +++ b/subworkflows/local/read_alignment_rna.nf @@ -149,11 +149,13 @@ workflow READ_ALIGNMENT_RNA { .groupTuple() // Sort into merge-eligible BAMs (at least two BAMs required) + // channel: runnable: [ meta_group, [bam, ...], [bai, ...] 
] + // channel: skip: [ meta_group, bam ] ch_bams_united_sorted = ch_bams_united .branch { meta_group, bams, bais -> runnable: bams.size() > 1 - skip: - return [meta_group, bams[0], bais[0]] + skip: true + return [meta_group, bams[0]] } // Create process input channel @@ -179,19 +181,19 @@ workflow READ_ALIGNMENT_RNA { // MODULE: GATK4 markduplicates // // Create process input channel - // channel: [ meta_markdups, bam, bai ] + // channel: [ meta_markdups, bam ] ch_markdups_inputs = Channel.empty() .mix( WorkflowOncoanalyser.restoreMeta(SAMBAMBA_MERGE.out.bam, ch_inputs), WorkflowOncoanalyser.restoreMeta(ch_bams_united_sorted.skip, ch_inputs), ) - .map { meta, bam, bai -> + .map { meta, bam -> def meta_markdups = [ key: meta.group_id, id: meta.group_id, sample_id: Utils.getTumorRnaSampleName(meta), ] - return [meta_markdups, bams, bais] + return [meta_markdups, bam] } // Run process From bd45719bc84ce8ec2da1e65165b7ed97ce58a505 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 08:44:32 +1100 Subject: [PATCH 66/86] Use explicit returns in .branch ops --- subworkflows/local/amber_profiling.nf | 2 +- subworkflows/local/bamtools_metrics.nf | 4 ++-- subworkflows/local/cobalt_profiling.nf | 2 +- subworkflows/local/flagstat_metrics.nf | 4 ++-- subworkflows/local/gridss_svprep_calling.nf | 4 ++-- subworkflows/local/isofox_quantification.nf | 2 +- subworkflows/local/lilac_calling.nf | 2 +- subworkflows/local/sage_calling.nf | 6 +++--- subworkflows/local/virusbreakend_calling.nf | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/amber_profiling.nf b/subworkflows/local/amber_profiling.nf index 2a21d6b4..1cba9d18 100644 --- a/subworkflows/local/amber_profiling.nf +++ b/subworkflows/local/amber_profiling.nf @@ -44,7 +44,7 @@ workflow AMBER_PROFILING { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.AMBER_DIR) runnable: tumor_bam && !has_existing skip: true - meta + return meta } // Create process input channel diff --git a/subworkflows/local/bamtools_metrics.nf b/subworkflows/local/bamtools_metrics.nf index ca446828..c1ccbf9a 100644 --- a/subworkflows/local/bamtools_metrics.nf +++ b/subworkflows/local/bamtools_metrics.nf @@ -38,7 +38,7 @@ workflow BAMTOOLS_METRICS { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAMTOOLS_TUMOR) runnable: bam && !has_existing skip: true - meta + return meta } // channel: runnable: [ meta, bam, bai ] @@ -55,7 +55,7 @@ workflow BAMTOOLS_METRICS { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAMTOOLS_NORMAL) runnable: bam && !has_existing skip: true - meta + return meta } // Create process input channel diff --git a/subworkflows/local/cobalt_profiling.nf b/subworkflows/local/cobalt_profiling.nf index 15c56355..b121ec6c 100644 --- a/subworkflows/local/cobalt_profiling.nf +++ b/subworkflows/local/cobalt_profiling.nf @@ -46,7 +46,7 @@ workflow COBALT_PROFILING { runnable_tn: tumor_bam && normal_bam && !has_existing runnable_to: tumor_bam && !has_existing skip: true - meta + return meta } // First set diploid BED input for tumor/normal and tumor only samples diff --git a/subworkflows/local/flagstat_metrics.nf b/subworkflows/local/flagstat_metrics.nf index 83be4cd8..cfa8b36c 100644 --- a/subworkflows/local/flagstat_metrics.nf +++ b/subworkflows/local/flagstat_metrics.nf @@ -34,7 +34,7 @@ workflow FLAGSTAT_METRICS { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.FLAGSTAT_TUMOR) runnable: bam && !has_existing skip: true - meta + return meta } // channel: 
runnable: [ meta, bam, bai ] @@ -51,7 +51,7 @@ workflow FLAGSTAT_METRICS { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.FLAGSTAT_NORMAL) runnable: bam && !has_existing skip: true - meta + return meta } // Create process input channel diff --git a/subworkflows/local/gridss_svprep_calling.nf b/subworkflows/local/gridss_svprep_calling.nf index 7bbe3d79..87b50b03 100644 --- a/subworkflows/local/gridss_svprep_calling.nf +++ b/subworkflows/local/gridss_svprep_calling.nf @@ -63,9 +63,9 @@ workflow GRIDSS_SVPREP_CALLING { runnable_tn: tumor_bam && normal_bam && !has_existing runnable_to: tumor_bam && !has_existing - [meta, tumor_bam, tumor_bai] + return [meta, tumor_bam, tumor_bai] skip: true - meta + return meta } // diff --git a/subworkflows/local/isofox_quantification.nf b/subworkflows/local/isofox_quantification.nf index 7e40c379..4859232c 100644 --- a/subworkflows/local/isofox_quantification.nf +++ b/subworkflows/local/isofox_quantification.nf @@ -47,7 +47,7 @@ workflow ISOFOX_QUANTIFICATION { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.ISOFOX_DIR) runnable: tumor_bam && !has_existing skip: true - meta + return meta } // Create process input channel diff --git a/subworkflows/local/lilac_calling.nf b/subworkflows/local/lilac_calling.nf index be91e6d8..f696e6ed 100644 --- a/subworkflows/local/lilac_calling.nf +++ b/subworkflows/local/lilac_calling.nf @@ -53,7 +53,7 @@ workflow LILAC_CALLING { runnable: (tumor_bam || normal_bam) && !has_existing skip: true - meta + return meta } // Realign reads mapping to HLA regions and homologus regions if using reference genome with ALT contigs diff --git a/subworkflows/local/sage_calling.nf b/subworkflows/local/sage_calling.nf index f3f18a15..ffc5064e 100644 --- a/subworkflows/local/sage_calling.nf +++ b/subworkflows/local/sage_calling.nf @@ -53,7 +53,7 @@ workflow SAGE_CALLING { .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> runnable: tumor_bam skip: true - meta + return meta } // @@ -69,7 +69,7 @@ workflow SAGE_CALLING { runnable: has_tumor_normal && !has_existing skip: true - meta + return meta } // Create process input channel @@ -116,7 +116,7 @@ workflow SAGE_CALLING { runnable: has_tumor && !has_existing skip: true - meta + return meta } // Create process input channel diff --git a/subworkflows/local/virusbreakend_calling.nf b/subworkflows/local/virusbreakend_calling.nf index c13c940b..52cfba4b 100644 --- a/subworkflows/local/virusbreakend_calling.nf +++ b/subworkflows/local/virusbreakend_calling.nf @@ -50,7 +50,7 @@ workflow VIRUSBREAKEND_CALLING { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.VIRUSINTERPRETER_DIR) runnable: tumor_bam && !has_existing skip: true - meta + return meta } // From 8d0648430ffb51a252aff67b54892b975fcf49dc Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 1 Mar 2024 09:12:07 +1100 Subject: [PATCH 67/86] Do not index RNA BAMs prior to merge --- modules/local/sambamba/merge/main.nf | 2 +- subworkflows/local/read_alignment_rna.nf | 35 ++++++------------------ 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/modules/local/sambamba/merge/main.nf b/modules/local/sambamba/merge/main.nf index 06993865..1bbb9646 100644 --- a/modules/local/sambamba/merge/main.nf +++ b/modules/local/sambamba/merge/main.nf @@ -6,7 +6,7 @@ process SAMBAMBA_MERGE { 'quay.io/biocontainers/sambamba:1.0--h98b6b92_0' }" input: - tuple val(meta), path(bams), path(bais) + tuple val(meta), path(bams) output: tuple val(meta), path('*bam'), emit: bam diff --git 
a/subworkflows/local/read_alignment_rna.nf b/subworkflows/local/read_alignment_rna.nf index eefa939d..29c84b02 100644 --- a/subworkflows/local/read_alignment_rna.nf +++ b/subworkflows/local/read_alignment_rna.nf @@ -1,5 +1,4 @@ include { GATK4_MARKDUPLICATES } from '../../modules/nf-core/gatk4/markduplicates/main' -include { SAMBAMBA_INDEX } from '../../modules/local/sambamba/index/main' include { SAMBAMBA_MERGE } from '../../modules/local/sambamba/merge/main' include { SAMTOOLS_SORT } from '../../modules/nf-core/samtools/sort/main' include { STAR } from '../../modules/local/star/main' @@ -98,25 +97,9 @@ workflow READ_ALIGNMENT_RNA { ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions) - // - // MODULE: Sambamba index - // - SAMBAMBA_INDEX( - SAMTOOLS_SORT.out.bam, - ) - - ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) - // // MODULE: Sambamba merge // - // Combine BAMs and BAIs - // channel: [ meta_star, bam, bai ] - ch_bams_flat = WorkflowOncoanalyser.groupByMeta( - SAMTOOLS_SORT.out.bam, - SAMBAMBA_INDEX.out.bai, - ) - // Reunite BAMs // First, count expected BAMs per sample for non-blocking groupTuple op // channel: [ meta_count, group_size ] @@ -129,45 +112,45 @@ workflow READ_ALIGNMENT_RNA { .map { meta_count, meta_stars -> return [meta_count, meta_stars.size()] } // Now, group with expected size then sort into tumor and normal channels - // channel: [ meta_group, [bam, ...], [bai, ...] ] + // channel: [ meta_group, [bam, ...] ] ch_bams_united = ch_sample_fastq_counts .cross( // First element to match meta_count above for `cross` - ch_bams_flat.map { meta_star, bam, bai -> [[key: meta_star.key], bam, bai] } + SAMTOOLS_SORT.out.bam.map { meta_star, bam -> [[key: meta_star.key], bam] } ) .map { count_tuple, bam_tuple -> def group_size = count_tuple[1] - def (meta_bam, bam, bai) = bam_tuple + def (meta_bam, bam) = bam_tuple def meta_group = [ *:meta_bam, ] - return tuple(groupKey(meta_group, group_size), bam, bai) + return tuple(groupKey(meta_group, group_size), bam) } .groupTuple() // Sort into merge-eligible BAMs (at least two BAMs required) - // channel: runnable: [ meta_group, [bam, ...], [bai, ...] ] + // channel: runnable: [ meta_group, [bam, ...] ] // channel: skip: [ meta_group, bam ] ch_bams_united_sorted = ch_bams_united - .branch { meta_group, bams, bais -> + .branch { meta_group, bams -> runnable: bams.size() > 1 skip: true return [meta_group, bams[0]] } // Create process input channel - // channel: [ meta_merge, [bams, ...], [bais, ...] ] + // channel: [ meta_merge, [bams, ...] ] ch_merge_inputs = WorkflowOncoanalyser.restoreMeta(ch_bams_united_sorted.runnable, ch_inputs) - .map { meta, bams, bais -> + .map { meta, bams -> def meta_merge = [ key: meta.group_id, id: meta.group_id, sample_id: Utils.getTumorRnaSampleName(meta), ] - return [meta_merge, bams, bais] + return [meta_merge, bams] } // Run process From c7e87c2dda8a0154b8e6738d685532d5a5723142 Mon Sep 17 00:00:00 2001 From: Matthew Cooper Date: Tue, 5 Mar 2024 20:33:06 +1100 Subject: [PATCH 68/86] Remove obsolete TODOs --- workflows/wgts.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index 8405a5fa..7bf745d2 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -111,9 +111,6 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft // Get absolute file paths samplesheet = Utils.getFileObject(params.input) -// TODO(MC): Drop commit 'WIP: Reverting bioconda containers'. -// TODO(MC): Run full tests for going from .fastq.gz. 
-// TODO(MC): Fix warnings. workflow WGTS { // Create channel for versions // channel: [ versions.yml ] From 1653daf620601893d9d837a245f79fb1afe77b19 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 8 Mar 2024 10:14:36 +1100 Subject: [PATCH 69/86] Fix Isofox singularity container URL --- modules/local/isofox/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/isofox/main.nf b/modules/local/isofox/main.nf index 903e5e45..cd215d70 100644 --- a/modules/local/isofox/main.nf +++ b/modules/local/isofox/main.nf @@ -4,7 +4,7 @@ process ISOFOX { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/hmftools-isofox:1.7.1-hdfd78af_0': + 'https://depot.galaxyproject.org/singularity/hmftools-isofox:1.7.1--hdfd78af_0': 'quay.io/biocontainers/hmftools-isofox:1.7.1--hdfd78af_0' }" input: From f618283ff199707b8053f2818434ce7f9daf7659 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 8 Mar 2024 10:15:01 +1100 Subject: [PATCH 70/86] Bump TSO500 data bundle version * restrict target regions to canonical Ensembl transcripts --- lib/Constants.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Constants.groovy b/lib/Constants.groovy index c7618ac3..37c7ac75 100644 --- a/lib/Constants.groovy +++ b/lib/Constants.groovy @@ -15,8 +15,8 @@ class Constants { static String HMF_DATA_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_38--2.tar.gz' - static String TSO500_PANEL_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_37--0.tar.gz' - static String TSO500_PANEL_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_38--0.tar.gz' + static String TSO500_PANEL_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_37--1.tar.gz' + static String TSO500_PANEL_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_38--1.tar.gz' static String VIRUSBREAKENDDB_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/virusbreakend/virusbreakenddb_20210401.tar.gz' From 2bd379cbc57767539c1db292aecfaccf9400a73a Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Fri, 8 Mar 2024 10:16:02 +1100 Subject: [PATCH 71/86] Remove -force_pathogenic_pass in PAVE somatic --- modules/local/pave/somatic/main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/local/pave/somatic/main.nf b/modules/local/pave/somatic/main.nf index 7ad70200..dfecf48c 100644 --- a/modules/local/pave/somatic/main.nf +++ b/modules/local/pave/somatic/main.nf @@ -48,7 +48,6 @@ process PAVE_SOMATIC { // Targeted mode def pon_artefact_arg = pon_artefacts ? "-pon_artefact_file ${pon_artefacts}" : '' - def pathogenic_pass_force_arg = pon_artefacts ? '-force_pathogenic_pass': '' def sage_blocklist_regions_arg = sage_blocklist_regions ? "-blacklist_bed ${sage_blocklist_regions}" : '' def sage_blocklist_sites_arg = sage_blocklist_sites ? "-blacklist_vcf ${sage_blocklist_sites}" : '' def clinvar_annotations = clinvar_annotations ? 
"-clinvar_vcf ${clinvar_annotations}" : '' @@ -69,7 +68,6 @@ process PAVE_SOMATIC { -ensembl_data_dir ${ensembl_data_resources} \\ ${sage_blocklist_regions_arg} \\ ${sage_blocklist_sites_arg} \\ - ${pathogenic_pass_force_arg} \\ ${gnomad_args} \\ -read_pass_only \\ -threads ${task.cpus} \\ From f76f79b1fe5a52c92ce73e2fa17e674e55b57b7c Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Sat, 16 Mar 2024 17:24:28 +1100 Subject: [PATCH 72/86] Correct prepare reference panel data path lookup --- subworkflows/local/prepare_reference/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/prepare_reference/main.nf b/subworkflows/local/prepare_reference/main.nf index 54616930..b6e509a2 100644 --- a/subworkflows/local/prepare_reference/main.nf +++ b/subworkflows/local/prepare_reference/main.nf @@ -141,7 +141,7 @@ workflow PREPARE_REFERENCE { // NOTE(SW): consider approach to implement custom panel support - panel_data_paths_versions = params.ref_data.panel_data_paths[params.panel] + panel_data_paths_versions = params.panel_data_paths[params.panel] panel_data_paths = panel_data_paths_versions[params.ref_data.genome_version] if (params.ref_data.panel_data_path.endsWith('tar.gz')) { From 9b012d0558ba54170bdb82e5bdd393d3db5d7205 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Sat, 16 Mar 2024 18:10:53 +1100 Subject: [PATCH 73/86] Fix optional channel placeholders --- workflows/wgts.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index bd3c6edf..832988d9 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -178,7 +178,7 @@ workflow WGTS { } else { - ch_process_dna_normal_out = ch_inputs.map + ch_process_dna_tumor_out = ch_inputs.map { meta -> [meta, []] } ch_process_dna_normal_out = ch_inputs.map { meta -> [meta, []] } } @@ -679,7 +679,7 @@ workflow WGTS { } else { - ch_lilac_out = ch_inputes.map { meta -> [meta, []] } + ch_lilac_out = ch_inputs.map { meta -> [meta, []] } } From b90c572ad740e60bbadeccdfc7b006675ba4d322 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Sat, 16 Mar 2024 18:26:02 +1100 Subject: [PATCH 74/86] Update modules.json --- modules.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/modules.json b/modules.json index 13432f4a..44d7fd99 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "90aef30f432332bdf0ce9f4b9004aa5d5c4960bb", "installed_by": ["modules"] }, + "gatk4/markduplicates": { + "branch": "master", + "git_sha": "e726b1730dff525bde4a6839e544dabfea4cd7fd", + "installed_by": ["modules"] + }, "samtools/dict": { "branch": "master", "git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01", @@ -24,6 +29,11 @@ "branch": "master", "git_sha": "bbb99cb8d679555cc01c98766de7869f83283545", "installed_by": ["modules"] + }, + "samtools/sort": { + "branch": "master", + "git_sha": "d5d785b3d8b422cda9c6d84a23f629a8e9ff8cd8", + "installed_by": ["modules"] } } }, From 77b8bce2b86533b9ea2062f372261ed8b0b64c49 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Sat, 16 Mar 2024 18:32:37 +1100 Subject: [PATCH 75/86] Adjust indenting --- modules/local/bwa-mem2/mem/Dockerfile | 16 ++++---- subworkflows/local/read_alignment_dna/main.nf | 38 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/modules/local/bwa-mem2/mem/Dockerfile b/modules/local/bwa-mem2/mem/Dockerfile index 0a0bc7ea..ba36b89b 100644 --- a/modules/local/bwa-mem2/mem/Dockerfile +++ b/modules/local/bwa-mem2/mem/Dockerfile @@ -3,17 +3,17 @@ FROM 
docker.io/mambaorg/micromamba:1.5.6 USER root RUN \ - apt-get update && \ - apt-get install -y procps && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* + apt-get update && \ + apt-get install -y procps && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* USER mambauser RUN \ - micromamba install -y -n base -c bioconda -c conda-forge \ - 'bwa-mem2 ==2.2.1' \ - 'sambamba ==1.0' && \ - micromamba clean --all --yes + micromamba install -y -n base -c bioconda -c conda-forge \ + 'bwa-mem2 ==2.2.1' \ + 'sambamba ==1.0' && \ + micromamba clean --all --yes ENV PATH="/opt/conda/bin:/opt/conda/condabin:${PATH}" diff --git a/subworkflows/local/read_alignment_dna/main.nf b/subworkflows/local/read_alignment_dna/main.nf index 0fb9f21a..e5fd7590 100644 --- a/subworkflows/local/read_alignment_dna/main.nf +++ b/subworkflows/local/read_alignment_dna/main.nf @@ -81,29 +81,29 @@ workflow READ_ALIGNMENT_DNA { // Prepare outputs within conditional block ch_fastqs_ready = FASTP.out.fastq - .flatMap { meta_fastq, reads_fwd, reads_rev -> + .flatMap { meta_fastq, reads_fwd, reads_rev -> - def data = [reads_fwd, reads_rev] - .transpose() - .collect { fwd, rev -> + def data = [reads_fwd, reads_rev] + .transpose() + .collect { fwd, rev -> - def split_fwd = fwd.name.replaceAll('\\..+$', '') - def split_rev = rev.name.replaceAll('\\..+$', '') + def split_fwd = fwd.name.replaceAll('\\..+$', '') + def split_rev = rev.name.replaceAll('\\..+$', '') - assert split_fwd == split_rev + assert split_fwd == split_rev - // NOTE(SW): split allows meta_fastq_ready to be unique, which is required during reunite below - def meta_fastq_ready = [ - *:meta_fastq, - id: "${meta_fastq.id}_${split_fwd}", - split: split_fwd, - ] + // NOTE(SW): split allows meta_fastq_ready to be unique, which is required during reunite below + def meta_fastq_ready = [ + *:meta_fastq, + id: "${meta_fastq.id}_${split_fwd}", + split: split_fwd, + ] - return [meta_fastq_ready, fwd, rev] - } + return [meta_fastq_ready, fwd, rev] + } - return data - } + return data + } } else { @@ -111,8 +111,8 @@ workflow READ_ALIGNMENT_DNA { .map { meta_fastq, fastq_fwd, fastq_rev -> def meta_fastq_ready = [ - *:meta_fastq, - split: null, + *:meta_fastq, + split: null, ] return [meta_fastq_ready, fastq_fwd, fastq_rev] From bd6b304f841147875927467217b7cd1d35962be1 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Tue, 19 Mar 2024 09:48:40 +1100 Subject: [PATCH 76/86] Use standard container directive format for STAR --- modules/local/star/main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf index b58868ca..a211d9a7 100644 --- a/modules/local/star/main.nf +++ b/modules/local/star/main.nf @@ -2,7 +2,9 @@ process STAR { tag "${meta.id}" label 'process_high' - container 'quay.io/biocontainers/star:2.7.3a--0' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/star:2.7.3a--0' : + 'quay.io/biocontainers/star:2.7.3a--0' }" input: tuple val(meta), path(fastq_fwd), path(fastq_rev) From c7f1774bc667b07db006aa6efee1ccc344b49478 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Tue, 19 Mar 2024 09:53:05 +1100 Subject: [PATCH 77/86] Add missing imports and subworkflow descriptions --- subworkflows/local/cuppa_prediction/main.nf | 1 + subworkflows/local/prepare_inputs/main.nf | 1 - subworkflows/local/prepare_reference/main.nf | 2 ++ subworkflows/local/read_alignment_dna/main.nf | 7 +++++++ subworkflows/local/read_alignment_rna/main.nf | 7 +++++++ subworkflows/local/read_processing/main.nf | 7 +++++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/cuppa_prediction/main.nf b/subworkflows/local/cuppa_prediction/main.nf index 40eaae38..2b74a441 100644 --- a/subworkflows/local/cuppa_prediction/main.nf +++ b/subworkflows/local/cuppa_prediction/main.nf @@ -3,6 +3,7 @@ // import Constants +import Utils include { CUPPA } from '../../../modules/local/cuppa/main' diff --git a/subworkflows/local/prepare_inputs/main.nf b/subworkflows/local/prepare_inputs/main.nf index f6e6dfae..18abe8eb 100644 --- a/subworkflows/local/prepare_inputs/main.nf +++ b/subworkflows/local/prepare_inputs/main.nf @@ -8,7 +8,6 @@ // through running workflows/processes with 'setup'. Hence, this subworkflow // isn't used in the main pipeline and is only used for execution of tests. -import Constants import Utils workflow PREPARE_INPUTS { diff --git a/subworkflows/local/prepare_reference/main.nf b/subworkflows/local/prepare_reference/main.nf index f005623d..405c00a9 100644 --- a/subworkflows/local/prepare_reference/main.nf +++ b/subworkflows/local/prepare_reference/main.nf @@ -5,6 +5,8 @@ // NOTE(SW): BWA MEM2 indexes are required and are not created // TODO(SW): consider removing index creation since it's unlikely to be used, replace with documentation +import Constants + include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' include { SAMTOOLS_DICT } from '../../../modules/nf-core/samtools/dict/main' include { BWA_INDEX } from '../../../modules/nf-core/bwa/index/main' diff --git a/subworkflows/local/read_alignment_dna/main.nf b/subworkflows/local/read_alignment_dna/main.nf index e5fd7590..01c16a56 100644 --- a/subworkflows/local/read_alignment_dna/main.nf +++ b/subworkflows/local/read_alignment_dna/main.nf @@ -1,3 +1,10 @@ +// +// Align DNA reads +// + +import Constants +import Utils + include { BWAMEM2_ALIGN } from '../../../modules/local/bwa-mem2/mem/main' include { FASTP } from '../../../modules/local/fastp/main' include { SAMBAMBA_INDEX } from '../../../modules/local/sambamba/index/main' diff --git a/subworkflows/local/read_alignment_rna/main.nf b/subworkflows/local/read_alignment_rna/main.nf index 61c79ec4..ca795e58 100644 --- a/subworkflows/local/read_alignment_rna/main.nf +++ b/subworkflows/local/read_alignment_rna/main.nf @@ -1,3 +1,10 @@ +// +// Align RNA reads +// + +import Constants +import Utils + include { GATK4_MARKDUPLICATES } from '../../../modules/nf-core/gatk4/markduplicates/main' include { SAMBAMBA_MERGE } from '../../../modules/local/sambamba/merge/main' include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' diff --git a/subworkflows/local/read_processing/main.nf b/subworkflows/local/read_processing/main.nf index 4d5597fc..2e67d213 100644 --- a/subworkflows/local/read_processing/main.nf +++ b/subworkflows/local/read_processing/main.nf @@ -1,3 +1,10 
@@ +// +// Apply post-alignment processing +// + +import Constants +import Utils + include { MARKDUPS } from '../../../modules/local/markdups/main' workflow READ_PROCESSING { From 177bf6946444f27aba6f99e6e0094c97f82e47eb Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Tue, 19 Mar 2024 18:52:17 +1100 Subject: [PATCH 78/86] Use Bioconda/BioContainers for bwa-mem2 module --- modules/local/bwa-mem2/mem/Dockerfile | 19 ------------------- modules/local/bwa-mem2/mem/environment.yml | 9 +++++++++ modules/local/bwa-mem2/mem/main.nf | 6 ++++-- 3 files changed, 13 insertions(+), 21 deletions(-) delete mode 100644 modules/local/bwa-mem2/mem/Dockerfile create mode 100644 modules/local/bwa-mem2/mem/environment.yml diff --git a/modules/local/bwa-mem2/mem/Dockerfile b/modules/local/bwa-mem2/mem/Dockerfile deleted file mode 100644 index ba36b89b..00000000 --- a/modules/local/bwa-mem2/mem/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM docker.io/mambaorg/micromamba:1.5.6 - -USER root - -RUN \ - apt-get update && \ - apt-get install -y procps && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -USER mambauser - -RUN \ - micromamba install -y -n base -c bioconda -c conda-forge \ - 'bwa-mem2 ==2.2.1' \ - 'sambamba ==1.0' && \ - micromamba clean --all --yes - -ENV PATH="/opt/conda/bin:/opt/conda/condabin:${PATH}" diff --git a/modules/local/bwa-mem2/mem/environment.yml b/modules/local/bwa-mem2/mem/environment.yml new file mode 100644 index 00000000..571dda57 --- /dev/null +++ b/modules/local/bwa-mem2/mem/environment.yml @@ -0,0 +1,9 @@ +name: bwa-mem2_mem +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bwa-mem2=2.2.1 + - bioconda::samtools=1.19.2 + - bioconda::sambamba=1.0 diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf index 2ae102b6..961791eb 100644 --- a/modules/local/bwa-mem2/mem/main.nf +++ b/modules/local/bwa-mem2/mem/main.nf @@ -2,8 +2,10 @@ process BWAMEM2_ALIGN { tag "${meta.id}" label 'process_high' - // TODO(SW): create BioContainers multi-package image when appropriate - container 'docker.io/scwatts/bwa-mem2:2.2.1' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-4dde50190ae599f2bb2027cb2c8763ea00fb5084:544519c4a0ff7e9616a3b44afde1f143c52f10c3-0' : + 'quay.io/biocontainers/mulled-v2-4dde50190ae599f2bb2027cb2c8763ea00fb5084:544519c4a0ff7e9616a3b44afde1f143c52f10c3-0' }" input: tuple val(meta), path(reads_fwd), path(reads_rev) From 9d61690a51cc8c2a4a476766e9a1eb03382764ef Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Wed, 27 Mar 2024 16:28:42 +1100 Subject: [PATCH 79/86] Improve naming for bwa-mem2 output BAMs --- modules/local/bwa-mem2/mem/main.nf | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf index 961791eb..14c9daf9 100644 --- a/modules/local/bwa-mem2/mem/main.nf +++ b/modules/local/bwa-mem2/mem/main.nf @@ -23,6 +23,7 @@ process BWAMEM2_ALIGN { script: def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}" + def output_fn = meta.split ? 
"${meta.split}.${meta.sample_id}.${meta.read_group}.bam" : "${meta.sample_id}.${meta.read_group}.bam" """ ln -fs \$(find -L ${genome_bwa_index} -type f) ./ @@ -44,7 +45,7 @@ process BWAMEM2_ALIGN { \\ sambamba sort \\ --nthreads ${task.cpus} \\ - --out ${meta.split}.${meta.sample_id}.${meta.read_group}.bam \\ + --out ${output_fn} \\ /dev/stdin cat <<-END_VERSIONS > versions.yml @@ -55,8 +56,10 @@ process BWAMEM2_ALIGN { """ stub: + def output_fn = meta.split ? "${meta.split}.${meta.sample_id}.${meta.read_group}.bam" : "${meta.sample_id}.${meta.read_group}.bam" + """ - touch ${meta.split}.${meta.sample_id}.${meta.read_group}.bam + touch ${output_fn} echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ From 066c8d29b12b0e853f9ff979cfcab1528f4f855f Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Mon, 15 Apr 2024 15:16:11 +1000 Subject: [PATCH 80/86] Use BAM index created during alignment --- modules/local/bwa-mem2/mem/main.nf | 4 +-- modules/local/sambamba/index/main.nf | 35 ------------------- subworkflows/local/read_alignment_dna/main.nf | 20 +---------- 3 files changed, 3 insertions(+), 56 deletions(-) diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf index 14c9daf9..99e60d10 100644 --- a/modules/local/bwa-mem2/mem/main.nf +++ b/modules/local/bwa-mem2/mem/main.nf @@ -15,8 +15,8 @@ process BWAMEM2_ALIGN { path genome_bwa_index_biidx output: - tuple val(meta), path('*.bam'), emit: bam - path 'versions.yml' , emit: versions + tuple val(meta), path('*.bam'), path('*.bai'), emit: bam + path 'versions.yml' , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf index 56bb38af..e69de29b 100644 --- a/modules/local/sambamba/index/main.nf +++ b/modules/local/sambamba/index/main.nf @@ -1,35 +0,0 @@ -process SAMBAMBA_INDEX { - tag "${meta.id}" - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/sambamba:1.0--h98b6b92_0' : - 'quay.io/biocontainers/sambamba:1.0--h98b6b92_0' }" - - input: - tuple val(meta), path(bam) - - output: - tuple val(meta), path('*bai'), emit: bai - path 'versions.yml' , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - sambamba index \\ - --nthreads ${task.cpus} \\ - ${bam} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }') - END_VERSIONS - """ - - stub: - """ - touch ${bam}.bai - echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml - """ -} diff --git a/subworkflows/local/read_alignment_dna/main.nf b/subworkflows/local/read_alignment_dna/main.nf index 01c16a56..fb0aaab4 100644 --- a/subworkflows/local/read_alignment_dna/main.nf +++ b/subworkflows/local/read_alignment_dna/main.nf @@ -7,7 +7,6 @@ import Utils include { BWAMEM2_ALIGN } from '../../../modules/local/bwa-mem2/mem/main' include { FASTP } from '../../../modules/local/fastp/main' -include { SAMBAMBA_INDEX } from '../../../modules/local/sambamba/index/main' workflow READ_ALIGNMENT_DNA { take: @@ -159,23 +158,6 @@ workflow READ_ALIGNMENT_DNA { ch_versions = ch_versions.mix(BWAMEM2_ALIGN.out.versions) - // - // MODULE: Sambamba index - // - SAMBAMBA_INDEX( - BWAMEM2_ALIGN.out.bam, - ) - - ch_versions = ch_versions.mix(SAMBAMBA_INDEX.out.versions) - - // Combine BAMs and BAIs - // channel: [ meta_bwa, sample_type, bam, bai ] - ch_bams_flat = WorkflowOncoanalyser.groupByMeta( - BWAMEM2_ALIGN.out.bam.map { meta_bwa, bam -> [meta_bwa, meta_bwa.sample_type] }, - BWAMEM2_ALIGN.out.bam, - SAMBAMBA_INDEX.out.bai, - ) - // Reunite BAMs // First, count expected BAMs per sample for non-blocking groupTuple op // channel: [ meta_count, group_size ] @@ -197,7 +179,7 @@ workflow READ_ALIGNMENT_DNA { ch_bams_united = ch_sample_fastq_counts .cross( // First element to match meta_count above for `cross` - ch_bams_flat.map { meta_bwa, sample_type, bam, bai -> [[key: meta_bwa.key, sample_type: sample_type], bam, bai] } + BWAMEM2_ALIGN.out.bam.map { meta_bwa, bam, bai -> [[key: meta_bwa.key, sample_type: meta_bwa.sample_type], bam, bai] } ) .map { count_tuple, bam_tuple -> From 0abc35f5a74f8d1b5ca7d46ce57741d64724a448 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Mon, 22 Apr 2024 11:09:23 +1000 Subject: [PATCH 81/86] Improve BAM index selection --- subworkflows/local/amber_profiling/main.nf | 4 ++-- subworkflows/local/bamtools_metrics/main.nf | 4 ++-- subworkflows/local/cobalt_profiling/main.nf | 4 ++-- subworkflows/local/flagstat_metrics/main.nf | 4 ++-- subworkflows/local/gridss_svprep_calling/main.nf | 4 ++-- subworkflows/local/lilac_calling/main.nf | 4 ++-- subworkflows/local/sage_calling/main.nf | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/amber_profiling/main.nf b/subworkflows/local/amber_profiling/main.nf index fe81787e..a4cb59f0 100644 --- a/subworkflows/local/amber_profiling/main.nf +++ b/subworkflows/local/amber_profiling/main.nf @@ -35,9 +35,9 @@ workflow AMBER_PROFILING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, 
Constants.INPUT.BAI_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/bamtools_metrics/main.nf b/subworkflows/local/bamtools_metrics/main.nf index 2e3db8ec..e71ca2bb 100644 --- a/subworkflows/local/bamtools_metrics/main.nf +++ b/subworkflows/local/bamtools_metrics/main.nf @@ -31,7 +31,7 @@ workflow BAMTOOLS_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), ] } .branch { meta, bam, bai -> @@ -48,7 +48,7 @@ workflow BAMTOOLS_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_NORMAL), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, bam, bai -> diff --git a/subworkflows/local/cobalt_profiling/main.nf b/subworkflows/local/cobalt_profiling/main.nf index 6fb82561..13102dee 100644 --- a/subworkflows/local/cobalt_profiling/main.nf +++ b/subworkflows/local/cobalt_profiling/main.nf @@ -36,9 +36,9 @@ workflow COBALT_PROFILING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/flagstat_metrics/main.nf b/subworkflows/local/flagstat_metrics/main.nf index 668088dd..9f0a5f78 100644 --- a/subworkflows/local/flagstat_metrics/main.nf +++ b/subworkflows/local/flagstat_metrics/main.nf @@ -27,7 +27,7 @@ workflow FLAGSTAT_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), ] } .branch { meta, bam, bai -> @@ -44,7 +44,7 @@ workflow FLAGSTAT_METRICS { return [ meta, Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(bai, meta, Constants.INPUT.BAI_DNA_NORMAL), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, bam, bai -> diff --git a/subworkflows/local/gridss_svprep_calling/main.nf b/subworkflows/local/gridss_svprep_calling/main.nf index 23521059..48333057 100644 --- a/subworkflows/local/gridss_svprep_calling/main.nf +++ b/subworkflows/local/gridss_svprep_calling/main.nf @@ -52,9 +52,9 @@ workflow GRIDSS_SVPREP_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { 
meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/lilac_calling/main.nf b/subworkflows/local/lilac_calling/main.nf index e40cd692..3110cc97 100644 --- a/subworkflows/local/lilac_calling/main.nf +++ b/subworkflows/local/lilac_calling/main.nf @@ -42,9 +42,9 @@ workflow LILAC_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> diff --git a/subworkflows/local/sage_calling/main.nf b/subworkflows/local/sage_calling/main.nf index 156c631b..33d815d0 100644 --- a/subworkflows/local/sage_calling/main.nf +++ b/subworkflows/local/sage_calling/main.nf @@ -45,9 +45,9 @@ workflow SAGE_CALLING { return [ meta, Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), - Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), - Utils.selectCurrentOrExisting(normal_bai, meta, Constants.INPUT.BAI_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> From eb95b4f44abce9aeeb8b102e3f4253c55c5f2c3a Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Mon, 22 Apr 2024 11:12:55 +1000 Subject: [PATCH 82/86] Include BAI in bwa-mem2/align stub --- modules/local/bwa-mem2/mem/main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf index 99e60d10..603f989d 100644 --- a/modules/local/bwa-mem2/mem/main.nf +++ b/modules/local/bwa-mem2/mem/main.nf @@ -60,6 +60,7 @@ process BWAMEM2_ALIGN { """ touch ${output_fn} + touch ${output_fn}.bai echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ From 4039507120d461fdb45884285be8e4fc66e35ad3 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Mon, 22 Apr 2024 11:35:49 +1000 Subject: [PATCH 83/86] Adjust input selection logic --- lib/Utils.groovy | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/Utils.groovy b/lib/Utils.groovy index b785aa3a..7604a344 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -483,18 +483,20 @@ class Utils { // Misc public static getInput(meta, key) { - def result + def result = [] def (key_filetype, key_filetypes, key_sequencetypes) = key for (key_sample in [key_filetypes, key_sequencetypes].combinations()) { if (meta.containsKey(key_sample) && meta[key_sample].containsKey(key_filetype)) { - return meta[key_sample].getAt(key_filetype) + result = meta[key_sample].get(key_filetype) + break } } + return result } public static hasExistingInput(meta, key) { - return getInput(meta, key) !== null + return getInput(meta, key) != [] } public static selectCurrentOrExisting(val, meta, key) { From e8f6c99cbd9d8d982cddf2a627db63fac48d2ed3 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Tue, 23 Apr 2024 11:26:00 +1000 Subject: [PATCH 84/86] Bump MarkDups to 1.1.5 --- modules/local/markdups/main.nf | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index e9f17f5d..d4d0f443 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -3,8 +3,8 @@ process MARKDUPS { label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1.2--hdfd78af_0' : - 'quay.io/biocontainers/hmftools-mark-dups:1.1.2--hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1.5--hdfd78af_0' : + 'quay.io/biocontainers/hmftools-mark-dups:1.1.5--hdfd78af_0' }" input: tuple val(meta), path(bams), path(bais) From ed8e1d172a4906291216a27fecd41e42d38921d2 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Tue, 23 Apr 2024 15:43:15 +1000 Subject: [PATCH 85/86] Remove Sambamba index module file --- modules/local/sambamba/index/main.nf | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 modules/local/sambamba/index/main.nf diff --git a/modules/local/sambamba/index/main.nf b/modules/local/sambamba/index/main.nf deleted file mode 100644 index e69de29b..00000000 From e0da20af67a2e9453410c6662becd6bb14f6ac72 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Tue, 23 Apr 2024 15:45:12 +1000 Subject: [PATCH 86/86] Add new meta.yml files --- modules/local/bwa-mem2/mem/meta.yml | 59 +++++++++++++++++++++++++ modules/local/fastp/meta.yml | 47 ++++++++++++++++++++ modules/local/markdups/meta.yml | 62 +++++++++++++++++++++++++++ modules/local/sambamba/merge/meta.yml | 38 ++++++++++++++++ modules/local/star/main.nf | 4 +- modules/local/star/meta.yml | 46 ++++++++++++++++++++ 6 files changed, 254 insertions(+), 2 deletions(-) create mode 100644 modules/local/bwa-mem2/mem/meta.yml create mode 100644 modules/local/fastp/meta.yml create mode 100644 modules/local/markdups/meta.yml create mode 100644 modules/local/sambamba/merge/meta.yml create mode 100644 modules/local/star/meta.yml diff --git a/modules/local/bwa-mem2/mem/meta.yml b/modules/local/bwa-mem2/mem/meta.yml new file mode 100644 index 00000000..f8fb56b3 --- /dev/null +++ b/modules/local/bwa-mem2/mem/meta.yml @@ -0,0 +1,59 @@ +name: bwa-mem2_mem +description: The mem alignment algorithm of bwa-mem2 +keywords: + - bwa + - mem + - read alignment + - bwa-mem2 +tools: + - bwa-mem2: + description: Burrows-Wheeler Aligner for short-read alignment + homepage: https://github.com/bwa-mem2/bwa-mem2 + documentation: https://github.com/bwa-mem2/bwa-mem2 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - reads_fwd: + type: file + description: Forward reads FASTQ file + pattern: "*.{fastq.gz}" + - reads_rev: + type: file + description: Reverse reads FASTQ file + pattern: "*.{fastq.gz}" + - genome_fasta: + type: file + description: Reference genome assembly FASTA file + pattern: "*.{fa,fasta}" + - genome_bwa_index: + type: directory + description: bwa-mem1 index directory + - genome_bwa_index_bseq: + type: file + description: bwa-mem2 bseq index file + pattern: "*.{0123}" + - genome_bwa_index_biidx: + type: file + description: bwa-mem2 biidx index file + pattern: "*.{bwt.2bit.64}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - bam: + type: list + description: BAM and BAI files + pattern: "*.{bam,bam.bai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@scwatts" + - "@mkcmkc" diff --git a/modules/local/fastp/meta.yml b/modules/local/fastp/meta.yml new file mode 100644 index 00000000..23aa5fb6 --- /dev/null +++ b/modules/local/fastp/meta.yml @@ -0,0 +1,47 @@ +name: fastp +description: An ultra-fast all-in-one FASTQ preprocessor +keywords: + - fastp + - fastq + - processing + - quality control +tools: + - fastp: + description: An ultra-fast all-in-one FASTQ preprocessor + homepage: https://github.com/OpenGene/fastp + documentation: https://github.com/OpenGene/fastp + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - reads_fwd: + type: file + description: Forward reads FASTQ file + pattern: "*.{fastq.gz}" + - reads_rev: + type: file + description: Reverse reads FASTQ file + pattern: "*.{fastq.gz}" + - max_fastq_records: + type: integer + description: Maximum number of reads per file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - fastq: + type: list + description: Forward and reverse FASTQ files + pattern: "*.{fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@scwatts" + - "@mkcmkc" diff --git a/modules/local/markdups/meta.yml b/modules/local/markdups/meta.yml new file mode 100644 index 00000000..a9297b68 --- /dev/null +++ b/modules/local/markdups/meta.yml @@ -0,0 +1,62 @@ +name: markdups +description: Identify and mark duplicate reads from alignment data +keywords: + - duplicate marking + - markdups +tools: + - MarkDups: + description: Identify and mark duplicate reads from alignment data + homepage: https://github.com/hartwigmedical/hmftools/tree/master/mark-dups + documentation: https://github.com/hartwigmedical/hmftools/tree/master/mark-dups + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - bams: + type: list + description: List of BAM files + - bais: + type: list + description: List of BAI files + - genome_fasta: + type: file + description: Reference genome assembly FASTA file + pattern: "*.{fa,fasta}" + - genome_ver: + type: string + description: Reference genome version + - genome_fai: + type: file + description: Reference genome assembly fai file + pattern: "*.{fai}" + - genome_dict: + type: file + description: Reference genome assembly dict file + pattern: "*.{dict}" + - unmap_regions: + type: file + description: Unmapped regions file + pattern: "*.{tsv}" + - has_umis: + type: boolean + description: Flag indicating presence of UMIs in reads +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - bam: + type: list + description: BAM and BAI files + pattern: "*.{bam,bam.bai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@scwatts" + - "@mkcmkc" diff --git a/modules/local/sambamba/merge/meta.yml b/modules/local/sambamba/merge/meta.yml new file mode 100644 index 00000000..c4424055 --- /dev/null +++ b/modules/local/sambamba/merge/meta.yml @@ -0,0 +1,38 @@ +name: sambamba_merge +description: Merge several BAM files into one +keywords: + - sambamba + - bam + - merge +tools: + - sambamba: + description: Tools for working with SAM/BAM data + homepage: https://github.com/biod/sambamba + documentation: https://lomereiter.github.io/sambamba/index.html + licence: ["GPL v2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - bams: + type: list + description: List of BAM files +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@scwatts" + - "@mkcmkc" diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf index a211d9a7..7aa4503f 100644 --- a/modules/local/star/main.nf +++ b/modules/local/star/main.nf @@ -7,7 +7,7 @@ process STAR { 'quay.io/biocontainers/star:2.7.3a--0' }" input: - tuple val(meta), path(fastq_fwd), path(fastq_rev) + tuple val(meta), path(reads_fwd), path(reads_rev) path genome_star_index output: @@ -20,7 +20,7 @@ process STAR { script: """ STAR \\ - --readFilesIn ${fastq_fwd} ${fastq_rev} \\ + --readFilesIn ${reads_fwd} ${reads_rev} \\ --genomeDir ${genome_star_index} \\ --runThreadN ${task.cpus} \\ --readFilesCommand zcat \\ diff --git a/modules/local/star/meta.yml b/modules/local/star/meta.yml new file mode 100644 index 00000000..0bbc3329 --- /dev/null +++ b/modules/local/star/meta.yml @@ -0,0 +1,46 @@ +name: star +description: An ultrafast universal RNA-seq aligner +keywords: + - rna-seq + - rna + - aligner + - star +tools: + - star: + description: An ultrafast universal RNA-seq aligner + homepage: https://github.com/alexdobin/STAR + documentation: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - reads_fwd: + type: file + description: Forward reads FASTQ file + pattern: "*.{fastq.gz}" + - reads_rev: + type: file + description: Reverse reads FASTQ file + pattern: "*.{fastq.gz}" + - genome_star_index: + type: directory + description: STAR index directory +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name'] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@scwatts"
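
A note on the adjusted input selection logic (patch 83): Utils.getInput now returns an empty list rather than null when no existing input is registered for a key, so hasExistingInput reduces to an equality check against [] and call sites can rely on Groovy truth. The standalone Groovy sketch below mirrors the diff; the key tuple layout ([filetype, filetype candidates, sequencetype candidates]) and the shape of meta are assumptions inferred from the diff, not the pipeline's actual Constants.INPUT definitions.

// Sketch of the adjusted lookup, assuming meta is keyed by
// [filetype, sequencetype] list pairs as the diff suggests.
class UtilsSketch {

    // Return the registered existing input for key, or [] when absent.
    static getInput(Map meta, List key) {
        def result = []
        def (key_filetype, key_filetypes, key_sequencetypes) = key
        // Try each filetype/sequencetype combination; first hit wins.
        for (key_sample in [key_filetypes, key_sequencetypes].combinations()) {
            if (meta.containsKey(key_sample) && meta[key_sample].containsKey(key_filetype)) {
                result = meta[key_sample].get(key_filetype)
                break
            }
        }
        return result
    }

    // With the empty-list default, existence is an equality check rather
    // than the previous identity comparison against null.
    static boolean hasExistingInput(Map meta, List key) {
        return getInput(meta, key) != []
    }
}

// Hypothetical key and meta, for illustration only.
def key = ['bai', ['bam'], ['dna']]
def meta = [id: 'sample_id', (['bam', 'dna']): [bai: 'existing.bam.bai']]
assert UtilsSketch.getInput(meta, key) == 'existing.bam.bai'
assert UtilsSketch.getInput([id: 'sample_id'], key) == []
assert !UtilsSketch.hasExistingInput([id: 'sample_id'], key)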
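
The call sites updated in patch 81 (the hunks at the top of this section) pair with that behaviour: for BAI files, Utils.selectCurrentOrExisting is replaced by Groovy's Elvis operator, so an index emitted by the current run takes precedence and only an absent value falls through to the registered existing input. Because Groovy truth treats both null and an empty list as false, the [] default returned by getInput composes cleanly here. A small illustration, reusing the hypothetical UtilsSketch class and key layout from the sketch above:

// Prefer the BAI produced by the current run; otherwise fall back to a
// pre-registered existing input. Both null and [] fail Groovy truth, so
// either falls through the Elvis operator.
def bai_key = ['bai', ['bam'], ['dna']]
def bai_meta = [(['bam', 'dna']): [bai: 'existing.bam.bai']]

def no_current_bai = null
assert (no_current_bai ?: UtilsSketch.getInput(bai_meta, bai_key)) == 'existing.bam.bai'

def current_bai = 'current.bam.bai'
assert (current_bai ?: UtilsSketch.getInput(bai_meta, bai_key)) == 'current.bam.bai'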
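
The new meta.yml files document each local module's channel structure, and their input/output entries typically mirror the order of the process definition. As a hedged sketch only, the markdups/meta.yml entries map onto a Nextflow process signature along these lines: the grouped BAM/BAI tuple is confirmed by the patch 84 hunk above, while the remaining channel order and the output tuple shape are inferred from the meta.yml rather than taken from the module source.

// Hypothetical process signature implied by markdups/meta.yml; see
// modules/local/markdups/main.nf for the actual module.
process MARKDUPS {
    input:
    tuple val(meta), path(bams), path(bais)
    path genome_fasta
    val genome_ver
    path genome_fai
    path genome_dict
    path unmap_regions
    val has_umis

    output:
    // meta.yml: "BAM and BAI files", pattern "*.{bam,bam.bai}"
    tuple val(meta), path('*.bam'), path('*.bam.bai'), emit: bam
    path 'versions.yml', emit: versions

    script:
    """
    echo 'placeholder only'
    """
}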