diff --git a/conf/hmf_data.config b/conf/hmf_data.config
index fc6cf635..6fdcba0b 100644
--- a/conf/hmf_data.config
+++ b/conf/hmf_data.config
@@ -50,6 +50,7 @@ params {
             known_fusions       = 'dna_pipeline/sv/known_fusions.37.bedpe'
             purple_germline_del = 'dna_pipeline/copy_number/cohort_germline_del_freq.37.csv'
             segment_mappability = 'dna_pipeline/variants/mappability_150.37.bed.gz'
+            unmap_regions       = 'dna_pipeline/common/unmap_regions.37.tsv'
         }
         '38' {
             // AMBER
@@ -101,6 +102,7 @@ params {
             known_fusions       = 'dna_pipeline/sv/known_fusions.38.bedpe'
             purple_germline_del = 'dna_pipeline/copy_number/cohort_germline_del_freq.38.csv'
             segment_mappability = 'dna_pipeline/variants/mappability_150.38.bed.gz'
+            unmap_regions       = 'dna_pipeline/common/unmap_regions.38.tsv'
         }
     }
 }
diff --git a/conf/hmf_genomes.config b/conf/hmf_genomes.config
index 147d4274..e067a3a0 100644
--- a/conf/hmf_genomes.config
+++ b/conf/hmf_genomes.config
@@ -14,16 +14,22 @@ params {
             fai             = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.fai"
             dict            = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/samtools_index/1.16/Homo_sapiens.GRCh37.GATK.illumina.fasta.dict"
             bwa_index       = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/0.7.17-r1188.tar.gz"
+            bwa_index_bseq  = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/Homo_sapiens.GRCh37.GATK.illumina.fasta.0123"
+            bwa_index_biidx = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index/2.2.1/Homo_sapiens.GRCh37.GATK.illumina.fasta.bwt.2bit.64"
             bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/bwa_index_image/0.7.17-r1188/Homo_sapiens.GRCh37.GATK.illumina.fasta.img"
             gridss_index    = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/gridss_index/2.13.2/Homo_sapiens.GRCh37.GATK.illumina.fasta.gridsscache"
+            star_index      = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh37_hmf/star_index/gencode_19/2.7.3a.tar.gz"
         }
         'GRCh38_hmf' {
             fasta           = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
             fai             = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai"
             dict            = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/samtools_index/1.16/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.dict"
             bwa_index       = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/0.7.17-r1188.tar.gz"
+            bwa_index_bseq  = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.0123"
+            bwa_index_biidx = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index/2.2.1/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bwt.2bit.64"
             bwa_index_image = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/bwa_index_image/0.7.17-r1188/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.img"
             gridss_index    = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/gridss_index/2.13.2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gridsscache"
+            star_index      = "https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/genomes/GRCh38_hmf/star_index/gencode_38/2.7.3a.tar.gz"
         }
     }
 }
diff --git a/conf/modules.config b/conf/modules.config
index 320b0a7d..04c2afab 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -12,6 +12,22 @@ process {
 
+    withName: 'GATK4_MARKDUPLICATES' {
+        publishDir = [
+            path: { "${params.outdir}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.key}/alignments/rna/${filename}" },
+        ]
+    }
+
+    withName: 'MARKDUPS' {
+        publishDir = [
+            path: { "${params.outdir}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.key}/alignments/dna/${filename}" },
+        ]
+    }
+
     withName: 'AMBER' {
         publishDir = [
             path: { "${params.outdir}" },
diff --git a/lib/Constants.groovy b/lib/Constants.groovy
index 8f2eb1fd..37c7ac75 100644
--- a/lib/Constants.groovy
+++ b/lib/Constants.groovy
@@ -11,12 +11,12 @@ class Constants {
 
     static List PANELS_DEFINED = ['tso500']
 
-    static String HMF_DATA_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_37--0.tar.gz'
-    static String HMF_DATA_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_38--0.tar.gz'
+    static String HMF_DATA_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_37--2.tar.gz'
+    static String HMF_DATA_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/hmftools/5.34_38--2.tar.gz'
 
-    static String TSO500_PANEL_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_37--0.tar.gz'
-    static String TSO500_PANEL_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_38--0.tar.gz'
+    static String TSO500_PANEL_37_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_37--1.tar.gz'
+    static String TSO500_PANEL_38_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/hmf_reference_data/panels/tso500_5.34_38--1.tar.gz'
 
     static String VIRUSBREAKENDDB_PATH = 'https://pub-29f2e5b2b7384811bdbbcba44f8b5083.r2.dev/virusbreakend/virusbreakenddb_20210401.tar.gz'
 
@@ -34,6 +34,7 @@ class Constants {
     }
 
     static enum Process {
+        ALIGNMENT,
         AMBER,
         BAMTOOLS,
         CHORD,
@@ -45,6 +46,7 @@ class Constants {
         ISOFOX,
         LILAC,
         LINX,
+        MARKDUPS,
         ORANGE,
         PAVE,
         PURPLE,
@@ -56,7 +58,9 @@ class Constants {
     static enum FileType {
         // Generic
         BAM,
+        BAM_MARKDUPS,
         BAI,
+        FASTQ,
         // Process
         AMBER_DIR,
         BAMTOOLS,
@@ -97,11 +101,65 @@ class Constants {
         DNA_RNA,
     }
 
+    static enum InfoField {
+        CANCER_TYPE,
+        LANE,
+        LIBRARY_ID,
+    }
+
     static Map PLACEHOLDER_META = [meta_placeholder: null]
     static List PLACEHOLDER_OPTIONAL_CHANNEL = []
 
     static Map INPUT = [
+        BAM_DNA_TUMOR: [
+            FileType.BAM,
+            SampleType.TUMOR,
+            SequenceType.DNA,
+        ],
+
+        BAM_MARKDUPS_DNA_TUMOR: [
+            FileType.BAM_MARKDUPS,
+            SampleType.TUMOR,
+            SequenceType.DNA,
+        ],
+
+        BAM_DNA_NORMAL: [
+            FileType.BAM,
+            SampleType.NORMAL,
+            SequenceType.DNA,
+        ],
+
+        BAM_MARKDUPS_DNA_NORMAL: [
+            FileType.BAM_MARKDUPS,
+            SampleType.NORMAL,
+            SequenceType.DNA,
+        ],
+
+        BAM_RNA_TUMOR: [
+            FileType.BAM,
+            SampleType.TUMOR,
+            SequenceType.RNA,
+        ],
+
+        BAI_DNA_TUMOR: [
+            FileType.BAI,
+            SampleType.TUMOR,
+            SequenceType.DNA,
+        ],
+
+        BAI_DNA_NORMAL: [
+            FileType.BAI,
+            SampleType.NORMAL,
+            SequenceType.DNA,
+        ],
+
+        BAI_RNA_TUMOR: [
+            FileType.BAI,
+            SampleType.TUMOR,
+            SequenceType.RNA,
+        ],
+
         ISOFOX_DIR: [
             FileType.ISOFOX_DIR,
             SampleType.TUMOR,
diff --git a/lib/Utils.groovy b/lib/Utils.groovy
index 087888b8..7604a344 100644
--- a/lib/Utils.groovy
+++ b/lib/Utils.groovy
@@ -11,7 +11,7 @@ class Utils {
 
     public static parseInput(input_fp_str, stub_run, log) {
 
-        // NOTE(SW): using Nextflow .splitCsv channel operator, hence sould be easily interchangable
+        // NOTE(SW): using NF .splitCsv channel operator, hence should be easily interchangeable with NF syntax
 
         def input_fp = Utils.getFileObject(input_fp_str)
         def inputs = nextflow.splitter.SplitterEx.splitCsv(input_fp, [header: true])
@@ -31,7 +31,6 @@ class Utils {
                 meta.subject_id = it.subject_id
             }
 
-
             // Sample type
             def sample_type_enum = Utils.getEnumFromString(it.sample_type, Constants.SampleType)
             if (!sample_type_enum) {
@@ -64,12 +63,75 @@ class Utils {
                 System.exit(1)
             }
 
-            if (meta_sample.containsKey(filetype_enum)) {
+            if (meta_sample.containsKey(filetype_enum) && filetype_enum != Constants.FileType.FASTQ) {
                 log.error "got duplicate file for ${group_id} ${sample_type_enum}/${sequence_type_enum}: ${filetype_enum}"
                 System.exit(1)
             }
 
-            meta_sample[filetype_enum] = Utils.getFileObject(it.filepath)
+            // Info data
+            def info_data = [:]
+            if (it.containsKey('info')) {
+                // Parse
+                it.info
+                    .tokenize(';')
+                    .each { e ->
+                        def (k, v) = e.tokenize(':')
+                        def info_field_enum = Utils.getEnumFromString(k, Constants.InfoField)
+
+                        if (!info_field_enum) {
+                            def info_field_str = Utils.getEnumNames(Constants.InfoField).join('\n - ')
+                            log.error "received invalid info field: '${k}'. Valid options are:\n - ${info_field_str}"
+                            System.exit(1)
+                        }
+
+                        if (info_data.containsKey(info_field_enum)) {
+                            log.error "got duplicate info field for ${group_id} ${sample_type_enum}/${sequence_type_enum}: ${info_field_enum}"
+                            System.exit(1)
+                        }
+
+                        info_data[info_field_enum] = v
+                    }
+
+                // Process
+                if (info_data.containsKey(Constants.InfoField.CANCER_TYPE)) {
+                    meta[Constants.InfoField.CANCER_TYPE] = info_data[Constants.InfoField.CANCER_TYPE]
+                }
+
+            }
+
+
+            // Handle inputs appropriately
+            if (filetype_enum === Constants.FileType.FASTQ) {
+
+                if (!info_data.containsKey(Constants.InfoField.LIBRARY_ID)) {
+                    log.error "missing 'library_id' info field for ${group_id} ${sample_type_enum}/${sequence_type_enum}"
+                    System.exit(1)
+                }
+
+                if (!info_data.containsKey(Constants.InfoField.LANE)) {
+                    log.error "missing 'lane' info field for ${group_id} ${sample_type_enum}/${sequence_type_enum}"
+                    System.exit(1)
+                }
+
+                def (fwd, rev) = it.filepath.tokenize(';')
+                def fastq_key = [info_data[Constants.InfoField.LIBRARY_ID], info_data[Constants.InfoField.LANE]]
+
+                if (meta_sample.containsKey(fastq_key)) {
+                    log.error "got duplicate lane + library_id data for ${group_id} ${sample_type_enum}/${sequence_type_enum}: ${fastq_key}"
+                    System.exit(1)
+                }
+
+                if (!meta_sample.containsKey(filetype_enum)) {
+                    meta_sample[filetype_enum] = [:]
+                }
+
+                meta_sample[filetype_enum][fastq_key] = ['fwd': fwd, 'rev': rev]
+
+            } else {
+
+                meta_sample[filetype_enum] = Utils.getFileObject(it.filepath)
+
+            }
 
             // Record sample key to simplify iteration later on
             sample_keys << sample_key
@@ -88,6 +150,9 @@ class Utils {
             if (key === Constants.FileType.BAM) {
                 index_enum = Constants.FileType.BAI
                 index_str = 'bai'
+            } else if (key === Constants.FileType.BAM_MARKDUPS) {
+                index_enum = Constants.FileType.BAI
+                index_str = 'bai'
             } else if (key === Constants.FileType.GRIDSS_VCF) {
                 index_enum = Constants.FileType.GRIDSS_VCF_TBI
                 index_str = 'tbi'
@@ -136,6 +201,8 @@ class Utils {
             params.ref_data.genome_dict,
             params.ref_data.genome_bwa_index,
             params.ref_data.genome_bwa_index_image,
+            params.ref_data.genome_bwa_index_bseq,
+            params.ref_data.genome_bwa_index_biidx,
             params.ref_data.genome_gridss_index,
             params.ref_data.virusbreakenddb_path,
         ]
@@ -177,7 +244,7 @@ class Utils {
 
         inputs.each { meta ->
 
-            // Require BAMs for each defined sample type
+            // Require a BAM, BAM_MARKDUPS, or FASTQ input for each defined sample type
             // NOTE(SW): repeating key pairs above to avoid having to duplicate error messages
             sample_keys.each { key ->
 
@@ -187,9 +254,12 @@ class Utils {
 
                 def (sample_type, sequence_type) = key
 
-                if (!meta[key].containsKey(Constants.FileType.BAM)) {
-                    log.error "no BAM provided for ${meta.group_id} ${sample_type}/${sequence_type}\n\n" +
-                        "NB: BAMs are always required as they are the basis to determine input sample type."
+                if (!meta[key].containsKey(Constants.FileType.BAM) &&
+                    !meta[key].containsKey(Constants.FileType.BAM_MARKDUPS) &&
+                    !meta[key].containsKey(Constants.FileType.FASTQ)) {
+
+                    log.error "no BAM, BAM_MARKDUPS, or FASTQ input provided for ${meta.group_id} ${sample_type}/${sequence_type}\n\n" +
+                        "NB: one of these inputs is always required as they are the basis to determine input sample type."
                     System.exit(1)
                 }
 
@@ -199,14 +269,14 @@ class Utils {
         if (run_config.mode === Constants.RunMode.TARGETED) {
 
             // Do not allow normal DNA
-            if (Utils.hasNormalDnaBam(meta)) {
+            if (Utils.hasNormalDna(meta)) {
                 log.error "targeted mode is not compatible with the normal DNA BAM provided for ${meta.group_id}\n\n" +
                     "The targeted workflow supports only tumor DNA BAMs (and tumor RNA BAMs for TSO500)"
                 System.exit(1)
             }
 
             // Do not allow only tumor RNA
-            if (Utils.hasTumorRnaBam(meta) && !Utils.hasTumorDnaBam(meta)) {
+            if (Utils.hasTumorRnaBam(meta) && !Utils.hasTumorDna(meta)) {
                 log.error "targeted mode is not compatible with only tumor RNA provided for ${meta.group_id}\n\n" +
                     "The targeted workflow requires tumor DNA and can optionally take tumor RNA, depending on " +
                     "the configured panel."
@@ -223,7 +293,7 @@ class Utils {
             }
 
             // Do not allow normal DNA only
-            if (Utils.hasNormalDnaBam(meta) && !Utils.hasTumorDnaBam(meta)) {
+            if (Utils.hasNormalDna(meta) && !Utils.hasTumorDna(meta)) {
                 log.error "germline only mode not supported, found only a normal DNA BAM for ${meta.group_id}\n"
                 System.exit(1)
             }
@@ -266,95 +336,167 @@ class Utils {
         return path ? nextflow.Nextflow.file(path) : []
     }
 
+    static public getRunMode(run_mode, log) {
+        def run_mode_enum = Utils.getEnumFromString(run_mode, Constants.RunMode)
+        if (!run_mode_enum) {
+            def run_modes_str = Utils.getEnumNames(Constants.RunMode).join('\n - ')
+            log.error "received an invalid run mode: '${run_mode}'. Valid options are:\n - ${run_modes_str}"
+            System.exit(1)
+        }
+        return run_mode_enum
+    }
+
+
+    // Sample records
+    static public getTumorDnaSample(meta) {
+        return meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:])
+    }
+
+    static public getTumorRnaSample(meta) {
+        return meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.RNA], [:])
+    }
+
+    static public getNormalDnaSample(meta) {
+        return meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:])
+    }
+
 
     // Sample names
     static public getTumorDnaSampleName(meta) {
-        def meta_sample = meta[Constants.SampleType.TUMOR, Constants.SequenceType.DNA]
-        return meta_sample['sample_id']
+        return getTumorDnaSample(meta)['sample_id']
     }
 
     static public getTumorRnaSampleName(meta) {
-        def meta_sample = meta[Constants.SampleType.TUMOR, Constants.SequenceType.RNA]
-        return meta_sample['sample_id']
+        return getTumorRnaSample(meta)['sample_id']
    }
 
     static public getNormalDnaSampleName(meta) {
-        def meta_sample = meta[Constants.SampleType.NORMAL, Constants.SequenceType.DNA]
-        return meta_sample['sample_id']
+        return getNormalDnaSample(meta)['sample_id']
     }
 
 
     // Files
+    static public getTumorDnaFastq(meta) {
+        return getTumorDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null)
+    }
+
     static public getTumorDnaBam(meta) {
-        def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:])
-        return meta_sample.getOrDefault(Constants.FileType.BAM, null)
+        return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM, null)
+    }
+
+    static public getTumorDnaMarkdupsBam(meta) {
+        return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null)
     }
 
     static public getTumorDnaBai(meta) {
-        def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.DNA], [:])
-        return meta_sample.getOrDefault(Constants.FileType.BAI, null)
+        return getTumorDnaSample(meta).getOrDefault(Constants.FileType.BAI, null)
+    }
+
+
+    static public hasTumorDnaFastq(meta) {
+        return getTumorDnaFastq(meta) !== null
     }
 
     static public hasTumorDnaBam(meta) {
         return getTumorDnaBam(meta) !== null
     }
 
-    static public getTumorRnaBam(meta) {
-        def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.RNA], [:])
-        return meta_sample.getOrDefault(Constants.FileType.BAM, null)
+    static public hasTumorDnaMarkdupsBam(meta) {
+        return getTumorDnaMarkdupsBam(meta) !== null
     }
 
-    static public getTumorRnaBai(meta) {
-        def meta_sample = meta.getOrDefault([Constants.SampleType.TUMOR, Constants.SequenceType.RNA], [:])
-        return meta_sample.getOrDefault(Constants.FileType.BAI, null)
-    }
 
-    static public hasTumorRnaBam(meta) {
-        return getTumorRnaBam(meta) !== null
+    static public getNormalDnaFastq(meta) {
+        return getNormalDnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null)
     }
 
     static public getNormalDnaBam(meta) {
-        def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:])
-        return meta_sample.getOrDefault(Constants.FileType.BAM, null)
+        return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM, null)
     }
 
+    static public getNormalDnaMarkdupsBam(meta) {
+        return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAM_MARKDUPS, null)
+    }
 
     static public getNormalDnaBai(meta) {
-        def meta_sample = meta.getOrDefault([Constants.SampleType.NORMAL, Constants.SequenceType.DNA], [:])
-        return meta_sample.getOrDefault(Constants.FileType.BAI, null)
+        return getNormalDnaSample(meta).getOrDefault(Constants.FileType.BAI, null)
+    }
+
+
+    static public hasNormalDnaFastq(meta) {
+        return getNormalDnaFastq(meta) !== null
     }
 
     static public hasNormalDnaBam(meta) {
         return getNormalDnaBam(meta) !== null
     }
 
+    static public hasNormalDnaMarkdupsBam(meta) {
+        return getNormalDnaMarkdupsBam(meta) !== null
+    }
 
-    static public getRunMode(run_mode, log) {
-        def run_mode_enum = Utils.getEnumFromString(run_mode, Constants.RunMode)
-        if (!run_mode_enum) {
-            def run_modes_str = Utils.getEnumNames(Constants.RunMode).join('\n - ')
-            log.error "recieved an invalid run mode: '${run_mode}'. Valid options are:\n - ${run_modes_str}"
-            System.exit(1)
-        }
-        return run_mode_enum
+
+    static public hasDnaFastq(meta) {
+        return hasNormalDnaFastq(meta) || hasTumorDnaFastq(meta)
     }
 
+    static public hasDnaMarkdupsBam(meta) {
+        return hasNormalDnaMarkdupsBam(meta) || hasTumorDnaMarkdupsBam(meta)
+    }
+
+
+    static public getTumorRnaFastq(meta) {
+        return getTumorRnaSample(meta).getOrDefault(Constants.FileType.FASTQ, null)
+    }
+
+    static public getTumorRnaBam(meta) {
+        return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAM, null)
+    }
+
+    static public getTumorRnaBai(meta) {
+        return getTumorRnaSample(meta).getOrDefault(Constants.FileType.BAI, null)
+    }
+
+
+    static public hasTumorRnaFastq(meta) {
+        return getTumorRnaFastq(meta) !== null
+    }
+
+    static public hasTumorRnaBam(meta) {
+        return getTumorRnaBam(meta) !== null
+    }
+
+
+    // Status
+    static public hasTumorDna(meta) {
+        return hasTumorDnaBam(meta) || hasTumorDnaMarkdupsBam(meta) || hasTumorDnaFastq(meta)
+    }
+
+    static public hasNormalDna(meta) {
+        return hasNormalDnaBam(meta) || hasNormalDnaMarkdupsBam(meta) || hasNormalDnaFastq(meta)
+    }
+
+    static public hasTumorRna(meta) {
+        return hasTumorRnaBam(meta) || hasTumorRnaFastq(meta)
+    }
+
 
     // Misc
     public static getInput(meta, key) {
 
-        def result
+        def result = []
         def (key_filetype, key_filetypes, key_sequencetypes) = key
 
         for (key_sample in [key_filetypes, key_sequencetypes].combinations()) {
             if (meta.containsKey(key_sample) && meta[key_sample].containsKey(key_filetype)) {
-                return meta[key_sample].getAt(key_filetype)
+                result = meta[key_sample].get(key_filetype)
+                break
             }
         }
+
+        return result
     }
 
     public static hasExistingInput(meta, key) {
-        return getInput(meta, key) !== null
+        return getInput(meta, key) != []
     }
 
     public static selectCurrentOrExisting(val, meta, key) {
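Note on the new FASTQ samplesheet handling above: a FASTQ row is expected to carry `library_id` and `lane` as `;`-separated `key:value` pairs in an `info` column, and the forward/reverse reads as a `;`-separated pair in `filepath`. A hypothetical row illustrating the parsed format (the exact column set is an assumption, as the full samplesheet schema is not shown in this diff):

    group_id,subject_id,sample_id,sample_type,sequence_type,filetype,info,filepath
    PATIENT1,PATIENT1,PATIENT1_T,tumor,dna,fastq,library_id:LIB01;lane:001,/data/T_R1.fastq.gz;/data/T_R2.fastq.gz

Each such row is stored under the composite key `[library_id, lane]` as `['fwd': ..., 'rev': ...]`, so multiple lanes of the same sample coexist under `FileType.FASTQ` without tripping the duplicate-file check.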
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index 4d866709..6e9cb85c 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -1,6 +1,8 @@
 //
 // This file holds several functions specific to the main.nf workflow in the nf-core/oncoanalyser pipeline
 //
+import Utils
+
 class WorkflowMain {
 
@@ -215,8 +217,9 @@ class WorkflowMain {
             mode: run_mode,
             panel: run_mode === Constants.RunMode.TARGETED ? params.panel : null,
             stages: stages,
-            has_dna: inputs.any { it.containsKey([Constants.SampleType.TUMOR, Constants.SequenceType.DNA]) },
-            has_rna: inputs.any { it.containsKey([Constants.SampleType.TUMOR, Constants.SequenceType.RNA]) },
+            has_dna: inputs.any { Utils.hasTumorDna(it) },
+            has_rna: inputs.any { Utils.hasTumorRna(it) },
+            has_rna_fastq: inputs.any { Utils.hasTumorRnaFastq(it) },
         ]
     }
 }
diff --git a/main.nf b/main.nf
index 98e306f1..e01b75fd 100644
--- a/main.nf
+++ b/main.nf
@@ -42,7 +42,10 @@ params.ref_data.genome_fai = getGenomeAttribute('fai')
 params.ref_data.genome_dict = getGenomeAttribute('dict')
 params.ref_data.genome_bwa_index = getGenomeAttribute('bwa_index')
 params.ref_data.genome_bwa_index_image = getGenomeAttribute('bwa_index_image')
+params.ref_data.genome_bwa_index_bseq = getGenomeAttribute('bwa_index_bseq')
+params.ref_data.genome_bwa_index_biidx = getGenomeAttribute('bwa_index_biidx')
 params.ref_data.genome_gridss_index = getGenomeAttribute('gridss_index')
+params.ref_data.genome_star_index = getGenomeAttribute('star_index')
 
 WorkflowMain.setParamsDefaults(params, log)
 WorkflowMain.validateParams(params, log)
diff --git a/modules.json b/modules.json
index 13432f4a..44d7fd99 100644
--- a/modules.json
+++ b/modules.json
@@ -10,6 +10,11 @@
                 "git_sha": "90aef30f432332bdf0ce9f4b9004aa5d5c4960bb",
                 "installed_by": ["modules"]
             },
+            "gatk4/markduplicates": {
+                "branch": "master",
+                "git_sha": "e726b1730dff525bde4a6839e544dabfea4cd7fd",
+                "installed_by": ["modules"]
+            },
             "samtools/dict": {
                 "branch": "master",
                 "git_sha": "cf5b9c30a2adacc581793afb79fae5f5b50bed01",
@@ -24,6 +29,11 @@
                 "branch": "master",
                 "git_sha": "bbb99cb8d679555cc01c98766de7869f83283545",
                 "installed_by": ["modules"]
+            },
+            "samtools/sort": {
+                "branch": "master",
+                "git_sha": "d5d785b3d8b422cda9c6d84a23f629a8e9ff8cd8",
+                "installed_by": ["modules"]
             }
         }
     },
diff --git a/modules/local/bwa-mem2/mem/environment.yml b/modules/local/bwa-mem2/mem/environment.yml
new file mode 100644
index 00000000..571dda57
--- /dev/null
+++ b/modules/local/bwa-mem2/mem/environment.yml
@@ -0,0 +1,9 @@
+name: bwa-mem2_mem
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::bwa-mem2=2.2.1
+  - bioconda::samtools=1.19.2
+  - bioconda::sambamba=1.0
diff --git a/modules/local/bwa-mem2/mem/main.nf b/modules/local/bwa-mem2/mem/main.nf
new file mode 100644
index 00000000..603f989d
--- /dev/null
+++ b/modules/local/bwa-mem2/mem/main.nf
@@ -0,0 +1,67 @@
+process BWAMEM2_ALIGN {
+    tag "${meta.id}"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-4dde50190ae599f2bb2027cb2c8763ea00fb5084:544519c4a0ff7e9616a3b44afde1f143c52f10c3-0' :
+        'quay.io/biocontainers/mulled-v2-4dde50190ae599f2bb2027cb2c8763ea00fb5084:544519c4a0ff7e9616a3b44afde1f143c52f10c3-0' }"
+
+    input:
+    tuple val(meta), path(reads_fwd), path(reads_rev)
+    path genome_fasta
+    path genome_bwa_index
+    path genome_bwa_index_bseq
+    path genome_bwa_index_biidx
+
+    output:
+    tuple val(meta), path('*.bam'), path('*.bai'), emit: bam
+    path 'versions.yml'                          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def read_group_tag = "@RG\\tID:${meta.read_group}\\tSM:${meta.sample_id}"
+    def output_fn = meta.split ? "${meta.split}.${meta.sample_id}.${meta.read_group}.bam" : "${meta.sample_id}.${meta.read_group}.bam"
+
+    """
+    ln -fs \$(find -L ${genome_bwa_index} -type f) ./
+
+    bwa-mem2 mem \\
+        -Y \\
+        -R '${read_group_tag}' \\
+        -t ${task.cpus} \\
+        ${genome_fasta} \\
+        ${reads_fwd} \\
+        ${reads_rev} | \\
+    \\
+    sambamba view \\
+        --sam-input \\
+        --format bam \\
+        --compression-level 0 \\
+        --nthreads ${task.cpus} \\
+        /dev/stdin | \\
+    \\
+    sambamba sort \\
+        --nthreads ${task.cpus} \\
+        --out ${output_fn} \\
+        /dev/stdin
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bwa-mem2: \$(bwa-mem2 version 2>/dev/null)
+        sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }')
+    END_VERSIONS
+    """
+
+    stub:
+    def output_fn = meta.split ? "${meta.split}.${meta.sample_id}.${meta.read_group}.bam" : "${meta.sample_id}.${meta.read_group}.bam"
+
+    """
+    touch ${output_fn}
+    touch ${output_fn}.bai
+
+    echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml
+    """
+}
diff --git a/modules/local/bwa-mem2/mem/meta.yml b/modules/local/bwa-mem2/mem/meta.yml
new file mode 100644
index 00000000..f8fb56b3
--- /dev/null
+++ b/modules/local/bwa-mem2/mem/meta.yml
@@ -0,0 +1,59 @@
+name: bwa-mem2_mem
+description: The mem alignment algorithm of bwa-mem2
+keywords:
+  - bwa
+  - mem
+  - read alignment
+  - bwa-mem2
+tools:
+  - bwa-mem2:
+      description: Burrows-Wheeler Aligner for short-read alignment
+      homepage: https://github.com/bwa-mem2/bwa-mem2
+      documentation: https://github.com/bwa-mem2/bwa-mem2
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - reads_fwd:
+      type: file
+      description: Forward reads FASTQ file
+      pattern: "*.{fastq.gz}"
+  - reads_rev:
+      type: file
+      description: Reverse reads FASTQ file
+      pattern: "*.{fastq.gz}"
+  - genome_fasta:
+      type: file
+      description: Reference genome assembly FASTA file
+      pattern: "*.{fa,fasta}"
+  - genome_bwa_index:
+      type: directory
+      description: bwa-mem1 index directory
+  - genome_bwa_index_bseq:
+      type: file
+      description: bwa-mem2 bseq index file
+      pattern: "*.{0123}"
+  - genome_bwa_index_biidx:
+      type: file
+      description: bwa-mem2 biidx index file
+      pattern: "*.{bwt.2bit.64}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - bam:
+      type: list
+      description: BAM and BAI file
+      pattern: "*.{bam,bam.bai}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@scwatts"
+  - "@mkcmkc"
diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf
new file mode 100644
index 00000000..8bd47159
--- /dev/null
+++ b/modules/local/fastp/main.nf
@@ -0,0 +1,49 @@
+process FASTP {
+    tag "${meta.id}"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/fastp:0.23.4--hadf994f_2' :
+        'quay.io/biocontainers/fastp:0.23.4--hadf994f_2' }"
+
+    input:
+    tuple val(meta), path(reads_fwd), path(reads_rev)
+    val(max_fastq_records)
+
+    output:
+    tuple val(meta), path('*_R1.fastp.fastq.gz'), path('*_R2.fastp.fastq.gz'), emit: fastq
+    path 'versions.yml'                                                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    # * do not apply trimming/clipping, already done in BCL convert
+    # * turn off all filtering
+    # * do not process umis, already done for us
+
+    fastp \\
+        --in1 ${reads_fwd} \\
+        --in2 ${reads_rev} \\
+        --disable_quality_filtering \\
+        --disable_length_filtering \\
+        --disable_adapter_trimming \\
+        --disable_trim_poly_g \\
+        --split_by_lines ${4 * max_fastq_records} \\
+        --out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\
+        --out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        fastp: \$(fastp --version 2>&1 | sed 's/^.* //')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch 00{1..4}.${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz
+    touch 00{1..4}.${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz
+
+    echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml
+    """
+}
diff --git a/modules/local/fastp/meta.yml b/modules/local/fastp/meta.yml
new file mode 100644
index 00000000..23aa5fb6
--- /dev/null
+++ b/modules/local/fastp/meta.yml
@@ -0,0 +1,47 @@
+name: fastp
+description: An ultra-fast all-in-one FASTQ preprocessor
+keywords:
+  - fastp
+  - fastq
+  - processing
+  - quality control
+tools:
+  - fastp:
+      description: An ultra-fast all-in-one FASTQ preprocessor
+      homepage: https://github.com/OpenGene/fastp
+      documentation: https://github.com/OpenGene/fastp
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - reads_fwd:
+      type: file
+      description: Forward reads FASTQ file
+      pattern: "*.{fastq.gz}"
+  - reads_rev:
+      type: file
+      description: Reverse reads FASTQ file
+      pattern: "*.{fastq.gz}"
+  - max_fastq_records:
+      type: integer
+      description: Maximum number of reads per file
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - fastq:
+      type: list
+      description: Forward and reverse FASTQ files
+      pattern: "*.{fastq.gz}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@scwatts"
+  - "@mkcmkc"
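A brief note on the arithmetic in the fastp call above: a FASTQ record occupies four lines, so capping each output file at max_fastq_records records translates to `--split_by_lines 4 * max_fastq_records`. With the default added to nextflow.config further below:

    max_fastq_records  = 10000000                     // default in nextflow.config
    --split_by_lines   = 4 * 10000000 = 40000000      // i.e. at most 10 million read pairs per split file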
diff --git a/modules/local/isofox/main.nf b/modules/local/isofox/main.nf
index 903e5e45..cd215d70 100644
--- a/modules/local/isofox/main.nf
+++ b/modules/local/isofox/main.nf
@@ -4,7 +4,7 @@ process ISOFOX {
 
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/hmftools-isofox:1.7.1-hdfd78af_0':
+        'https://depot.galaxyproject.org/singularity/hmftools-isofox:1.7.1--hdfd78af_0':
         'quay.io/biocontainers/hmftools-isofox:1.7.1--hdfd78af_0' }"
 
     input:
diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf
new file mode 100644
index 00000000..d4d0f443
--- /dev/null
+++ b/modules/local/markdups/main.nf
@@ -0,0 +1,73 @@
+process MARKDUPS {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/hmftools-mark-dups:1.1.5--hdfd78af_0' :
+        'quay.io/biocontainers/hmftools-mark-dups:1.1.5--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(bams), path(bais)
+    path genome_fasta
+    val genome_ver
+    path genome_fai
+    path genome_dict
+    path unmap_regions
+    val has_umis
+
+    output:
+    tuple val(meta), path('*bam'), path('*bai'), emit: bam
+    path 'versions.yml'                        , emit: versions
+    path '*.tsv'
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def umi_flags = has_umis ? '-umi_enabled -umi_duplex -umi_duplex_delim +' : ''
+
+    """
+    markdups \\
+        -Xmx${Math.round(task.memory.bytes * 0.95)} \\
+        \\
+        -samtools \$(which samtools) \\
+        -sambamba \$(which sambamba) \\
+        \\
+        -sample ${meta.sample_id} \\
+        -input_bam ${bams.join(',')} \\
+        \\
+        -form_consensus \\
+        ${umi_flags} \\
+        \\
+        -unmap_regions ${unmap_regions} \\
+        -ref_genome ${genome_fasta} \\
+        -ref_genome_version ${genome_ver} \\
+        \\
+        -write_stats \\
+        -threads ${task.cpus} \\
+        \\
+        -output_bam ${meta.sample_id}.markdups.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        markdups: \$(markdups -version | awk '{ print \$NF }')
+        sambamba: \$(sambamba --version 2>&1 | egrep '^sambamba' | head -n 1 | awk '{ print \$NF }')
+        samtools: \$(samtools --version 2>&1 | egrep '^samtools\\s' | head -n 1 | sed 's/^.* //')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${meta.sample_id}.markdups.bam
+    touch ${meta.sample_id}.markdups.bam.bai
+    touch ${meta.sample_id}.duplicate_freq.tsv
+
+    if [[ -n "${has_umis}" ]]; then
+        touch ${meta.sample_id}.umi_coord_freq.tsv
+        touch ${meta.sample_id}.umi_edit_distance.tsv
+        touch ${meta.sample_id}.umi_nucleotide_freq.tsv
+    fi;
+
+    echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml
+    """
+}
diff --git a/modules/local/markdups/meta.yml b/modules/local/markdups/meta.yml
new file mode 100644
index 00000000..a9297b68
--- /dev/null
+++ b/modules/local/markdups/meta.yml
@@ -0,0 +1,62 @@
+name: markdups
+description: Identify and mark duplicate reads from alignment data
+keywords:
+  - duplicate marking
+  - markdups
+tools:
+  - MarkDups:
+      description: Identify and mark duplicate reads from alignment data
+      homepage: https://github.com/hartwigmedical/hmftools/tree/master/mark-dups
+      documentation: https://github.com/hartwigmedical/hmftools/tree/master/mark-dups
+      licence: ["GPL v3"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - bams:
+      type: list
+      description: List of BAM files
+  - bais:
+      type: list
+      description: List of BAI files
+  - genome_fasta:
+      type: file
+      description: Reference genome assembly FASTA file
+      pattern: "*.{fa,fasta}"
+  - genome_ver:
+      type: string
+      description: Reference genome version
+  - genome_fai:
+      type: file
+      description: Reference genome assembly fai file
+      pattern: "*.{fai}"
+  - genome_dict:
+      type: file
+      description: Reference genome assembly dict file
+      pattern: "*.{dict}"
+  - unmap_regions:
+      type: file
+      description: Unmapped regions file
+      pattern: "*.{tsv}"
+  - has_umis:
+      type: boolean
+      description: Flag indicating presence of UMIs in reads
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - bam:
+      type: list
+      description: BAM and BAI file
+      pattern: "*.{bam,bam.bai}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@scwatts"
+  - "@mkcmkc"
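As context for the `tuple val(meta), path(bams), path(bais)` input above: MARKDUPS consumes all per-lane/per-split BAMs for a sample in a single invocation (`-input_bam ${bams.join(',')}`), so the upstream channel has to group aligner outputs by sample first. A minimal sketch of that grouping, assuming the `[ meta, bam, bai ]` shape emitted by BWAMEM2_ALIGN and `meta.sample_id` as the grouping key (both assumptions; the real wiring is not in this diff):

    // Group per-lane alignments by sample, then mark duplicates once per sample
    ch_markdups_inputs = BWAMEM2_ALIGN.out.bam
        .map { meta, bam, bai -> [meta.sample_id, meta, bam, bai] }
        .groupTuple()
        .map { sample_id, metas, bams, bais -> [metas.first(), bams, bais] }

    MARKDUPS(
        ch_markdups_inputs,
        genome_fasta,
        genome_version,
        genome_fai,
        genome_dict,
        unmap_regions,
        false,  // has_umis; assumed off here for WGS input
    )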
diff --git a/modules/local/pave/somatic/main.nf b/modules/local/pave/somatic/main.nf
index 7ad70200..dfecf48c 100644
--- a/modules/local/pave/somatic/main.nf
+++ b/modules/local/pave/somatic/main.nf
@@ -48,7 +48,6 @@ process PAVE_SOMATIC {
 
     // Targeted mode
     def pon_artefact_arg = pon_artefacts ? "-pon_artefact_file ${pon_artefacts}" : ''
-    def pathogenic_pass_force_arg = pon_artefacts ? '-force_pathogenic_pass': ''
     def sage_blocklist_regions_arg = sage_blocklist_regions ? "-blacklist_bed ${sage_blocklist_regions}" : ''
     def sage_blocklist_sites_arg = sage_blocklist_sites ? "-blacklist_vcf ${sage_blocklist_sites}" : ''
     def clinvar_annotations = clinvar_annotations ? "-clinvar_vcf ${clinvar_annotations}" : ''
@@ -69,7 +68,6 @@ process PAVE_SOMATIC {
         -ensembl_data_dir ${ensembl_data_resources} \\
         ${sage_blocklist_regions_arg} \\
         ${sage_blocklist_sites_arg} \\
-        ${pathogenic_pass_force_arg} \\
         ${gnomad_args} \\
         -read_pass_only \\
         -threads ${task.cpus} \\
diff --git a/modules/local/sambamba/merge/main.nf b/modules/local/sambamba/merge/main.nf
new file mode 100644
index 00000000..1bbb9646
--- /dev/null
+++ b/modules/local/sambamba/merge/main.nf
@@ -0,0 +1,33 @@
+process SAMBAMBA_MERGE {
+    tag "${meta.id}"
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/sambamba:1.0--h98b6b92_0' :
+        'quay.io/biocontainers/sambamba:1.0--h98b6b92_0' }"
+
+    input:
+    tuple val(meta), path(bams)
+
+    output:
+    tuple val(meta), path('*bam'), emit: bam
+    path 'versions.yml'          , emit: versions
+
+    script:
+    """
+    sambamba merge \\
+        --nthreads ${task.cpus} \\
+        ${meta.sample_id}.bam \\
+        ${bams}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sambamba: \$(sambamba --version 2>&1 | grep -m1 sambamba | sed 's/^sambamba //')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch ${meta.sample_id}.bam
+    echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml
+    """
+}
diff --git a/modules/local/sambamba/merge/meta.yml b/modules/local/sambamba/merge/meta.yml
new file mode 100644
index 00000000..c4424055
--- /dev/null
+++ b/modules/local/sambamba/merge/meta.yml
@@ -0,0 +1,38 @@
+name: sambamba_merge
+description: Merge several BAM files into one
+keywords:
+  - sambamba
+  - bam
+  - merge
+tools:
+  - sambamba:
+      description: Tools for working with SAM/BAM data
+      homepage: https://github.com/biod/sambamba
+      documentation: https://lomereiter.github.io/sambamba/index.html
+      licence: ["GPL v2"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - bams:
+      type: list
+      description: List of BAM files
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - bam:
+      type: file
+      description: BAM file
+      pattern: "*.{bam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@scwatts"
+  - "@mkcmkc"
diff --git a/modules/local/star/main.nf b/modules/local/star/main.nf
new file mode 100644
index 00000000..7aa4503f
--- /dev/null
+++ b/modules/local/star/main.nf
@@ -0,0 +1,62 @@
+process STAR {
+    tag "${meta.id}"
+    label 'process_high'
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/star:2.7.3a--0' :
+        'quay.io/biocontainers/star:2.7.3a--0' }"
+
+    input:
+    tuple val(meta), path(reads_fwd), path(reads_rev)
+    path genome_star_index
+
+    output:
+    tuple val(meta), path('*bam'), emit: bam
+    path 'versions.yml'          , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    STAR \\
+        --readFilesIn ${reads_fwd} ${reads_rev} \\
+        --genomeDir ${genome_star_index} \\
+        --runThreadN ${task.cpus} \\
+        --readFilesCommand zcat \\
+        --alignSJstitchMismatchNmax 5 -1 5 5 \\
+        --alignSplicedMateMapLmin 35 \\
+        --alignSplicedMateMapLminOverLmate 0.33 \\
+        --chimJunctionOverhangMin 10 \\
+        --chimOutType WithinBAM SoftClip \\
+        --chimScoreDropMax 30 \\
+        --chimScoreJunctionNonGTAG 0 \\
+        --chimScoreMin 1 \\
+        --chimScoreSeparation 1 \\
+        --chimSegmentMin 10 \\
+        --chimSegmentReadGapMax 3 \\
+        --limitOutSJcollapsed 3000000 \\
+        --outBAMcompression 0 \\
+        --outFilterMatchNmin 35 \\
+        --outFilterMatchNminOverLread 0.33 \\
+        --outFilterMismatchNmax 3 \\
+        --outFilterMultimapNmax 10 \\
+        --outFilterScoreMinOverLread 0.33 \\
+        --outSAMattributes All \\
+        --outSAMattrRGline ID:${meta.read_group} SM:${meta.sample_id} \\
+        --outSAMtype BAM Unsorted \\
+        --outSAMunmapped Within \\
+        --runRNGseed 0
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        star: \$(STAR --version | sed -e "s/STAR_//g")
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    touch Aligned.out.bam
+    echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml
+    """
+}
diff --git a/modules/local/star/meta.yml b/modules/local/star/meta.yml
new file mode 100644
index 00000000..0bbc3329
--- /dev/null
+++ b/modules/local/star/meta.yml
@@ -0,0 +1,46 @@
+name: star
+description: An ultrafast universal RNA-seq aligner
+keywords:
+  - rna-seq
+  - rna
+  - aligner
+  - star
+tools:
+  - star:
+      description: An ultrafast universal RNA-seq aligner
+      homepage: https://github.com/alexdobin/STAR
+      documentation: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - reads_fwd:
+      type: file
+      description: Forward reads FASTQ file
+      pattern: "*.{fastq.gz}"
+  - reads_rev:
+      type: file
+      description: Reverse reads FASTQ file
+      pattern: "*.{fastq.gz}"
+  - genome_star_index:
+      type: directory
+      description: STAR index directory
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [id: 'sample_id', tumor_id: 'tumor_name', normal_id: 'normal_name']
+  - bam:
+      type: file
+      description: BAM file
+      pattern: "*.{bam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@scwatts"
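Putting the RNA-side modules together: STAR emits an unsorted BAM per FASTQ unit (`--outSAMtype BAM Unsorted`), which then needs coordinate sorting, per-sample merging, and duplicate marking; the GATK4_MARKDUPLICATES publishDir entry in conf/modules.config routes that final output to alignments/rna. A rough sketch of the chain under those assumptions — the actual RNA alignment subworkflow is not included in this diff:

    STAR(ch_rna_fastqs, genome_star_index)

    SAMTOOLS_SORT(STAR.out.bam)

    // Collect all sorted lane-level BAMs for a sample into one merge call
    ch_merge_inputs = SAMTOOLS_SORT.out.bam
        .map { meta, bam -> [meta.sample_id, meta, bam] }
        .groupTuple()
        .map { sample_id, metas, bams -> [metas.first(), bams] }

    SAMBAMBA_MERGE(ch_merge_inputs)

    // Reference FASTA/fai are optional for MarkDuplicates, so pass empty lists here
    GATK4_MARKDUPLICATES(SAMBAMBA_MERGE.out.bam, [], [])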
diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf
new file mode 100644
index 00000000..356cac0f
--- /dev/null
+++ b/modules/nf-core/gatk4/markduplicates/main.nf
@@ -0,0 +1,65 @@
+process GATK4_MARKDUPLICATES {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::gatk4=4.4.0.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'quay.io/biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(bam)
+    path fasta
+    path fasta_fai
+
+    output:
+    tuple val(meta), path("*cram"),     emit: cram,  optional: true
+    tuple val(meta), path("*bam"),      emit: bam,   optional: true
+    tuple val(meta), path("*.crai"),    emit: crai,  optional: true
+    tuple val(meta), path("*.bai"),     emit: bai,   optional: true
+    tuple val(meta), path("*.metrics"), emit: metrics
+    path "versions.yml",                emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.sample_id}"
+    def input_list = bam.collect{"--INPUT $it"}.join(' ')
+    def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : ""
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M" MarkDuplicates \\
+        $input_list \\
+        --OUTPUT ${prefix}.md.bam \\
+        --METRICS_FILE ${prefix}.md.metrics \\
+        --TMP_DIR . \\
+        --CREATE_INDEX \\
+        ${reference} \\
+        $args
+
+    mv ${prefix}.md.bai ${prefix}.md.bam.bai
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.sample_id}"
+
+    """
+    touch ${prefix}.md.bam
+    touch ${prefix}.md.bam.bai
+    touch ${prefix}.md.metrics
+    echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml
+    """
+}
diff --git a/modules/nf-core/gatk4/markduplicates/meta.yml b/modules/nf-core/gatk4/markduplicates/meta.yml
new file mode 100644
index 00000000..ddf98d2f
--- /dev/null
+++ b/modules/nf-core/gatk4/markduplicates/meta.yml
@@ -0,0 +1,72 @@
+name: gatk4_markduplicates
+description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
+keywords:
+  - markduplicates
+  - bam
+  - sort
+tools:
+  - gatk4:
+      description:
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard-
+      tool_dev_url: https://github.com/broadinstitute/gatk
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM file
+      pattern: "*.{bam}"
+  - fasta:
+      type: file
+      description: Fasta file
+      pattern: "*.{fasta}"
+  - fasta_fai:
+      type: file
+      description: Fasta index file
+      pattern: "*.{fai}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - bam:
+      type: file
+      description: Marked duplicates BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: Marked duplicates CRAM file
+      pattern: "*.{cram}"
+  - bai:
+      type: file
+      description: BAM index file
+      pattern: "*.{bam.bai}"
+  - crai:
+      type: file
+      description: CRAM index file
+      pattern: "*.{cram.crai}"
+  - metrics:
+      type: file
+      description: Duplicate metrics file generated by GATK
+      pattern: "*.{metrics.txt}"
+
+authors:
+  - "@ajodeh-juma"
+  - "@FriederikeHanssen"
+  - "@maxulysse"
diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf
new file mode 100644
index 00000000..8aaf9a5b
--- /dev/null
+++ b/modules/nf-core/samtools/sort/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_SORT {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' :
+        'quay.io/biocontainers/samtools:1.18--h50ea8bc_1' }"
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.csi"), emit: csi, optional: true
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.prefix}"
+    if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools sort \\
+        $args \\
+        -@ $task.cpus \\
+        -o ${prefix}.bam \\
+        -T $prefix \\
+        $bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.prefix}"
+    """
+    touch ${prefix}.bam
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml
new file mode 100644
index 00000000..2200de72
--- /dev/null
+++ b/modules/nf-core/samtools/sort/meta.yml
@@ -0,0 +1,51 @@
+name: samtools_sort
+description: Sort SAM/BAM/CRAM file
+keywords:
+  - sort
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csi:
+      type: file
+      description: BAM index file (optional)
+      pattern: "*.csi"
+authors:
+  - "@drpatelh"
+  - "@ewels"
maintainers:
+  - "@drpatelh"
+  - "@ewels"
diff --git a/nextflow.config b/nextflow.config
index 4c9c3afd..2076f74d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -36,6 +36,8 @@ params {
     virusbreakenddb_path = null
     hla_slice_bed        = null
 
+    max_fastq_records = 10000000
+
     isofox_counts    = null
     isofox_gc_ratios = null
     isofox_gene_ids  = null
@@ -56,9 +58,9 @@ params {
     version = false
 
     // Other workflow inputs and options
-    create_stub_placeholders = false
-    linx_gene_id_file = null
-    isofox_functions = 'TRANSCRIPT_COUNTS;ALT_SPLICE_JUNCTIONS;FUSIONS;RETAINED_INTRONS'
+    create_stub_placeholders = false
+    linx_gene_id_file        = null
+    isofox_functions         = 'TRANSCRIPT_COUNTS;ALT_SPLICE_JUNCTIONS;FUSIONS;RETAINED_INTRONS'
 
     // Config options
     config_profile_name = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 2f05ef50..fcd6c864 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -81,6 +81,11 @@
             "default": false,
             "fa_icon": "fas fa-diagram-project"
         },
+        "max_fastq_records": {
+            "type": "integer",
+            "description": "When positive, fastp is used to split input FASTQs so that each resulting file has no more than max_fastq_records records; when non-positive, fastp is skipped and the provided FASTQs are passed as-is to the aligner.",
+            "fa_icon": "fas fa-cog"
+        },
         "gridss_config": {
             "type": "string",
            "description": "Path to GRIDSS configuration file.",
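The subworkflow changes below lean on two Utils helpers: getInput (shown above, now returning [] when nothing matches) and selectCurrentOrExisting, whose body is elided in this diff. From the call sites, the intended semantics are "prefer the value produced in this run, otherwise fall back to a samplesheet-declared input". A plausible one-liner, offered as a sketch rather than the actual implementation:

    public static selectCurrentOrExisting(val, meta, key) {
        // Use the freshly generated value when present; otherwise fall back to an
        // existing input declared in the samplesheet ([] when neither is available)
        return val ?: getInput(meta, key)
    }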
meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_amber = [ key: meta.group_id, @@ -42,18 +58,8 @@ workflow AMBER_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasNormalDnaBam(meta)) { - + if (normal_bam) { meta_amber.normal_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } [meta_amber, tumor_bam, normal_bam, tumor_bai, normal_bai] diff --git a/subworkflows/local/bamtools_metrics/main.nf b/subworkflows/local/bamtools_metrics/main.nf index 87b81a8a..e71ca2bb 100644 --- a/subworkflows/local/bamtools_metrics/main.nf +++ b/subworkflows/local/bamtools_metrics/main.nf @@ -11,6 +11,8 @@ workflow BAMTOOLS_METRICS { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -21,78 +23,55 @@ workflow BAMTOOLS_METRICS { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) - - runnable: has_tumor_dna || has_normal_dna + // Sort inputs, separate by tumor and normal + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_tumor_sorted = ch_tumor_bam + .map { meta, bam, bai -> + return [ + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), + ] + } + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAMTOOLS_TUMOR) + runnable: bam && !has_existing skip: true + return meta } - // Flatten into BAM/BAI pairs, select inputs that are eligible to run - // channel: runnable: [ meta_extra, bam, bai ] - // channel: skip: [ meta_extra ] - ch_bams_bais_sorted = ch_inputs_sorted.runnable - .flatMap { meta -> - - def tumor_sample_id = [] - def tumor_bam = [] - def tumor_bai = [] - - def normal_sample_id = [] - def normal_bam = [] - def normal_bai = [] - - - if (Utils.hasTumorDnaBam(meta)) { - tumor_sample_id = Utils.getTumorDnaSampleName(meta) - tumor_bam = Utils.getTumorDnaBam(meta) - tumor_bai = Utils.getTumorDnaBai(meta) - } - - if (Utils.hasNormalDnaBam(meta)) { - normal_sample_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } - + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_normal_sorted = ch_normal_bam + .map { meta, bam, bai -> return [ - [[key: meta.group_id, *:meta, sample_id: tumor_sample_id, sample_type: 'tumor'], tumor_bam, tumor_bai], - [[key: meta.group_id, *:meta, sample_id: normal_sample_id, sample_type: 'normal'], normal_bam, normal_bai], + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } - .branch { meta_extra, bam, bai -> - - def input_key - if (meta_extra.sample_type == 'tumor') { - input_key = Constants.INPUT.BAMTOOLS_TUMOR - } else if (meta_extra.sample_type == 'normal') { - input_key = Constants.INPUT.BAMTOOLS_NORMAL - } else { - assert false - } - - def 
has_existing = Utils.hasExistingInput(meta_extra, input_key) - - runnable: bam && bai && !has_existing + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAMTOOLS_NORMAL) + runnable: bam && !has_existing skip: true - return meta_extra + return meta } // Create process input channel // channel: [ meta_bamtools, bam, bai ] - ch_bamtools_inputs = ch_bams_bais_sorted.runnable - .map { meta_extra, bam, bai -> + ch_bamtools_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bam, bai] }, + ch_inputs_normal_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getNormalDnaSample(meta), 'normal', bam, bai] }, + ) + .map { meta, meta_sample, sample_type, bam, bai -> def meta_bamtools = [ - key: meta_extra.group_id, - id: "${meta_extra.group_id}__${meta_extra.sample_id}", - sample_id: meta_extra.sample_id, - sample_type: meta_extra.sample_type, + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + sample_type: sample_type, ] return [meta_bamtools, bam, bai] @@ -107,29 +86,27 @@ workflow BAMTOOLS_METRICS { ch_versions = ch_versions.mix(BAMTOOLS.out.versions) - // Sort outputs into tumor and normal channels, adding partial skip entries - // channel: [ meta_bamtools, metrics ] - ch_outputs_sorted = Channel.empty() - .mix( - BAMTOOLS.out.metrics, - ch_bams_bais_sorted.skip.map { meta -> [meta, []] }, - ) + // Sort into a tumor and normal channel + ch_bamtools_out = BAMTOOLS.out.metrics .branch { meta_bamtools, metrics -> + assert ['tumor', 'normal'].contains(meta_bamtools.sample_type) tumor: meta_bamtools.sample_type == 'tumor' normal: meta_bamtools.sample_type == 'normal' + placeholder: true } - // Set outputs, restoring original meta, including full skip entries + // Set outputs, restoring original meta + // channel: [ meta, metrics ] ch_somatic_metrics = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.tumor, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_bamtools_out.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, []] }, ) ch_germline_metrics = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.normal, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_bamtools_out.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, []] }, ) emit: diff --git a/subworkflows/local/chord_prediction/main.nf b/subworkflows/local/chord_prediction/main.nf index b8c0424e..e482e5d2 100644 --- a/subworkflows/local/chord_prediction/main.nf +++ b/subworkflows/local/chord_prediction/main.nf @@ -34,7 +34,7 @@ workflow CHORD_PREDICTION { ch_inputs_sorted = ch_inputs_selected .branch { meta, purple_dir -> - def has_dna = Utils.hasTumorDnaBam(meta) + def has_dna = Utils.hasTumorDna(meta) def tumor_id def has_smlv_vcf diff --git a/subworkflows/local/cobalt_profiling/main.nf b/subworkflows/local/cobalt_profiling/main.nf index 609b8a69..13102dee 100644 --- a/subworkflows/local/cobalt_profiling/main.nf +++ b/subworkflows/local/cobalt_profiling/main.nf @@ -11,6 +11,8 @@ workflow COBALT_PROFILING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data gc_profile // channel: [mandatory] /path/to/gc_profile @@ -22,23 
+24,37 @@ workflow COBALT_PROFILING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs + // Select input sources and sort // NOTE(SW): germline mode is not currently supported - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.COBALT_DIR) - runnable_tn: Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) && !has_existing - runnable_to: Utils.hasTumorDnaBam(meta) && !has_existing + runnable_tn: tumor_bam && normal_bam && !has_existing + runnable_to: tumor_bam && !has_existing skip: true + return meta } // First set diploid BED input for tumor/normal and tumor only samples // NOTE(SW): since the diploid BED is provided as a channel, I seem to be only able to include via channel ops - // channel: [ meta, diploid_bed ] + // channel: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai, diploid_bed ] ch_inputs_runnable = Channel.empty() .mix( - ch_inputs_sorted.runnable_tn.map { meta -> [meta, []] }, + ch_inputs_sorted.runnable_tn.map { [*it, []] }, ch_inputs_sorted.runnable_to.combine(diploid_bed), ) @@ -46,7 +62,7 @@ workflow COBALT_PROFILING { // channel: sample_data: [ meta_cobalt, tumor_bam, normal_bam, tumor_bai, normal_bai ] // channel: diploid_bed: [ diploid_bed ] ch_cobalt_inputs = ch_inputs_runnable - .multiMap { meta, diploid_bed -> + .multiMap { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, diploid_bed -> def meta_cobalt = [ key: meta.group_id, @@ -54,18 +70,8 @@ workflow COBALT_PROFILING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasNormalDnaBam(meta)) { - + if (normal_bam) { meta_cobalt.normal_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } sample_data: [meta_cobalt, tumor_bam, normal_bam, tumor_bai, normal_bai] diff --git a/subworkflows/local/cuppa_prediction/main.nf b/subworkflows/local/cuppa_prediction/main.nf index f2cffa5f..2b74a441 100644 --- a/subworkflows/local/cuppa_prediction/main.nf +++ b/subworkflows/local/cuppa_prediction/main.nf @@ -3,6 +3,7 @@ // import Constants +import Utils include { CUPPA } from '../../../modules/local/cuppa/main' @@ -62,7 +63,7 @@ workflow CUPPA_PREDICTION { // (run exclusions currently done basis for presence of normal DNA) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.CUPPA_DIR) - def has_normal_dna = Utils.hasNormalDnaBam(meta) + def has_normal_dna = Utils.hasNormalDna(meta) def has_runnable_inputs = isofox_dir || (purple_dir && linx_annotation_dir && has_normal_dna) @@ -82,9 +83,9 @@ workflow CUPPA_PREDICTION { id: meta.group_id, ] - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = 
Utils.hasNormalDnaBam(meta) - def has_tumor_rna = Utils.hasTumorRnaBam(meta) + def has_tumor_dna = Utils.hasTumorDna(meta) + def has_normal_dna = Utils.hasNormalDna(meta) + def has_tumor_rna = Utils.hasTumorRna(meta) def has_dna_inputs = (purple_dir && linx_annotation_dir) def has_rna_inputs = isofox_dir diff --git a/subworkflows/local/flagstat_metrics/main.nf b/subworkflows/local/flagstat_metrics/main.nf index 83196ef2..9f0a5f78 100644 --- a/subworkflows/local/flagstat_metrics/main.nf +++ b/subworkflows/local/flagstat_metrics/main.nf @@ -10,84 +10,64 @@ include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/samtools/flagstat/m workflow FLAGSTAT_METRICS { take: // Sample data - ch_inputs // channel: [mandatory] [ meta ] + ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] main: // Channel for version.yml files // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) - - runnable: has_tumor_dna || has_normal_dna + // Sort inputs, separate by tumor and normal + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_tumor_sorted = ch_tumor_bam + .map { meta, bam, bai -> + return [ + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), + ] + } + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.FLAGSTAT_TUMOR) + runnable: bam && !has_existing skip: true + return meta } - // Flatten into BAM/BAI pairs, select inputs that are eligible to run - // channel: runnable: [ meta_extra, bam, bai ] - // channel: skip: [ meta_extra ] - ch_bams_bais_sorted = ch_inputs_sorted.runnable - .flatMap { meta -> - - def tumor_sample_id = [] - def tumor_bam = [] - def tumor_bai = [] - - def normal_sample_id = [] - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasTumorDnaBam(meta)) { - tumor_sample_id = Utils.getTumorDnaSampleName(meta) - tumor_bam = Utils.getTumorDnaBam(meta) - tumor_bai = Utils.getTumorDnaBai(meta) - } - - if (Utils.hasNormalDnaBam(meta)) { - normal_sample_id = Utils.getNormalDnaSampleName(meta) - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } - + // channel: runnable: [ meta, bam, bai ] + // channel: skip: [ meta ] + ch_inputs_normal_sorted = ch_normal_bam + .map { meta, bam, bai -> return [ - [[key: meta.group_id, *:meta, sample_id: tumor_sample_id, sample_type: 'tumor'], tumor_bam, tumor_bai], - [[key: meta.group_id, *:meta, sample_id: normal_sample_id, sample_type: 'normal'], normal_bam, normal_bai], + meta, + Utils.selectCurrentOrExisting(bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), ] } - .branch { meta_extra, bam, bai -> - - def input_key - if (meta_extra.sample_type == 'tumor') { - input_key = Constants.INPUT.BAMTOOLS_TUMOR - } else if (meta_extra.sample_type == 'normal') { - input_key = Constants.INPUT.BAMTOOLS_NORMAL - } else { - assert false - } - - def has_existing = Utils.hasExistingInput(meta_extra, input_key) - - runnable: bam && bai && !has_existing + .branch { meta, bam, bai -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.FLAGSTAT_NORMAL) + runnable: bam && !has_existing 
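+ // NOTE: this branch-with-return idiom recurs throughout this PR: criteria are
+ // evaluated top to bottom, and a `return` under a criterion rewrites what that
+ // branch emits, so the catch-all `skip` branch below carries only the bare meta map.
+ // A minimal, hypothetical sketch of the same pattern (values illustrative only):
+ //
+ //   Channel.of([[group_id: 'P1'], file('P1.bam'), file('P1.bam.bai')], [[group_id: 'P2'], [], []])
+ //       .branch { meta, bam, bai ->
+ //           runnable: bam        // an empty list is falsy in Groovy, so P2 falls through
+ //           skip: true
+ //               return meta      // skip entries emit the meta map only
+ //       }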
skip: true - return meta_extra + return meta } // Create process input channel // channel: [ meta_flagstat, bam, bai ] - ch_flagstat_inputs = ch_bams_bais_sorted.runnable - .map { meta_extra, bam, bai -> + ch_flagstat_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bam, bai] }, + ch_inputs_normal_sorted.runnable.map { meta, bam, bai -> [meta, Utils.getNormalDnaSample(meta), 'normal', bam, bai] }, + ) + .map { meta, meta_sample, sample_type, bam, bai -> def meta_flagstat = [ - key: meta_extra.group_id, - id: "${meta_extra.group_id}__${meta_extra.sample_id}", - sample_id: meta_extra.sample_id, - sample_type: meta_extra.sample_type, + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + sample_type: sample_type, ] return [meta_flagstat, bam, bai] @@ -100,34 +80,32 @@ workflow FLAGSTAT_METRICS { ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) - // Sort outputs into tumor and normal channels, adding partial skip entries - // channel: [ meta_flagstat, metrics ] - ch_outputs_sorted = Channel.empty() - .mix( - SAMTOOLS_FLAGSTAT.out.flagstat, - ch_bams_bais_sorted.skip.map { meta -> [meta, []] }, - ) - .branch { meta_flagstat, metrics -> + // Sort into a tumor and normal channel + ch_flagstat_out = SAMTOOLS_FLAGSTAT.out.flagstat + .branch { meta_flagstat, flagstat -> + assert ['tumor', 'normal'].contains(meta_flagstat.sample_type) tumor: meta_flagstat.sample_type == 'tumor' normal: meta_flagstat.sample_type == 'normal' + placeholder: true } - // Set outputs, restoring original meta, including full skip entries - ch_somatic_metrics = Channel.empty() + // Set outputs, restoring original meta + // channel: [ meta, flagstat ] + ch_somatic_flagstat = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.tumor, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_flagstat_out.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, []] }, ) - ch_germline_metrics = Channel.empty() + ch_germline_flagstat = Channel.empty() .mix( - WorkflowOncoanalyser.restoreMeta(ch_outputs_sorted.normal, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + WorkflowOncoanalyser.restoreMeta(ch_flagstat_out.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, []] }, ) emit: - somatic = ch_somatic_metrics // channel: [ meta, metrics ] - germline = ch_germline_metrics // channel: [ meta, metrics ] + somatic = ch_somatic_flagstat // channel: [ meta, flagstat ] + germline = ch_germline_flagstat // channel: [ meta, flagstat ] - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/gridss_svprep_calling/main.nf b/subworkflows/local/gridss_svprep_calling/main.nf index 9b794fb9..48333057 100644 --- a/subworkflows/local/gridss_svprep_calling/main.nf +++ b/subworkflows/local/gridss_svprep_calling/main.nf @@ -17,6 +17,8 @@ workflow GRIDSS_SVPREP_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -38,16 +40,32 @@ workflow GRIDSS_SVPREP_CALLING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = 
ch_inputs - .branch { meta -> + // Select input sources and sort + // channel: runnable_tn: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: runnable_to: [ meta, tumor_bam, tumor_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.GRIDSS_VCF) - runnable_tn: Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) && !has_existing - runnable_to: Utils.hasTumorDnaBam(meta) && !has_existing + runnable_tn: tumor_bam && normal_bam && !has_existing + runnable_to: tumor_bam && !has_existing + return [meta, tumor_bam, tumor_bai] skip: true + return meta } // @@ -57,10 +75,10 @@ workflow GRIDSS_SVPREP_CALLING { // channel: [ meta_svprep, bam_tumor, bai_tumor, [] ] ch_svprep_tumor_inputs = Channel.empty() .mix( - ch_inputs_sorted.runnable_to, + ch_inputs_sorted.runnable_to.map { [*it, [], []] }, ch_inputs_sorted.runnable_tn, ) - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_svprep = [ key: meta.group_id, @@ -68,12 +86,9 @@ workflow GRIDSS_SVPREP_CALLING { sample_id: Utils.getTumorDnaSampleName(meta), sample_type: 'tumor', // NOTE(SW): slightly redundant since we have this information then lose it with .mix above - group_size: Utils.hasNormalDnaBam(meta) ? 2 : 1 + group_size: normal_bam ? 
2 : 1 ] - def tumor_bam = Utils.getTumorDnaBam(meta) - def tumor_bai = Utils.getTumorDnaBai(meta) - return [meta_svprep, tumor_bam, tumor_bai, []] } @@ -103,10 +118,13 @@ workflow GRIDSS_SVPREP_CALLING { // MODULE: SV Prep (normal) // // Create process input channel - // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_tn // channel: [ meta_svprep, bam_normal, bai_normal, junctions_tumor ] - ch_svprep_normal_inputs = WorkflowOncoanalyser.restoreMeta(SVPREP_TUMOR.out.junctions, ch_inputs_sorted.runnable_tn) - .map { meta, junctions_tumor -> + ch_svprep_normal_inputs = WorkflowOncoanalyser.groupByMeta( + ch_inputs_sorted.runnable_tn, + // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_tn + WorkflowOncoanalyser.restoreMeta(SVPREP_TUMOR.out.junctions, ch_inputs_sorted.runnable_tn.map { it[0] }) + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, junctions_tumor -> def meta_svprep = [ key: meta.group_id, @@ -116,9 +134,6 @@ workflow GRIDSS_SVPREP_CALLING { group_size: 2, // Assumption holds since germline only is not supported and we source from runnable_tn ] - def normal_bam = Utils.getNormalDnaBam(meta) - def normal_bai = Utils.getNormalDnaBai(meta) - return [meta_svprep, normal_bam, normal_bai, junctions_tumor] } @@ -299,13 +314,49 @@ workflow GRIDSS_SVPREP_CALLING { // MODULE: SV Prep depth annotation // // Restore original meta, create process input channel - // channel: tumor/normal: [ meta_svprep, [bams], [bais], vcf, [labels] ] - // channel: tumor only: [ meta_svprep, bam, bai, vcf, label ] - ch_depth_inputs = WorkflowOncoanalyser.restoreMeta(CALL.out.vcf, ch_inputs) - .map { meta, vcf -> + // channel: [ meta, [bams], [bais], vcf, [labels] ] + ch_depth_inputs_tn = WorkflowOncoanalyser.groupByMeta( + ch_inputs_sorted.runnable_tn, + // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_tn + WorkflowOncoanalyser.restoreMeta(CALL.out.vcf, ch_inputs_sorted.runnable_tn.map { it[0] }) + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai, vcf -> + return [ + meta, + [normal_bam, tumor_bam], + [normal_bai, tumor_bai], + vcf, + [Utils.getNormalDnaSampleName(meta), Utils.getTumorDnaSampleName(meta)], + ] + } + + // channel: [ meta, bam, bai, vcf, label ] + ch_depth_inputs_to = WorkflowOncoanalyser.groupByMeta( + ch_inputs_sorted.runnable_to, + // NOTE(SW): this implicitly selects only entries present in ch_inputs_sorted.runnable_to + WorkflowOncoanalyser.restoreMeta(CALL.out.vcf, ch_inputs_sorted.runnable_to.map { it[0] }) + ) + .map { meta, tumor_bam, tumor_bai, vcf -> + return [ + meta, + tumor_bam, + tumor_bai, + vcf, + Utils.getTumorDnaSampleName(meta), + ] + } + + // channel: runnable_tn: [ meta_svprep, [bams], [bais], vcf, [labels] ] + // channel: runnable_to: [ meta_svprep, bam, bai, vcf, label ] + ch_depth_inputs = Channel.empty() + .mix( + ch_depth_inputs_tn, + ch_depth_inputs_to, + ) + .map { d -> - // NOTE(SW): germline only is not currently supported - assert Utils.hasTumorDnaBam(meta) + def meta = d[0] + def fps = d[1..-1] def meta_svprep = [ key: meta.group_id, @@ -313,33 +364,7 @@ workflow GRIDSS_SVPREP_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta) ] - def data = [] - - if (Utils.hasNormalDnaBam(meta)) { - - data = [ - meta_svprep, - [Utils.getNormalDnaBam(meta), Utils.getTumorDnaBam(meta)], - [Utils.getNormalDnaBai(meta), Utils.getTumorDnaBai(meta)], - vcf, - [Utils.getNormalDnaSampleName(meta), Utils.getTumorDnaSampleName(meta)], - ] - - } else 
if (Utils.hasTumorDnaBam(meta)) { - - data = [ - meta_svprep, - Utils.getTumorDnaBam(meta), - Utils.getTumorDnaBai(meta), - vcf, - Utils.getTumorDnaSampleName(meta), - ] - - } else { - assert false - } - - return data + return [meta_svprep, *fps] } // Add depth annotations to calls diff --git a/subworkflows/local/gripss_filtering/main.nf b/subworkflows/local/gripss_filtering/main.nf index 7db6b449..4d9bfcb2 100644 --- a/subworkflows/local/gripss_filtering/main.nf +++ b/subworkflows/local/gripss_filtering/main.nf @@ -53,7 +53,7 @@ workflow GRIPSS_FILTERING { // channel: skip: [ meta ] ch_inputs_germline_sorted = ch_inputs_sorted.runnable .branch { meta, gridss_vcf -> - def has_tumor_normal = Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) + def has_tumor_normal = Utils.hasTumorDna(meta) && Utils.hasNormalDna(meta) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.GRIPSS_VCF_NORMAL) runnable: has_tumor_normal && !has_existing @@ -98,7 +98,7 @@ workflow GRIPSS_FILTERING { // channel: skip: [ meta ] ch_inputs_somatic_sorted = ch_inputs_sorted.runnable .branch { meta, gridss_vcf -> - def has_tumor = Utils.hasTumorDnaBam(meta) + def has_tumor = Utils.hasTumorDna(meta) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.GRIPSS_VCF_TUMOR) runnable: has_tumor && !has_existing @@ -117,7 +117,7 @@ workflow GRIPSS_FILTERING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { meta_gripss.normal_id = Utils.getNormalDnaSampleName(meta) } diff --git a/subworkflows/local/isofox_quantification/main.nf b/subworkflows/local/isofox_quantification/main.nf index a8d07af8..8de357f9 100644 --- a/subworkflows/local/isofox_quantification/main.nf +++ b/subworkflows/local/isofox_quantification/main.nf @@ -11,6 +11,7 @@ workflow ISOFOX_QUANTIFICATION { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_rna_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -31,19 +32,28 @@ workflow ISOFOX_QUANTIFICATION { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> + // Select input sources and sort + // channel: runnable: [ meta, tumor_bam, tumor_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = ch_tumor_rna_bam + .map { meta, tumor_bam, tumor_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_RNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_RNA_TUMOR), + ] + } + .branch { meta, tumor_bam, tumor_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.ISOFOX_DIR) - runnable: Utils.hasTumorRnaBam(meta) && !has_existing + runnable: tumor_bam && !has_existing skip: true + return meta } // Create process input channel - // channel: [ meta_isofox, tumor_bam_rna ] + // channel: [ meta_isofox, tumor_bam, tumor_bai ] ch_isofox_inputs = ch_inputs_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai -> def meta_isofox = [ key: meta.group_id, @@ -51,7 +61,7 @@ workflow ISOFOX_QUANTIFICATION { sample_id: Utils.getTumorRnaSampleName(meta), ] - return [meta_isofox, Utils.getTumorRnaBam(meta), Utils.getTumorRnaBai(meta)] + return [meta_isofox, tumor_bam, tumor_bai] } // Run process diff --git a/subworkflows/local/lilac_calling/main.nf b/subworkflows/local/lilac_calling/main.nf index bcc91dae..3110cc97 100644 --- 
a/subworkflows/local/lilac_calling/main.nf +++ b/subworkflows/local/lilac_calling/main.nf @@ -14,6 +14,9 @@ workflow LILAC_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] + ch_tumor_rna_bam // channel: [mandatory] [ meta, bam, bai ] ch_purple // channel: [mandatory] [ meta, purple_dir ] // Reference data @@ -28,42 +31,29 @@ workflow LILAC_CALLING { // channel: [ versions.yml ] ch_versions = Channel.empty() - // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) + // Select input sources and sort for DNA BAMs + // channel: runnable: [ meta, tumor_dna_bam, tumor_dna_bai, normal_dna_bam, normal_dna_bai ] + // channel: skip: [ meta ] + ch_dna_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.LILAC_DIR) - runnable: (has_tumor_dna || has_normal_dna) && !has_existing + runnable: (tumor_bam || normal_bam) && !has_existing skip: true - } - - // Create channel for DNA BAMs - // channel: [ meta, tumor_dna_bam, tumor_dna_bai, normal_dna_bam, normal_dna_bai ] - ch_dna_inputs = ch_inputs_sorted.runnable - .map { meta -> - - def tumor_bam = [] - def tumor_bai = [] - - def normal_bam = [] - def normal_bai = [] - - if (Utils.hasTumorDnaBam(meta)) { - tumor_bam = Utils.getTumorDnaBam(meta) - tumor_bai = Utils.getTumorDnaBai(meta) - } - - if (Utils.hasNormalDnaBam(meta)) { - normal_bam = Utils.getNormalDnaBam(meta) - normal_bai = Utils.getNormalDnaBai(meta) - } - - return [meta, tumor_bam, tumor_bai, normal_bam, normal_bai] + return meta } // Realign reads mapping to HLA regions and homologous regions if using reference genome with ALT contigs @@ -75,11 +65,11 @@ workflow LILAC_CALLING { // Flatten into BAM/BAI pairs, select inputs that are eligible to run // channel: runnable: [ meta_extra, bam, bai ] // channel: skip: [ meta_extra ] - ch_realign_inputs_sorted = ch_dna_inputs + ch_realign_inputs_sorted = ch_dna_inputs_sorted.runnable .flatMap { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> - def tumor_sample_id = Utils.hasTumorDnaBam(meta) ? Utils.getTumorDnaSampleName(meta) : [] - def normal_sample_id = Utils.hasNormalDnaBam(meta) ? Utils.getNormalDnaSampleName(meta) : [] + def tumor_sample_id = Utils.hasTumorDna(meta) ? Utils.getTumorDnaSampleName(meta) : [] + def normal_sample_id = Utils.hasNormalDna(meta) ?
Utils.getNormalDnaSampleName(meta) : [] return [ [[key: meta.group_id, *:meta, sample_id: tumor_sample_id, sample_type: 'tumor'], tumor_bam, tumor_bai], @@ -168,26 +158,10 @@ workflow LILAC_CALLING { } else { // channel: [ meta, tumor_dna_bam, tumor_dna_bai, normal_dna_bam, normal_dna_bai ] - ch_dna_inputs_ready = ch_dna_inputs + ch_dna_inputs_ready = ch_dna_inputs_sorted.runnable } - // Create channel for RNA BAMs - // channel: [ meta, tumor_rna_bam, tumor_rna_bai ] - ch_rna_inputs_ready = ch_inputs - .map { meta -> - - def bam = [] - def bai = [] - - if (Utils.hasTumorRnaBam(meta)) { - bam = Utils.getTumorRnaBam(meta) - bai = Utils.getTumorRnaBai(meta) - } - - return [meta, bam, bai] - } - // // MODULE: LILAC // @@ -195,7 +169,7 @@ workflow LILAC_CALLING { // channel: [ meta_lilac, normal_dna_bam, normal_dna_bai, tumor_dna_bam, tumor_dna_bai, tumor_rna_bam, tumor_rna_bai, purple_dir ] ch_lilac_inputs = WorkflowOncoanalyser.groupByMeta( ch_dna_inputs_ready, - ch_rna_inputs_ready, + ch_tumor_rna_bam, ch_purple, ) .map { meta, tbam_dna, tbai_dna, nbam_dna, nbai_dna, tbam_rna, tbai_rna, purple_dir -> @@ -205,11 +179,11 @@ workflow LILAC_CALLING { id: meta.group_id, ] - if (Utils.hasTumorDnaBam(meta)) { + if (Utils.hasTumorDna(meta)) { meta_lilac.tumor_id = Utils.getTumorDnaSampleName(meta) } - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { meta_lilac.normal_id = Utils.getNormalDnaSampleName(meta) } @@ -240,7 +214,7 @@ workflow LILAC_CALLING { ch_outputs = Channel.empty() .mix( WorkflowOncoanalyser.restoreMeta(LILAC.out.lilac_dir, ch_inputs), - ch_inputs_sorted.skip.map { meta -> [meta, []] }, + ch_dna_inputs_sorted.skip.map { meta -> [meta, []] }, ) emit: diff --git a/subworkflows/local/linx_annotation/main.nf b/subworkflows/local/linx_annotation/main.nf index 8dd94df7..c8599584 100644 --- a/subworkflows/local/linx_annotation/main.nf +++ b/subworkflows/local/linx_annotation/main.nf @@ -53,7 +53,7 @@ workflow LINX_ANNOTATION { def tumor_id = Utils.getTumorDnaSampleName(meta) - def has_tumor_normal = Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) + def has_tumor_normal = Utils.hasTumorDna(meta) && Utils.hasNormalDna(meta) def has_sv_germline_vcf = file(purple_dir).resolve("${tumor_id}.purple.sv.germline.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PURPLE_DIR) @@ -99,7 +99,7 @@ workflow LINX_ANNOTATION { ch_inputs_somatic_sorted = ch_inputs_sorted.runnable .branch { meta, purple_dir -> - def has_tumor = Utils.hasTumorDnaBam(meta) + def has_tumor = Utils.hasTumorDna(meta) def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PURPLE_DIR) runnable: has_tumor && !has_existing diff --git a/subworkflows/local/pave_annotation/main.nf b/subworkflows/local/pave_annotation/main.nf index 92128ede..1517f92e 100644 --- a/subworkflows/local/pave_annotation/main.nf +++ b/subworkflows/local/pave_annotation/main.nf @@ -51,7 +51,7 @@ workflow PAVE_ANNOTATION { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PAVE_VCF_NORMAL) - runnable: Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) && sage_vcf && !has_existing + runnable: Utils.hasTumorDna(meta) && Utils.hasNormalDna(meta) && sage_vcf && !has_existing skip: true return meta } @@ -105,7 +105,7 @@ workflow PAVE_ANNOTATION { def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.PAVE_VCF_TUMOR) - runnable: Utils.hasTumorDnaBam(meta) && sage_vcf && !has_existing + runnable: Utils.hasTumorDna(meta) && sage_vcf && !has_existing skip: true return meta } diff --git 
a/subworkflows/local/prepare_inputs/main.nf b/subworkflows/local/prepare_inputs/main.nf index f6e6dfae..18abe8eb 100644 --- a/subworkflows/local/prepare_inputs/main.nf +++ b/subworkflows/local/prepare_inputs/main.nf @@ -8,7 +8,6 @@ // through running workflows/processes with 'setup'. Hence, this subworkflow // isn't used in the main pipeline and is only used for execution of tests. -import Constants import Utils workflow PREPARE_INPUTS { diff --git a/subworkflows/local/prepare_reference/main.nf b/subworkflows/local/prepare_reference/main.nf index 54616930..405c00a9 100644 --- a/subworkflows/local/prepare_reference/main.nf +++ b/subworkflows/local/prepare_reference/main.nf @@ -2,6 +2,11 @@ // Prepare reference data as required // +// NOTE(SW): BWA MEM2 indexes are required and are not created +// TODO(SW): consider removing index creation since it's unlikely to be used, replace with documentation + +import Constants + include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' include { SAMTOOLS_DICT } from '../../../modules/nf-core/samtools/dict/main' include { BWA_INDEX } from '../../../modules/nf-core/bwa/index/main' @@ -9,6 +14,7 @@ include { BWA_INDEX } from '../../../modules/nf-core/bwa/index/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_BWA_INDEX } from '../../../modules/local/custom/extract_tarball/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_HMF_DATA } from '../../../modules/local/custom/extract_tarball/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_PANEL_DATA } from '../../../modules/local/custom/extract_tarball/main' +include { CUSTOM_EXTRACTTARBALL as DECOMP_STAR_INDEX } from '../../../modules/local/custom/extract_tarball/main' include { CUSTOM_EXTRACTTARBALL as DECOMP_VIRUSBREAKEND_DB } from '../../../modules/local/custom/extract_tarball/main' include { GRIDSS_INDEX as GRIDSS_BWA_INDEX_IMAGE } from '../../../modules/local/gridss/index/main' include { GRIDSS_INDEX as GRIDSS_INDEX } from '../../../modules/local/gridss/index/main' @@ -94,6 +100,23 @@ workflow PREPARE_REFERENCE { } } + // Explicitly set BWA MEM2 index file inputs + ch_genome_bwa_index_bseq = file(params.ref_data.genome_bwa_index_bseq) + ch_genome_bwa_index_biidx = file(params.ref_data.genome_bwa_index_biidx) + + // + // Decompress STAR index + // + ch_genome_star_index = params.ref_data.genome_star_index ? 
file(params.ref_data.genome_star_index) : [] + if (run_config.has_rna_fastq && run_config.stages.alignment && params.ref_data.genome_star_index.endsWith('.tar.gz')) { + ch_genome_star_index_inputs = [ + [id: 'star_index'], + file(params.ref_data.genome_star_index), + ] + DECOMP_STAR_INDEX(ch_genome_star_index_inputs) + ch_genome_star_index = DECOMP_STAR_INDEX.out.dir + } + // // Set VIRUSBreakend database path / stage, unpack if required // @@ -141,7 +164,7 @@ workflow PREPARE_REFERENCE { // NOTE(SW): consider approach to implement custom panel support - panel_data_paths_versions = params.ref_data.panel_data_paths[params.panel] + panel_data_paths_versions = params.panel_data_paths[params.panel] panel_data_paths = panel_data_paths_versions[params.ref_data.genome_version] if (params.ref_data.panel_data_path.endsWith('tar.gz')) { @@ -168,8 +191,11 @@ workflow PREPARE_REFERENCE { genome_fai = ch_genome_fai // path: genome_fai genome_dict = ch_genome_dict // path: genome_dict genome_bwa_index = ch_genome_bwa_index // path: genome_bwa_index + genome_bwa_index_bseq = ch_genome_bwa_index_bseq // path: genome_bwa_index_bseq + genome_bwa_index_biidx = ch_genome_bwa_index_biidx // path: genome_bwa_index_biidx genome_bwa_index_image = ch_genome_bwa_index_image // path: genome_bwa_index_image genome_gridss_index = ch_genome_gridss_index // path: genome_gridss_index + genome_star_index = ch_genome_star_index // path: genome_star_index genome_version = params.ref_data.genome_version // val: genome_version virusbreakenddb = ch_virusbreakenddb // path: VIRUSBreakend database diff --git a/subworkflows/local/purple_calling/main.nf b/subworkflows/local/purple_calling/main.nf index 6bc40b87..229e2fbb 100644 --- a/subworkflows/local/purple_calling/main.nf +++ b/subworkflows/local/purple_calling/main.nf @@ -102,7 +102,7 @@ workflow PURPLE_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - if (Utils.hasNormalDnaBam(meta)) { + if (Utils.hasNormalDna(meta)) { meta_purple.normal_id = Utils.getNormalDnaSampleName(meta) } diff --git a/subworkflows/local/read_alignment_dna/main.nf b/subworkflows/local/read_alignment_dna/main.nf new file mode 100644 index 00000000..fb0aaab4 --- /dev/null +++ b/subworkflows/local/read_alignment_dna/main.nf @@ -0,0 +1,221 @@ +// +// Align DNA reads +// + +import Constants +import Utils + +include { BWAMEM2_ALIGN } from '../../../modules/local/bwa-mem2/mem/main' +include { FASTP } from '../../../modules/local/fastp/main' + +workflow READ_ALIGNMENT_DNA { + take: + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + + // Reference data + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_bwa_index // channel: [mandatory] /path/to/genome_bwa_index_dir/ + genome_bwa_index_bseq // channel: [mandatory] /path/to/genome_bwa_index_binary_seq + genome_bwa_index_biidx // channel: [mandatory] /path/to/genome_bwa_index_bi-index + + // Params + max_fastq_records // numeric: [mandatory] max number of FASTQ records per split + + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // Sort inputs, separate by tumor and normal + // channel: [ meta ] + ch_inputs_tumor_sorted = ch_inputs + .branch { meta -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_TUMOR) + runnable: Utils.hasTumorDnaFastq(meta) && !has_existing + skip: true + } + + ch_inputs_normal_sorted = ch_inputs + .branch { meta -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_NORMAL) + runnable: 
Utils.hasNormalDnaFastq(meta) && !has_existing + skip: true + } + + // Create FASTQ input channel + // channel: [ meta_fastq, fastq_fwd, fastq_rev ] + ch_fastq_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta -> [meta, Utils.getTumorDnaSample(meta), 'tumor'] }, + ch_inputs_normal_sorted.runnable.map { meta -> [meta, Utils.getNormalDnaSample(meta), 'normal'] }, + ) + .flatMap { meta, meta_sample, sample_type -> + meta_sample + .getAt(Constants.FileType.FASTQ) + .collect { key, fps -> + def (library_id, lane) = key + + def meta_fastq = [ + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + library_id: library_id, + lane: lane, + sample_type: sample_type, + ] + + return [meta_fastq, fps['fwd'], fps['rev']] + } + } + + // + // MODULE: fastp + // + // Split FASTQ into chunks if requested for distributed processing + // channel: [ meta_fastq_ready, fastq_fwd, fastq_rev ] + ch_fastqs_ready = Channel.empty() + if (max_fastq_records > 0) { + + // Run process + FASTP( + ch_fastq_inputs, + max_fastq_records, + ) + + ch_versions = ch_versions.mix(FASTP.out.versions) + + // Prepare outputs within conditional block + ch_fastqs_ready = FASTP.out.fastq + .flatMap { meta_fastq, reads_fwd, reads_rev -> + + def data = [reads_fwd, reads_rev] + .transpose() + .collect { fwd, rev -> + + def split_fwd = fwd.name.replaceAll('\\..+$', '') + def split_rev = rev.name.replaceAll('\\..+$', '') + + assert split_fwd == split_rev + + // NOTE(SW): split allows meta_fastq_ready to be unique, which is required during reunite below + def meta_fastq_ready = [ + *:meta_fastq, + id: "${meta_fastq.id}_${split_fwd}", + split: split_fwd, + ] + + return [meta_fastq_ready, fwd, rev] + } + + return data + } + + } else { + + ch_fastqs_ready = ch_fastq_inputs + .map { meta_fastq, fastq_fwd, fastq_rev -> + + def meta_fastq_ready = [ + *:meta_fastq, + split: null, + ] + + return [meta_fastq_ready, fastq_fwd, fastq_rev] + } + + } + + // + // MODULE: BWA-MEM2 + // + // Create process input channel + // channel: [ meta_bwa, fastq_fwd, fastq_rev ] + ch_bwa_inputs = ch_fastqs_ready + .map { meta_fastq_ready, fastq_fwd, fastq_rev -> + + def meta_bwa = [ + *:meta_fastq_ready, + + + // TODO(SW): understand target format + read_group: "${meta_fastq_ready.sample_id}.${meta_fastq_ready.library_id}.${meta_fastq_ready.lane}", + + + ] + + return [meta_bwa, fastq_fwd, fastq_rev] + } + + // Run process + BWAMEM2_ALIGN( + ch_bwa_inputs, + genome_fasta, + genome_bwa_index, + genome_bwa_index_bseq, + genome_bwa_index_biidx, + ) + + ch_versions = ch_versions.mix(BWAMEM2_ALIGN.out.versions) + + // Reunite BAMs + // First, count expected BAMs per sample for non-blocking groupTuple op + // channel: [ meta_count, group_size ] + ch_sample_fastq_counts = ch_bwa_inputs + .map { meta_bwa, reads_fwd, reads_rev -> + + def meta_count = [ + key: meta_bwa.key, + sample_type: meta_bwa.sample_type, + ] + + return [meta_count, meta_bwa] + } + .groupTuple() + .map { meta_count, meta_bwas -> return [meta_count, meta_bwas.size()] } + + // Now, group with expected size then sort into tumor and normal channels + // channel: [ meta_group, [bam, ...], [bai, ...]
] + ch_bams_united = ch_sample_fastq_counts + .cross( + // First element to match meta_count above for `cross` + BWAMEM2_ALIGN.out.bam.map { meta_bwa, bam, bai -> [[key: meta_bwa.key, sample_type: meta_bwa.sample_type], bam, bai] } + ) + .map { count_tuple, bam_tuple -> + + def group_size = count_tuple[1] + def (meta_bam, bam, bai) = bam_tuple + + def meta_group = [ + *:meta_bam, + ] + + return tuple(groupKey(meta_group, group_size), bam, bai) + } + .groupTuple() + .branch { meta_group, bams, bais -> + assert ['tumor', 'normal'].contains(meta_group.sample_type) + tumor: meta_group.sample_type == 'tumor' + normal: meta_group.sample_type == 'normal' + placeholder: true + } + + // Set outputs, restoring original meta + // channel: [ meta, [bam, ...], [bai, ...] ] + ch_bam_tumor_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_bams_united.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, [], []] }, + ) + + ch_bam_normal_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_bams_united.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, [], []] }, + ) + + emit: + dna_tumor = ch_bam_tumor_out // channel: [ meta, [bam, ...], [bai, ...] ] + dna_normal = ch_bam_normal_out // channel: [ meta, [bam, ...], [bai, ...] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/read_alignment_rna/main.nf b/subworkflows/local/read_alignment_rna/main.nf new file mode 100644 index 00000000..ca795e58 --- /dev/null +++ b/subworkflows/local/read_alignment_rna/main.nf @@ -0,0 +1,216 @@ +// +// Align RNA reads +// + +import Constants +import Utils + +include { GATK4_MARKDUPLICATES } from '../../../modules/nf-core/gatk4/markduplicates/main' +include { SAMBAMBA_MERGE } from '../../../modules/local/sambamba/merge/main' +include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' +include { STAR } from '../../../modules/local/star/main' + +workflow READ_ALIGNMENT_RNA { + take: + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + + // Reference data + genome_star_index // channel: [mandatory] /path/to/genome_star_index/ + + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // Sort inputs + // channel: [ meta ] + ch_inputs_sorted = ch_inputs + .branch { meta -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_RNA_TUMOR) + runnable: Utils.hasTumorRnaFastq(meta) && !has_existing + skip: true + } + + // Create FASTQ input channel + // channel: [ meta_fastq, fastq_fwd, fastq_rev ] + ch_fastq_inputs = ch_inputs_sorted.runnable + .flatMap { meta -> + def meta_sample = Utils.getTumorRnaSample(meta) + meta_sample + .getAt(Constants.FileType.FASTQ) + .collect { key, fps -> + def (library_id, lane) = key + + def meta_fastq = [ + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + library_id: library_id, + lane: lane, + ] + + return [meta_fastq, fps['fwd'], fps['rev']] + } + } + + // + // MODULE: STAR + // + // Create process input channel + // channel: [ meta_star, fastq_fwd, fastq_rev ] + ch_star_inputs = ch_fastq_inputs + .map { meta_fastq, fastq_fwd, fastq_rev -> + def meta_star = [ + *:meta_fastq, + + + // TODO(SW): understand target format + read_group: "${meta_fastq.sample_id}.${meta_fastq.library_id}.${meta_fastq.lane}", + + + ] + + return [meta_star, fastq_fwd, fastq_rev] + } + + // Run process + STAR( + ch_star_inputs, + genome_star_index, + ) + + 
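+ // NOTE: for orientation, each meta_star here was built from meta_fastq above; a
+ // hypothetical example (values illustrative only):
+ //
+ //   [key: 'GROUP1', id: 'GROUP1_SAMPLE1', sample_id: 'SAMPLE1',
+ //    library_id: 'LIBRARY1', lane: '001', read_group: 'SAMPLE1.LIBRARY1.001']
+ //
+ // The read_group value is reused below as the samtools sort prefix so that
+ // per-lane BAMs stay distinguishable until they are merged.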
ch_versions = ch_versions.mix(STAR.out.versions) + + // + // MODULE: SAMtools sort + // + // Create process input channel + // channel: [ meta_sort, bam ] + ch_sort_inputs = STAR.out.bam + .map { meta_star, bam -> + def meta_sort = [ + *:meta_star, + prefix: meta_star.read_group, + ] + + return [meta_sort, bam] + } + + // Run process + SAMTOOLS_SORT( + ch_sort_inputs, + ) + + ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions) + + // + // MODULE: Sambamba merge + // + // Reunite BAMs + // First, count expected BAMs per sample for non-blocking groupTuple op + // channel: [ meta_count, group_size ] + ch_sample_fastq_counts = ch_star_inputs + .map { meta_star, reads_fwd, reads_rev -> + def meta_count = [key: meta_star.key] + return [meta_count, meta_star] + } + .groupTuple() + .map { meta_count, meta_stars -> return [meta_count, meta_stars.size()] } + + // Now, group with expected size + // channel: [ meta_group, [bam, ...] ] + ch_bams_united = ch_sample_fastq_counts + .cross( + // First element to match meta_count above for `cross` + SAMTOOLS_SORT.out.bam.map { meta_star, bam -> [[key: meta_star.key], bam] } + ) + .map { count_tuple, bam_tuple -> + + def group_size = count_tuple[1] + def (meta_bam, bam) = bam_tuple + + def meta_group = [ + *:meta_bam, + ] + + return tuple(groupKey(meta_group, group_size), bam) + } + .groupTuple() + + // Sort into merge-eligible BAMs (at least two BAMs required) + // channel: runnable: [ meta_group, [bam, ...] ] + // channel: skip: [ meta_group, bam ] + ch_bams_united_sorted = ch_bams_united + .branch { meta_group, bams -> + runnable: bams.size() > 1 + skip: true + return [meta_group, bams[0]] + } + + // Create process input channel + // channel: [ meta_merge, [bams, ...] ] + ch_merge_inputs = WorkflowOncoanalyser.restoreMeta(ch_bams_united_sorted.runnable, ch_inputs) + .map { meta, bams -> + def meta_merge = [ + key: meta.group_id, + id: meta.group_id, + sample_id: Utils.getTumorRnaSampleName(meta), + ] + return [meta_merge, bams] + } + + // Run process + SAMBAMBA_MERGE( + ch_merge_inputs, + ) + + ch_versions = ch_versions.mix(SAMBAMBA_MERGE.out.versions) + + // + // MODULE: GATK4 markduplicates + // + // Create process input channel + // channel: [ meta_markdups, bam ] + ch_markdups_inputs = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(SAMBAMBA_MERGE.out.bam, ch_inputs), + WorkflowOncoanalyser.restoreMeta(ch_bams_united_sorted.skip, ch_inputs), + ) + .map { meta, bam -> + def meta_markdups = [ + key: meta.group_id, + id: meta.group_id, + sample_id: Utils.getTumorRnaSampleName(meta), + ] + return [meta_markdups, bam] + } + + // Run process + GATK4_MARKDUPLICATES( + ch_markdups_inputs, + [], + [], + ) + + ch_versions = ch_versions.mix(GATK4_MARKDUPLICATES.out.versions) + + // Combine BAMs and BAIs + // channel: [ meta, bam, bai ] + ch_bams_ready = WorkflowOncoanalyser.groupByMeta( + WorkflowOncoanalyser.restoreMeta(GATK4_MARKDUPLICATES.out.bam, ch_inputs), + WorkflowOncoanalyser.restoreMeta(GATK4_MARKDUPLICATES.out.bai, ch_inputs), + ) + + // Set outputs + // channel: [ meta, bam, bai ] + ch_bam_out = Channel.empty() + .mix( + ch_bams_ready, + ch_inputs_sorted.skip.map { meta -> [meta, [], []] }, + ) + + emit: + rna_tumor = ch_bam_out // channel: [ meta, bam, bai ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/read_processing/main.nf b/subworkflows/local/read_processing/main.nf new file mode 100644 index 00000000..2e67d213 --- /dev/null +++
b/subworkflows/local/read_processing/main.nf @@ -0,0 +1,122 @@ +// +// Apply post-alignment processing +// + +import Constants +import Utils + +include { MARKDUPS } from '../../../modules/local/markdups/main' + +workflow READ_PROCESSING { + take: + // Sample data + ch_inputs // channel: [mandatory] [ meta ] + ch_dna_tumor // channel: [mandatory] [ meta, [bam, ...], [bai, ...] ] + ch_dna_normal // channel: [mandatory] [ meta, [bam, ...], [bai, ...] ] + + // Reference data + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_ver // channel: [mandatory] genome version + genome_fai // channel: [mandatory] /path/to/genome_fai + genome_dict // channel: [mandatory] /path/to/genome_dict + unmap_regions // channel: [mandatory] /path/to/unmap_regions + + // Params + has_umis // boolean: [mandatory] UMI processing flag + + main: + // Channel for version.yml files + // channel: [ versions.yml ] + ch_versions = Channel.empty() + + // Select and sort input sources, separating by tumor and normal + // channel: runnable: [ meta, [bam, ...], [bai, ...] ] + // channel: skip: [ meta ] + ch_inputs_tumor_sorted = ch_dna_tumor + .map { meta, bams, bais -> + return [ + meta, + Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_TUMOR) ? [Utils.getInput(meta, Constants.INPUT.BAM_DNA_TUMOR)] : bams, + Utils.hasExistingInput(meta, Constants.INPUT.BAI_DNA_TUMOR) ? [Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR)] : bais, + ] + } + .branch { meta, bams, bais -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR) + runnable: bams && !has_existing + skip: true + return meta + } + + ch_inputs_normal_sorted = ch_dna_normal + .map { meta, bams, bais -> + return [ + meta, + Utils.hasExistingInput(meta, Constants.INPUT.BAM_DNA_NORMAL) ? [Utils.getInput(meta, Constants.INPUT.BAM_DNA_NORMAL)] : bams, + Utils.hasExistingInput(meta, Constants.INPUT.BAI_DNA_NORMAL) ? [Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL)] : bais, + ] + } + .branch { meta, bams, bais -> + def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL) + runnable: bams && !has_existing + skip: true + return meta + } + + // Create process input channel + // channel: [ meta_markdups, [bam, ...], [bai, ...]
] + ch_markdups_inputs = Channel.empty() + .mix( + ch_inputs_tumor_sorted.runnable.map { meta, bams, bais -> [meta, Utils.getTumorDnaSample(meta), 'tumor', bams, bais] }, + ch_inputs_normal_sorted.runnable.map { meta, bams, bais -> [meta, Utils.getNormalDnaSample(meta), 'normal', bams, bais] }, + ) + .map { meta, meta_sample, sample_type, bams, bais -> + + def meta_markdups = [ + key: meta.group_id, + id: "${meta.group_id}_${meta_sample.sample_id}", + sample_id: meta_sample.sample_id, + sample_type: sample_type, + ] + + return [meta_markdups, bams, bais] + } + + // Run process + MARKDUPS( + ch_markdups_inputs, + genome_fasta, + genome_ver, + genome_fai, + genome_dict, + unmap_regions, + has_umis, + ) + + // Sort into a tumor and normal channel + ch_markdups_out = MARKDUPS.out.bam + .branch { meta_markdups, bam, bai -> + assert ['tumor', 'normal'].contains(meta_markdups.sample_type) + tumor: meta_markdups.sample_type == 'tumor' + normal: meta_markdups.sample_type == 'normal' + placeholder: true + } + + // Set outputs, restoring original meta + // channel: [ meta, bam, bai ] + ch_bam_tumor_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_markdups_out.tumor, ch_inputs), + ch_inputs_tumor_sorted.skip.map { meta -> [meta, [], []] }, + ) + + ch_bam_normal_out = Channel.empty() + .mix( + WorkflowOncoanalyser.restoreMeta(ch_markdups_out.normal, ch_inputs), + ch_inputs_normal_sorted.skip.map { meta -> [meta, [], []] }, + ) + + emit: + dna_tumor = ch_bam_tumor_out // channel: [ meta, bam, bai ] + dna_normal = ch_bam_normal_out // channel: [ meta, bam, bai ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/sage_append/main.nf b/subworkflows/local/sage_append/main.nf index 8e15f4f2..8dfee365 100644 --- a/subworkflows/local/sage_append/main.nf +++ b/subworkflows/local/sage_append/main.nf @@ -11,14 +11,15 @@ include { SAGE_APPEND as GERMLINE } from '../../../modules/local/sage/append/mai workflow SAGE_APPEND { take: // Sample data - ch_inputs // channel: [mandatory] [ meta ] - ch_purple_dir // channel: [mandatory] [ meta, purple_dir ] + ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_rna_bam // channel: [mandatory] [ meta, bam, bai ] + ch_purple_dir // channel: [mandatory] [ meta, purple_dir ] // Reference data - genome_fasta // channel: [mandatory] /path/to/genome_fasta - genome_version // channel: [mandatory] genome version - genome_fai // channel: [mandatory] /path/to/genome_fai - genome_dict // channel: [mandatory] /path/to/genome_dict + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_version // channel: [mandatory] genome version + genome_fai // channel: [mandatory] /path/to/genome_fai + genome_dict // channel: [mandatory] /path/to/genome_dict main: // Channel for version.yml files @@ -26,17 +27,22 @@ workflow SAGE_APPEND { ch_versions = Channel.empty() // Select input sources and sort - // channel: runnable: [ meta, purple_dir ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, purple_dir ] // channel: skip: [ meta ] - ch_inputs_sorted = ch_purple_dir - .map { meta, purple_dir -> + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_rna_bam, + ch_purple_dir, + ) + .map { meta, tumor_bam, tumor_bai, purple_dir -> return [ meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_RNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_RNA_TUMOR), Utils.selectCurrentOrExisting(purple_dir, meta, Constants.INPUT.PURPLE_DIR), ] } - .branch { meta, purple_dir -> - 
runnable: purple_dir + .branch { meta, tumor_bam, tumor_bai, purple_dir -> + runnable: tumor_bam && purple_dir skip: true return meta } @@ -45,15 +51,15 @@ workflow SAGE_APPEND { // MODULE: SAGE append germline // // Select inputs that are eligible to run - // channel: runnable: [ meta, purple_dir ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, purple_dir ] // channel: skip: [ meta ] ch_inputs_germline_sorted = ch_inputs_sorted.runnable - .branch { meta, purple_dir -> + .branch { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) - def has_normal_dna = Utils.hasNormalDnaBam(meta) - def has_tumor_rna = Utils.hasTumorRnaBam(meta) + def has_normal_dna = Utils.hasNormalDna(meta) + def has_tumor_rna = Utils.hasTumorRna(meta) def has_smlv_germline = file(purple_dir).resolve("${tumor_dna_id}.purple.germline.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_APPEND_VCF_NORMAL) @@ -63,9 +69,9 @@ workflow SAGE_APPEND { } // Create process input channel - // channel: [ meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai ] + // channel: [ meta_append, purple_smlv_vcf, tumor_bam, tumor_bai ] ch_sage_append_germline_inputs = ch_inputs_germline_sorted.runnable - .map { meta, purple_dir -> + .map { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) @@ -76,11 +82,9 @@ workflow SAGE_APPEND { dna_id: Utils.getNormalDnaSampleName(meta), ] - def tumor_rna_bam = Utils.getTumorRnaBam(meta) - def tumor_rna_bai = Utils.getTumorRnaBai(meta) def purple_smlv_vcf = file(purple_dir).resolve("${tumor_dna_id}.purple.germline.vcf.gz") - return [meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai] + return [meta_append, purple_smlv_vcf, tumor_bam, tumor_bai] } // Run process @@ -98,14 +102,14 @@ workflow SAGE_APPEND { // MODULE: SAGE append somatic // // Select inputs that are eligible to run - // channel: runnable: [ meta, purple_dir ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, purple_dir ] // channel: skip: [ meta ] ch_inputs_somatic_sorted = ch_inputs_sorted.runnable - .branch { meta, purple_dir -> + .branch { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) - def has_tumor_dna = Utils.hasTumorDnaBam(meta) - def has_tumor_rna = Utils.hasTumorRnaBam(meta) + def has_tumor_dna = Utils.hasTumorDna(meta) + def has_tumor_rna = Utils.hasTumorRna(meta) def has_smlv_somatic = file(purple_dir).resolve("${tumor_dna_id}.purple.somatic.vcf.gz") def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_APPEND_VCF_TUMOR) @@ -115,9 +119,9 @@ workflow SAGE_APPEND { } // Create process input channel - // channel: [ meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai ] + // channel: [ meta_append, purple_smlv_vcf, tumor_bam, tumor_bai ] ch_sage_append_somatic_inputs = ch_inputs_somatic_sorted.runnable - .map { meta, purple_dir -> + .map { meta, tumor_bam, tumor_bai, purple_dir -> def tumor_dna_id = Utils.getTumorDnaSampleName(meta) @@ -128,11 +132,9 @@ workflow SAGE_APPEND { dna_id: Utils.getTumorDnaSampleName(meta), ] - def tumor_rna_bam = Utils.getTumorRnaBam(meta) - def tumor_rna_bai = Utils.getTumorRnaBai(meta) def purple_smlv_vcf = file(purple_dir).resolve("${tumor_dna_id}.purple.somatic.vcf.gz") - return [meta_append, purple_smlv_vcf, tumor_rna_bam, tumor_rna_bai] + return [meta_append, purple_smlv_vcf, tumor_bam, tumor_bai] } // Run process diff --git a/subworkflows/local/sage_calling/main.nf 
b/subworkflows/local/sage_calling/main.nf index 3d63c680..33d815d0 100644 --- a/subworkflows/local/sage_calling/main.nf +++ b/subworkflows/local/sage_calling/main.nf @@ -12,6 +12,8 @@ workflow SAGE_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] + ch_normal_bam // channel: [mandatory] [ meta, bam, bai ] // Reference data genome_fasta // channel: [mandatory] /path/to/genome_fasta @@ -33,31 +35,47 @@ workflow SAGE_CALLING { ch_versions = Channel.empty() // Sort inputs - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - runnable: Utils.hasTumorDnaBam(meta) + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] + ch_inputs_sorted = WorkflowOncoanalyser.groupByMeta( + ch_tumor_bam, + ch_normal_bam, + ) + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + tumor_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_TUMOR), + Utils.selectCurrentOrExisting(normal_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_NORMAL), + normal_bai ?: Utils.getInput(meta, Constants.INPUT.BAI_DNA_NORMAL), + ] + } + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + runnable: tumor_bam skip: true + return meta } // // MODULE: SAGE germline // // Select inputs that are eligible to run - // channel: [ meta ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] ch_inputs_germline_sorted = ch_inputs_sorted.runnable - .branch { meta -> - def has_tumor_normal = Utils.hasTumorDnaBam(meta) && Utils.hasNormalDnaBam(meta) + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + def has_tumor_normal = tumor_bam && normal_bam def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_VCF_NORMAL) runnable: has_tumor_normal && !has_existing skip: true + return meta } // Create process input channel - // channel: [ meta_sage, tbam, nbam, tbai, nbai ] + // channel: [ meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai ] ch_sage_germline_inputs = ch_inputs_germline_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_sage = [ key: meta.group_id, @@ -66,16 +84,7 @@ workflow SAGE_CALLING { normal_id: Utils.getNormalDnaSampleName(meta), ] - data = [ - meta_sage, - Utils.getTumorDnaBam(meta), - Utils.getNormalDnaBam(meta), - Utils.getTumorDnaBai(meta), - Utils.getNormalDnaBai(meta), - ] - - return data - + return [meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai] } // Run process @@ -98,21 +107,23 @@ workflow SAGE_CALLING { // MODULE: SAGE somatic // // Select inputs that are eligible to run - // channel: [ meta ] + // channel: runnable: [ meta, tumor_bam, tumor_bai, normal_bam, normal_bai ] + // channel: skip: [ meta ] ch_inputs_somatic_sorted = ch_inputs_sorted.runnable - .branch { meta -> - def has_tumor = Utils.hasTumorDnaBam(meta) + .branch { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> + def has_tumor = tumor_bam def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.SAGE_VCF_TUMOR) runnable: has_tumor && !has_existing skip: true + return meta } // Create process input channel - // channel: tumor/normal: [ meta_sage, tbam, nbam, tbai, nbai ] - // channel: tumor only: [ meta_sage, tbam, [], tbai, [] ] + // channel: tumor/normal: [ meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai ] + // 
channel: tumor only: [ meta_sage, tumor_bam, [], tumor_bai, [] ] ch_sage_somatic_inputs = ch_inputs_somatic_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai, normal_bam, normal_bai -> def meta_sage = [ key: meta.group_id, @@ -120,33 +131,11 @@ workflow SAGE_CALLING { tumor_id: Utils.getTumorDnaSampleName(meta), ] - def data = [] - if (Utils.hasNormalDnaBam(meta)) { - + if (normal_bam) { meta_sage.normal_id = Utils.getNormalDnaSampleName(meta) - - data = [ - meta_sage, - Utils.getTumorDnaBam(meta), - Utils.getNormalDnaBam(meta), - Utils.getTumorDnaBai(meta), - Utils.getNormalDnaBai(meta), - ] - - } else { - - data = [ - meta_sage, - Utils.getTumorDnaBam(meta), - [], - Utils.getTumorDnaBai(meta), - [], - ] - } - return data - + return [meta_sage, tumor_bam, normal_bam, tumor_bai, normal_bai] } // Run process diff --git a/subworkflows/local/sigs_fitting/main.nf b/subworkflows/local/sigs_fitting/main.nf index 73cebff5..41d2acb4 100644 --- a/subworkflows/local/sigs_fitting/main.nf +++ b/subworkflows/local/sigs_fitting/main.nf @@ -34,7 +34,7 @@ workflow SIGS_FITTING { ch_inputs_sorted = ch_inputs_selected .branch { meta, purple_dir -> - def has_dna = Utils.hasTumorDnaBam(meta) + def has_dna = Utils.hasTumorDna(meta) def tumor_id def has_smlv_vcf diff --git a/subworkflows/local/virusbreakend_calling/main.nf b/subworkflows/local/virusbreakend_calling/main.nf index 0e328e06..198af3e4 100644 --- a/subworkflows/local/virusbreakend_calling/main.nf +++ b/subworkflows/local/virusbreakend_calling/main.nf @@ -12,6 +12,7 @@ workflow VIRUSBREAKEND_CALLING { take: // Sample data ch_inputs // channel: [mandatory] [ meta ] + ch_tumor_bam // channel: [mandatory] [ meta, bam, bai ] ch_purple // channel: [mandatory] [ meta, purple_dir ] ch_bamtools_somatic // channel: [mandatory] [ meta, metrics ] @@ -36,14 +37,20 @@ workflow VIRUSBREAKEND_CALLING { // Sort inputs // NOTE(SW): VIRUSBreakend inputs are not allowed in the samplesheet, so aren't considered - // channel: [ meta ] - ch_inputs_sorted = ch_inputs - .branch { meta -> - + // channel: [ meta, tumor_bam, tumor_bai ] + ch_inputs_sorted = ch_tumor_bam + .map { meta, tumor_bam, tumor_bai -> + return [ + meta, + Utils.selectCurrentOrExisting(tumor_bam, meta, Constants.INPUT.BAM_MARKDUPS_DNA_TUMOR), + Utils.selectCurrentOrExisting(tumor_bai, meta, Constants.INPUT.BAI_DNA_TUMOR), + ] + } + .branch { meta, tumor_bam, tumor_bai -> def has_existing = Utils.hasExistingInput(meta, Constants.INPUT.VIRUSINTERPRETER_DIR) - - runnable: Utils.hasTumorDnaBam(meta) && !has_existing + runnable: tumor_bam && !has_existing skip: true + return meta } // @@ -52,7 +59,7 @@ workflow VIRUSBREAKEND_CALLING { // Create process input channel // channel: [ meta_virus, tumor_bam ] ch_virusbreakend_inputs = ch_inputs_sorted.runnable - .map { meta -> + .map { meta, tumor_bam, tumor_bai -> def meta_virus = [ key: meta.group_id, @@ -60,7 +67,7 @@ workflow VIRUSBREAKEND_CALLING { sample_id: Utils.getTumorDnaSampleName(meta), ] - return [meta_virus, Utils.getTumorDnaBam(meta)] + return [meta_virus, tumor_bam] } // Run process diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 8b821d98..fc72b747 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -71,6 +71,9 @@ include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting' include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation' include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference' include { PURPLE_CALLING } from 
diff --git a/workflows/targeted.nf b/workflows/targeted.nf
index 8b821d98..fc72b747 100644
--- a/workflows/targeted.nf
+++ b/workflows/targeted.nf
@@ -71,6 +71,9 @@
 include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting'
 include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation'
 include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference'
 include { PURPLE_CALLING } from '../subworkflows/local/purple_calling'
+include { READ_ALIGNMENT_DNA } from '../subworkflows/local/read_alignment_dna'
+include { READ_ALIGNMENT_RNA } from '../subworkflows/local/read_alignment_rna'
+include { READ_PROCESSING } from '../subworkflows/local/read_processing'
 include { SAGE_APPEND } from '../subworkflows/local/sage_append'
 include { SAGE_CALLING } from '../subworkflows/local/sage_calling'
@@ -84,7 +87,6 @@
 samplesheet = Utils.getFileObject(params.input)
 
 workflow TARGETED {
-
     // Create channel for versions
     // channel: [ versions.yml ]
     ch_versions = Channel.empty()
@@ -104,6 +106,81 @@ workflow TARGETED {
     // Set GRIDSS config
     gridss_config = params.gridss_config !== null ? file(params.gridss_config) : hmf_data.gridss_config
 
+    //
+    // SUBWORKFLOW: Run read alignment to generate BAMs
+    //
+    // channel: [ meta, [bam, ...], [bai, ...] ]
+    ch_align_dna_tumor_out = Channel.empty()
+    ch_align_dna_normal_out = Channel.empty()
+    ch_align_rna_tumor_out = Channel.empty()
+    if (run_config.stages.alignment) {
+
+        READ_ALIGNMENT_DNA(
+            ch_inputs,
+            ref_data.genome_fasta,
+            ref_data.genome_bwa_index,
+            ref_data.genome_bwa_index_bseq,
+            ref_data.genome_bwa_index_biidx,
+            params.max_fastq_records,
+        )
+
+        READ_ALIGNMENT_RNA(
+            ch_inputs,
+            ref_data.genome_star_index,
+        )
+
+        ch_versions = ch_versions.mix(
+            READ_ALIGNMENT_DNA.out.versions,
+            READ_ALIGNMENT_RNA.out.versions,
+        )
+
+        ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT_DNA.out.dna_tumor)
+        ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT_DNA.out.dna_normal)
+        ch_align_rna_tumor_out = ch_align_rna_tumor_out.mix(READ_ALIGNMENT_RNA.out.rna_tumor)
+
+    } else {
+
+        ch_align_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] }
+        ch_align_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] }
+        ch_align_rna_tumor_out = ch_inputs.map { meta -> [meta, [], []] }
+
+    }
+
+    //
+    // SUBWORKFLOW: Run MarkDups for DNA BAMs
+    //
+    // channel: [ meta, bam, bai ]
+    ch_process_dna_tumor_out = Channel.empty()
+    ch_process_dna_normal_out = Channel.empty()
+    if (run_config.stages.markdups) {
+
+        // NOTE(SW/MC): hardcoded for initial testing purposes
+        has_umis = run_config.panel.equalsIgnoreCase('tso500')
+
+        READ_PROCESSING(
+            ch_inputs,
+            ch_align_dna_tumor_out,
+            ch_align_dna_normal_out,
+            ref_data.genome_fasta,
+            ref_data.genome_version,
+            ref_data.genome_fai,
+            ref_data.genome_dict,
+            hmf_data.unmap_regions,
+            has_umis,
+        )
+
+        ch_versions = ch_versions.mix(READ_PROCESSING.out.versions)
+
+        ch_process_dna_tumor_out = ch_process_dna_tumor_out.mix(READ_PROCESSING.out.dna_tumor)
+        ch_process_dna_normal_out = ch_process_dna_normal_out.mix(READ_PROCESSING.out.dna_normal)
+
+    } else {
+
+        ch_process_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] }
+        ch_process_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] }
+
+    }
+
     //
     // MODULE: Run Isofox to analyse RNA data
     //
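When run_config.stages.markdups is disabled, each output channel is replaced with one placeholder tuple per input so that every downstream subworkflow can consume ch_process_dna_* unconditionally: the declared [ meta, bam, bai ] shape is preserved, with empty lists (falsy in Groovy) marking absent files. Arity matters here because consumers such as SAGE_CALLING destructure a fixed number of elements after grouping tumor and normal channels, so the placeholder must carry both an empty bam and an empty bai slot. A small self-contained illustration (names illustrative only):

    workflow {
        ch_inputs = Channel.of([group_id: 'group_one'], [group_id: 'group_two'])

        // Stage skipped: emit [ meta, bam, bai ] with empty placeholders
        ch_bam = ch_inputs.map { meta -> [meta, [], []] }

        // Downstream closures can still name all three elements per item
        ch_bam
            .map { meta, bam, bai -> "${meta.group_id}: bam=${bam ?: 'none'} bai=${bai ?: 'none'}" }
            .view()
    }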
@@ -120,6 +197,7 @@
 
         ISOFOX_QUANTIFICATION(
             ch_inputs,
+            ch_align_rna_tumor_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
             ref_data.genome_fai,
@@ -151,6 +229,8 @@ workflow TARGETED {
 
         AMBER_PROFILING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_version,
             hmf_data.heterozygous_sites,
             panel_data.target_region_bed,
@@ -174,6 +254,8 @@ workflow TARGETED {
 
         COBALT_PROFILING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             hmf_data.gc_profile,
             hmf_data.diploid_bed,
             panel_data.target_region_normalisation,
@@ -198,6 +280,8 @@ workflow TARGETED {
 
         GRIDSS_SVPREP_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
             ref_data.genome_fai,
@@ -270,6 +354,8 @@ workflow TARGETED {
 
         SAGE_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
             ref_data.genome_fai,
@@ -391,6 +477,7 @@ workflow TARGETED {
 
         SAGE_APPEND(
             ch_inputs,
+            ch_align_rna_tumor_out,
             ch_purple_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
@@ -474,6 +561,8 @@ workflow TARGETED {
 
         FLAGSTAT_METRICS(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
         )
 
         ch_versions = ch_versions.mix(FLAGSTAT_METRICS.out.versions)
@@ -498,6 +587,8 @@ workflow TARGETED {
 
         BAMTOOLS_METRICS(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
         )
@@ -526,6 +617,9 @@ workflow TARGETED {
 
         LILAC_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
+            ch_align_rna_tumor_out,
             ch_purple_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
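Between the two workflows, the READ_PROCESSING wiring differs only in the trailing has_umis argument: the targeted workflow above derives it from the panel name (hardcoded to the TSO500 case for initial testing, per the NOTE(SW/MC)), while wgts.nf below passes a literal false. Should more UMI-bearing panels be added later, one plausible generalisation, purely hypothetical and not part of this patch, would be a lookup rather than a single string comparison:

    // Hypothetical helper; the patch itself only special-cases 'tso500'.
    static List PANELS_WITH_UMIS = ['tso500']

    static boolean panelHasUmis(String panel) {
        // Null-safe for WGTS runs where no panel is set; case-insensitive match
        return panel != null && PANELS_WITH_UMIS.any { it.equalsIgnoreCase(panel) }
    }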
diff --git a/workflows/wgts.nf b/workflows/wgts.nf
index eaedf1e7..832988d9 100644
--- a/workflows/wgts.nf
+++ b/workflows/wgts.nf
@@ -75,6 +75,9 @@
 include { ORANGE_REPORTING } from '../subworkflows/local/orange_reporting'
 include { PAVE_ANNOTATION } from '../subworkflows/local/pave_annotation'
 include { PREPARE_REFERENCE } from '../subworkflows/local/prepare_reference'
 include { PURPLE_CALLING } from '../subworkflows/local/purple_calling'
+include { READ_ALIGNMENT_DNA } from '../subworkflows/local/read_alignment_dna'
+include { READ_ALIGNMENT_RNA } from '../subworkflows/local/read_alignment_rna'
+include { READ_PROCESSING } from '../subworkflows/local/read_processing'
 include { SAGE_APPEND } from '../subworkflows/local/sage_append'
 include { SAGE_CALLING } from '../subworkflows/local/sage_calling'
 include { SIGS_FITTING } from '../subworkflows/local/sigs_fitting'
@@ -108,6 +111,78 @@ workflow WGTS {
     // Set GRIDSS config
     gridss_config = params.gridss_config !== null ? file(params.gridss_config) : hmf_data.gridss_config
 
+    //
+    // SUBWORKFLOW: Run read alignment to generate BAMs
+    //
+    // channel: [ meta, [bam, ...], [bai, ...] ]
+    ch_align_dna_tumor_out = Channel.empty()
+    ch_align_dna_normal_out = Channel.empty()
+    ch_align_rna_tumor_out = Channel.empty()
+    if (run_config.stages.alignment) {
+
+        READ_ALIGNMENT_DNA(
+            ch_inputs,
+            ref_data.genome_fasta,
+            ref_data.genome_bwa_index,
+            ref_data.genome_bwa_index_bseq,
+            ref_data.genome_bwa_index_biidx,
+            params.max_fastq_records,
+        )
+
+        READ_ALIGNMENT_RNA(
+            ch_inputs,
+            ref_data.genome_star_index,
+        )
+
+        ch_versions = ch_versions.mix(
+            READ_ALIGNMENT_DNA.out.versions,
+            READ_ALIGNMENT_RNA.out.versions,
+        )
+
+        ch_align_dna_tumor_out = ch_align_dna_tumor_out.mix(READ_ALIGNMENT_DNA.out.dna_tumor)
+        ch_align_dna_normal_out = ch_align_dna_normal_out.mix(READ_ALIGNMENT_DNA.out.dna_normal)
+        ch_align_rna_tumor_out = ch_align_rna_tumor_out.mix(READ_ALIGNMENT_RNA.out.rna_tumor)
+
+    } else {
+
+        ch_align_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] }
+        ch_align_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] }
+        ch_align_rna_tumor_out = ch_inputs.map { meta -> [meta, [], []] }
+
+    }
+
+    //
+    // SUBWORKFLOW: Run MarkDups for DNA BAMs
+    //
+    // channel: [ meta, bam, bai ]
+    ch_process_dna_tumor_out = Channel.empty()
+    ch_process_dna_normal_out = Channel.empty()
+    if (run_config.stages.markdups) {
+
+        READ_PROCESSING(
+            ch_inputs,
+            ch_align_dna_tumor_out,
+            ch_align_dna_normal_out,
+            ref_data.genome_fasta,
+            ref_data.genome_version,
+            ref_data.genome_fai,
+            ref_data.genome_dict,
+            hmf_data.unmap_regions,
+            false, // has_umis
+        )
+
+        ch_versions = ch_versions.mix(READ_PROCESSING.out.versions)
+
+        ch_process_dna_tumor_out = ch_process_dna_tumor_out.mix(READ_PROCESSING.out.dna_tumor)
+        ch_process_dna_normal_out = ch_process_dna_normal_out.mix(READ_PROCESSING.out.dna_normal)
+
+    } else {
+
+        ch_process_dna_tumor_out = ch_inputs.map { meta -> [meta, [], []] }
+        ch_process_dna_normal_out = ch_inputs.map { meta -> [meta, [], []] }
+
+    }
+
     //
     // MODULE: Run Isofox to analyse RNA data
     //
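The ch_versions bookkeeping above follows the same accumulator pattern as every other stage: start from Channel.empty() and mix in each subworkflow's versions channel, producing a single stream that is collected once for reporting. A minimal sketch of the pattern (tool names illustrative):

    workflow {
        ch_versions = Channel.empty()

        // Each stage contributes its own versions channel
        ch_stage_a = Channel.of('tool_a: 1.0')
        ch_stage_b = Channel.of('tool_b: 2.1')

        // mix accepts several source channels in one call
        ch_versions = ch_versions.mix(ch_stage_a, ch_stage_b)

        ch_versions.collect().view()   // order of mixed items is not guaranteed
    }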
@@ -121,6 +196,7 @@ workflow WGTS {
 
         ISOFOX_QUANTIFICATION(
             ch_inputs,
+            ch_align_rna_tumor_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
             ref_data.genome_fai,
@@ -152,6 +228,8 @@ workflow WGTS {
 
         AMBER_PROFILING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_version,
             hmf_data.heterozygous_sites,
             [], // target_region_bed
@@ -176,6 +254,8 @@ workflow WGTS {
 
         COBALT_PROFILING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             hmf_data.gc_profile,
             hmf_data.diploid_bed,
             [], // panel_target_region_normalisation
@@ -200,6 +280,8 @@ workflow WGTS {
 
         GRIDSS_SVPREP_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
             ref_data.genome_fai,
@@ -272,6 +354,8 @@ workflow WGTS {
 
         SAGE_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
             ref_data.genome_fai,
@@ -393,6 +477,7 @@ workflow WGTS {
 
         SAGE_APPEND(
             ch_inputs,
+            ch_align_rna_tumor_out,
             ch_purple_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
@@ -475,6 +560,8 @@ workflow WGTS {
 
         FLAGSTAT_METRICS(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
         )
 
         ch_versions = ch_versions.mix(FLAGSTAT_METRICS.out.versions)
@@ -499,6 +586,8 @@ workflow WGTS {
 
         BAMTOOLS_METRICS(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
         )
@@ -573,6 +662,9 @@ workflow WGTS {
 
         LILAC_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
+            ch_process_dna_normal_out,
+            ch_align_rna_tumor_out,
             ch_purple_out,
             ref_data.genome_fasta,
             ref_data.genome_version,
@@ -600,6 +692,7 @@ workflow WGTS {
 
         VIRUSBREAKEND_CALLING(
             ch_inputs,
+            ch_process_dna_tumor_out,
             ch_purple_out,
             ch_bamtools_somatic_out,
             ref_data.genome_fasta,