From 0d4426a46fbdea099668d5b025e4f9d9241792e0 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Mon, 14 Oct 2024 10:48:10 +1100 Subject: [PATCH] Expand UMI handling --- .nf-core.yml | 4 ++ lib/WorkflowMain.groovy | 68 ++++++++++++++++--- modules/local/fastp/main.nf | 13 +++- modules/local/fastp/meta.yml | 10 ++- modules/local/markdups/main.nf | 22 +++--- modules/local/markdups/meta.yml | 4 +- nextflow.config | 11 +-- nextflow_schema.json | 31 ++++++++- subworkflows/local/read_alignment_dna/main.nf | 19 ++++-- subworkflows/local/read_processing/main.nf | 4 +- workflows/targeted.nf | 11 +-- workflows/wgts.nf | 9 ++- 12 files changed, 155 insertions(+), 51 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index 7453b9e9..5aee5ec3 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -17,6 +17,10 @@ lint: - lib/Utils.groovy - lib/WorkflowMain.groovy - lib/WorkflowOncoanalyser.groovy + nextflow_config: + - config_defaults: + - params.fastp_umi_length + - params.fastp_umi_skip bump_version: null org_path: null update: null diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 5f144e67..4b32c452 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -37,7 +37,7 @@ class WorkflowMain { } } - if (!params.containsKey('ref_hmf_data_path')) { + if (!params.containsKey('ref_data_hmf_data_path')) { if (params.genome_version.toString() == '37') { params.ref_data_hmf_data_path = Constants.HMF_DATA_37_PATH } else if (params.genome_version.toString() == '38') { @@ -65,7 +65,6 @@ class WorkflowMain { if (run_mode === Constants.RunMode.TARGETED) { // Attempt to set default panel data path; make no assumption on valid 'panel' value - if (params.containsKey('panel')) { if (params.panel == 'tso500' && params.genome_version.toString() == '37') { params.ref_data_panel_data_path = Constants.TSO500_PANEL_37_PATH @@ -73,6 +72,23 @@ class WorkflowMain { params.ref_data_panel_data_path = Constants.TSO500_PANEL_38_PATH } } + + // When fastp UMI is enabled, MarkDups UMI should be as well + if (params.fastp_umi && (!params.containsKey('markdups_umi') || !params.markdups_umi)) { + params.markdups_umi = true + } + + // Set the MarkDups UMI duplex delimiter to '_' when the following conditions are met: + // - both fastp and MarkDups UMI processing enabled + // - fastp is using a duplex UMI location type (per_index or per_read) + // - no MarkDups duplex delimiter has been set + def fastp_and_markdups_umi = params.fastp_umi && params.markdups_umi + def fastp_duplex_location = params.containsKey('fastp_umi_location') && (params.fastp_umi_location == 'per_index' || params.fastp_umi_location == 'per_read') + def no_umi_duplex_delim = !params.containsKey('markdups_umi_duplex_delim') || !params.markdups_umi_duplex_delim + if (fastp_and_markdups_umi && fastp_duplex_location && no_umi_duplex_delim) { + params.markdups_umi_duplex_delim = '_' + } + } def stages = Processes.getRunStages( @@ -93,12 +109,18 @@ class WorkflowMain { } // Final point to set any default to avoid access to undefined parameters during nf-validation - if (!params.containsKey('panel')) { params.panel = null } - if (!params.containsKey('ref_data_genome_alt')) { params.ref_data_genome_alt = null } - if (!params.containsKey('ref_data_genome_gtf')) { params.ref_data_genome_gtf = null } - if (!params.containsKey('ref_data_hla_slice_bed')) { params.ref_data_hla_slice_bed = null } - if (!params.containsKey('ref_data_panel_data_path')) { params.ref_data_panel_data_path = null } - if (!params.containsKey('ref_data_virusbreakenddb_path')) { params.ref_data_virusbreakenddb_path = null } + if (!params.containsKey('panel')) params.panel = null + if (!params.containsKey('ref_data_genome_alt')) params.ref_data_genome_alt = null + if (!params.containsKey('ref_data_genome_gtf')) params.ref_data_genome_gtf = null + if (!params.containsKey('ref_data_hla_slice_bed')) params.ref_data_hla_slice_bed = null + if (!params.containsKey('ref_data_panel_data_path')) params.ref_data_panel_data_path = null + if (!params.containsKey('ref_data_virusbreakenddb_path')) params.ref_data_virusbreakenddb_path = null + + // Additionally set selected parameters with false-ish truthy values to avoid passing null values as inputs + if (!params.containsKey('fastp_umi_location')) params.fastp_umi_location = '' + if (!params.containsKey('fastp_umi_length')) params.fastp_umi_length = 0 + if (!params.containsKey('fastp_umi_skip')) params.fastp_umi_skip = -1 + if (!params.containsKey('markdups_umi_duplex_delim')) params.markdups_umi_duplex_delim = '' } @@ -227,6 +249,36 @@ class WorkflowMain { } + // UMI parameters + + def fastp_umi_args_set_any = params.fastp_umi_location || params.fastp_umi_length || params.fastp_umi_skip >= 0 + if (fastp_umi_args_set_any && !params.fastp_umi) { + log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Detected use of fastp UMI parameters but fastp UMI processing has not been enabled.\n" + + " Please review your configuration and set the fastp_umi flag or otherwise adjust\n" + + " accordingly.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.exit(1) + } + + def fastp_umi_args_set_all = params.fastp_umi_location && params.fastp_umi_length && params.fastp_umi_skip >= 0 + if (params.fastp_umi && !fastp_umi_args_set_all) { + log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Refusing to run fastp UMI processing without having any UMI params configured.\n" + + " Please review your configuration and appropriately set all fastp_umi_* parameters.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.exit(1) + } + + if (params.markdups_umi_duplex_delim && params.markdups_umi === false) { + log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + + " Detected use of MarkDups UMI parameters but MarkDups UMI processing has not been\n" + + " enabled. Please review your configuration and set the markdups_umi flag or\n" + + " otherwise adjust accordingly.\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.exit(1) + } + } public static getRunConfig(params, inputs, log) { diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf index 95a14791..8c950944 100644 --- a/modules/local/fastp/main.nf +++ b/modules/local/fastp/main.nf @@ -10,7 +10,9 @@ process FASTP { input: tuple val(meta), path(reads_fwd), path(reads_rev) val max_fastq_records + val umi_location val umi_length + val umi_skip output: tuple val(meta), path('*_R1.fastp.fastq.gz'), path('*_R2.fastp.fastq.gz'), emit: fastq @@ -22,8 +24,13 @@ process FASTP { script: def args = task.ext.args ?: '' - def umi_extraction = umi_length > 0 ? '--umi --umi_loc per_read --umi_len ' + umi_length : '' - def split_by_lines_arg = max_fastq_records ? "--split_by_lines ${4 * max_fastq_records}" : '' + def split_by_lines_arg = max_fastq_records > 0 ? "--split_by_lines ${4 * max_fastq_records}" : '' + + def umi_args_list = [] + if (umi_location) umi_args_list.add("--umi_loc ${umi_location}") + if (umi_length) umi_args_list.add("--umi_len ${umi_length}") + if (umi_skip >= 0) umi_args_list.add("--umi_skip ${umi_skip}") + def umi_args = umi_args_list ? '--umi ' + umi_args_list.join(' ') : '' """ fastp \\ @@ -34,7 +41,7 @@ process FASTP { --disable_length_filtering \\ --disable_adapter_trimming \\ --disable_trim_poly_g \\ - ${umi_extraction} \\ + ${umi_args} \\ ${split_by_lines_arg} \\ --out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\ --out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz diff --git a/modules/local/fastp/meta.yml b/modules/local/fastp/meta.yml index 5184ceb2..434d3cc4 100644 --- a/modules/local/fastp/meta.yml +++ b/modules/local/fastp/meta.yml @@ -27,10 +27,16 @@ input: pattern: "*.{fastq.gz}" - max_fastq_records: type: integer - description: Maximum number of reads per file + description: Maximum number of reads per file (optional) + - umi_location: + type: string + description: UMI location type (optional) - umi_length: type: integer - description: UMI length for UMI extraction + description: UMI length (optional) + - umi_skip: + type: integer + description: UMI base skip (optional) output: - meta: type: map diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf index 12f82843..2845b3d4 100644 --- a/modules/local/markdups/main.nf +++ b/modules/local/markdups/main.nf @@ -14,7 +14,7 @@ process MARKDUPS { path genome_fai path genome_dict path unmap_regions - val has_umis + val umi_enable val umi_duplex_delim output: @@ -30,15 +30,12 @@ process MARKDUPS { def xmx_mod = task.ext.xmx_mod ?: 0.95 - def umi_flags - if (has_umis) { - umi_flags = '-umi_enabled' - if (umi_duplex_delim) { - umi_flags = "${umi_flags} -umi_duplex -umi_duplex_delim ${umi_duplex_delim}" - } - } else { - umi_flags = '-form_consensus' - } + def form_consensus_arg = umi_enable ? '' : '-form_consensus' + + def umi_args_list = [] + if (umi_enable) umi_args_list.add('-umi_enabled') + if (umi_duplex_delim) umi_args_list.add("-umi_duplex -umi_duplex_delim ${umi_duplex_delim}") + def umi_args = umi_args_list ? umi_args_list.join(' ') : '' """ markdups \\ @@ -51,7 +48,8 @@ process MARKDUPS { -sample ${meta.sample_id} \\ -input_bam ${bams.join(',')} \\ \\ - ${umi_flags} \\ + ${form_consensus_arg} \\ + ${umi_args} \\ \\ -unmap_regions ${unmap_regions} \\ -ref_genome ${genome_fasta} \\ @@ -76,7 +74,7 @@ process MARKDUPS { touch ${meta.sample_id}.markdups.bam.bai touch ${meta.sample_id}.duplicate_freq.tsv - if [[ -n "${has_umis}" ]]; then + if [[ -n "${umi_enable}" ]]; then touch ${meta.sample_id}.umi_coord_freq.tsv touch ${meta.sample_id}.umi_edit_distance.tsv touch ${meta.sample_id}.umi_nucleotide_freq.tsv diff --git a/modules/local/markdups/meta.yml b/modules/local/markdups/meta.yml index be21f189..3d2fc144 100644 --- a/modules/local/markdups/meta.yml +++ b/modules/local/markdups/meta.yml @@ -40,9 +40,9 @@ input: type: file description: Unmapped regions file pattern: "*.{tsv}" - - has_umis: + - umi_enable: type: boolean - description: Flag indicating presence of UMIs in reads + description: Flag to enable UMI processing - umi_duplex_delim: type: string description: UMI duplex delimiter diff --git a/nextflow.config b/nextflow.config index e32eb2ca..fdb47d5c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,8 +21,8 @@ params { // Read processing and alignment options max_fastq_records = 10000000 - umi_length = 0 - umi_duplex_delim = '+' + fastp_umi = false + markdups_umi = false // Process configuration processes_manual = false @@ -31,7 +31,6 @@ params { // Reference genome information; iGenomes is effectively disabled but retained for linting genome = null - force_genome = false igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = true hmf_genomes_base = 'https://pub-cf6ba01919994c3cbd354659947f74d8.r2.dev/genomes' @@ -273,7 +272,7 @@ plugins { validation { // NOTE(SW): entries here are generally have conditional defaults or are for internal use only defaultIgnoreParams = [ - "genomes", + 'genomes', 'hmf_data_paths', 'panel_data_paths', 'ref_data_genome_fasta', @@ -294,6 +293,10 @@ validation { 'ref_data_hmf_data_path', 'ref_data_panel_data_path', 'ref_data_virusbreakenddb_path', + 'fastp_umi_length', + 'fastp_umi_location', + 'fastp_umi_skip', + 'markdups_umi_duplex_delim', ] lenientMode = true diff --git a/nextflow_schema.json b/nextflow_schema.json index 380cacb1..79126c59 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -97,14 +97,39 @@ "max_fastq_records": { "type": "integer", "description": "When positive, will use fastp to split fastq files so that each resultant fastq file has no more than max_fastq_records records. When nonpositive, fastp is not used and the provided fastq files are passed as-is to the aligner.", + "default": 10000000, "fa_icon": "fas fa-cog" }, - "umi_length": { + "fastp_umi": { + "type": "boolean", + "description": "Enable fastp UMI processing.", + "default": false, + "fa_icon": "fas fa-cog" + }, + "fastp_umi_location": { + "type": "string", + "description": "fastp UMI location parameter (--umi_loc).", + "fa_icon": "fas fa-cog" + }, + "fastp_umi_length": { "type": "integer", - "description": "When positive, will use fastp to extract UMIs of this length from fastq files.", + "description": "fastp UMI length parameter (--umi_len)", + "default": 0, + "fa_icon": "fas fa-cog" + }, + "fastp_umi_skip": { + "type": "integer", + "description": "fastp UMI skip parameter (--umi_skip)", + "default": -1, + "fa_icon": "fas fa-cog" + }, + "markdups_umi": { + "type": "boolean", + "description": "Enable MarkDups UMI processing.", + "default": false, "fa_icon": "fas fa-cog" }, - "umi_duplex_delim": { + "markdups_umi_duplex_delim": { "type": "string", "description": "UMI duplex delimiter as used by MarkDups.", "fa_icon": "fas fa-cog" diff --git a/subworkflows/local/read_alignment_dna/main.nf b/subworkflows/local/read_alignment_dna/main.nf index e564b29a..ac656df8 100644 --- a/subworkflows/local/read_alignment_dna/main.nf +++ b/subworkflows/local/read_alignment_dna/main.nf @@ -11,15 +11,18 @@ include { FASTP } from '../../../modules/local/fastp/main' workflow READ_ALIGNMENT_DNA { take: // Sample data - ch_inputs // channel: [mandatory] [ meta ] + ch_inputs // channel: [mandatory] [ meta ] // Reference data - genome_fasta // channel: [mandatory] /path/to/genome_fasta - genome_bwamem2_index // channel: [mandatory] /path/to/genome_bwa-mem2_index_dir/ + genome_fasta // channel: [mandatory] /path/to/genome_fasta + genome_bwamem2_index // channel: [mandatory] /path/to/genome_bwa-mem2_index_dir/ // Params - max_fastq_records // numeric: [mandatory] max number of FASTQ records per split - umi_length // numeric: [optional] UMI length for extraction from fastq + max_fastq_records // numeric: [optional] max number of FASTQ records per split + umi_enable // boolean: [mandatory] enable UMI processing + umi_location // string: [optional] fastp UMI location argument (--umi_loc) + umi_length // numeric: [optional] fastp UMI length argument (--umi_len) + umi_skip // numeric: [optional] fastp UMI skip argument (--umi_skip) main: // Channel for version.yml files @@ -74,13 +77,15 @@ workflow READ_ALIGNMENT_DNA { // Split FASTQ into chunks if requested for distributed processing // channel: [ meta_fastq_ready, fastq_fwd, fastq_fwd ] ch_fastqs_ready = Channel.empty() - if (max_fastq_records > 0 || umi_length > 0) { + if (max_fastq_records > 0 || umi_enable) { // Run process FASTP( ch_fastq_inputs, max_fastq_records, + umi_location, umi_length, + umi_skip, ) ch_versions = ch_versions.mix(FASTP.out.versions) @@ -118,7 +123,7 @@ workflow READ_ALIGNMENT_DNA { } else { // Select appropriate source - ch_fastq_source = umi_length > 0 ? FASTP.out.fastq : ch_fastq_inputs + ch_fastq_source = umi_enable ? FASTP.out.fastq : ch_fastq_inputs ch_fastqs_ready = ch_fastq_source .map { meta_fastq, fastq_fwd, fastq_rev -> diff --git a/subworkflows/local/read_processing/main.nf b/subworkflows/local/read_processing/main.nf index 6ee44a77..82e95282 100644 --- a/subworkflows/local/read_processing/main.nf +++ b/subworkflows/local/read_processing/main.nf @@ -22,7 +22,7 @@ workflow READ_PROCESSING { unmap_regions // channel: [mandatory] /path/to/unmap_regions // Params - has_umis // boolean: [mandatory] UMI processing flag + umi_enable // boolean: [mandatory] enable UMI processing umi_duplex_delim // string: [optional] UMI duplex delimiter main: @@ -90,7 +90,7 @@ workflow READ_PROCESSING { genome_fai, genome_dict, unmap_regions, - has_umis, + umi_enable, umi_duplex_delim, ) diff --git a/workflows/targeted.nf b/workflows/targeted.nf index 97ab2b04..29edfc8b 100644 --- a/workflows/targeted.nf +++ b/workflows/targeted.nf @@ -118,7 +118,10 @@ workflow TARGETED { ref_data.genome_fasta, ref_data.genome_bwamem2_index, params.max_fastq_records, - params.umi_length + params.fastp_umi, + params.fastp_umi_location, + params.fastp_umi_length, + params.fastp_umi_skip, ) READ_ALIGNMENT_RNA( @@ -151,8 +154,6 @@ workflow TARGETED { ch_process_dna_normal_out = Channel.empty() if (run_config.stages.markdups) { - has_umis = run_config.panel.equalsIgnoreCase('tso500') || params.umi_duplex_delim != '' || params.umi_length > 0 - READ_PROCESSING( ch_inputs, ch_align_dna_tumor_out, @@ -162,8 +163,8 @@ workflow TARGETED { ref_data.genome_fai, ref_data.genome_dict, hmf_data.unmap_regions, - has_umis, - params.umi_duplex_delim, + params.markdups_umi, + params.markdups_umi_duplex_delim, ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions) diff --git a/workflows/wgts.nf b/workflows/wgts.nf index fad9ddc6..48c99fd6 100644 --- a/workflows/wgts.nf +++ b/workflows/wgts.nf @@ -123,7 +123,10 @@ workflow WGTS { ref_data.genome_fasta, ref_data.genome_bwamem2_index, params.max_fastq_records, - 0, // disabled for now + false, // umi_enable + '', // umi_location + 0, // umi_length + -1, // umi_skip ) READ_ALIGNMENT_RNA( @@ -165,8 +168,8 @@ workflow WGTS { ref_data.genome_fai, ref_data.genome_dict, hmf_data.unmap_regions, - false, // has_umis - '', // no duplex UMI delimiter + false, // umi_enable + '', // umi_duplex_delim ) ch_versions = ch_versions.mix(READ_PROCESSING.out.versions)