Skip to content

Commit

Permalink
Expand UMI handling
Browse files Browse the repository at this point in the history
  • Loading branch information
scwatts committed Oct 13, 2024
1 parent d1e5f11 commit 0d4426a
Show file tree
Hide file tree
Showing 12 changed files with 155 additions and 51 deletions.
4 changes: 4 additions & 0 deletions .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ lint:
- lib/Utils.groovy
- lib/WorkflowMain.groovy
- lib/WorkflowOncoanalyser.groovy
nextflow_config:
- config_defaults:
- params.fastp_umi_length
- params.fastp_umi_skip
bump_version: null
org_path: null
update: null
68 changes: 60 additions & 8 deletions lib/WorkflowMain.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class WorkflowMain {
}
}

if (!params.containsKey('ref_hmf_data_path')) {
if (!params.containsKey('ref_data_hmf_data_path')) {
if (params.genome_version.toString() == '37') {
params.ref_data_hmf_data_path = Constants.HMF_DATA_37_PATH
} else if (params.genome_version.toString() == '38') {
Expand Down Expand Up @@ -65,14 +65,30 @@ class WorkflowMain {
if (run_mode === Constants.RunMode.TARGETED) {

// Attempt to set default panel data path; make no assumption on valid 'panel' value

if (params.containsKey('panel')) {
if (params.panel == 'tso500' && params.genome_version.toString() == '37') {
params.ref_data_panel_data_path = Constants.TSO500_PANEL_37_PATH
} else if (params.panel == 'tso500' && params.genome_version.toString() == '38') {
params.ref_data_panel_data_path = Constants.TSO500_PANEL_38_PATH
}
}

// When fastp UMI is enabled, MarkDups UMI should be as well
if (params.fastp_umi && (!params.containsKey('markdups_umi') || !params.markdups_umi)) {
params.markdups_umi = true
}

// Set the MarkDups UMI duplex delimiter to '_' when the following conditions are met:
// - both fastp and MarkDups UMI processing enabled
// - fastp is using a duplex UMI location type (per_index or per_read)
// - no MarkDups duplex delimiter has been set
def fastp_and_markdups_umi = params.fastp_umi && params.markdups_umi
def fastp_duplex_location = params.containsKey('fastp_umi_location') && (params.fastp_umi_location == 'per_index' || params.fastp_umi_location == 'per_read')
def no_umi_duplex_delim = !params.containsKey('markdups_umi_duplex_delim') || !params.markdups_umi_duplex_delim
if (fastp_and_markdups_umi && fastp_duplex_location && no_umi_duplex_delim) {
params.markdups_umi_duplex_delim = '_'
}

}

def stages = Processes.getRunStages(
Expand All @@ -93,12 +109,18 @@ class WorkflowMain {
}

// Final point to set any default to avoid access to undefined parameters during nf-validation
if (!params.containsKey('panel')) { params.panel = null }
if (!params.containsKey('ref_data_genome_alt')) { params.ref_data_genome_alt = null }
if (!params.containsKey('ref_data_genome_gtf')) { params.ref_data_genome_gtf = null }
if (!params.containsKey('ref_data_hla_slice_bed')) { params.ref_data_hla_slice_bed = null }
if (!params.containsKey('ref_data_panel_data_path')) { params.ref_data_panel_data_path = null }
if (!params.containsKey('ref_data_virusbreakenddb_path')) { params.ref_data_virusbreakenddb_path = null }
if (!params.containsKey('panel')) params.panel = null
if (!params.containsKey('ref_data_genome_alt')) params.ref_data_genome_alt = null
if (!params.containsKey('ref_data_genome_gtf')) params.ref_data_genome_gtf = null
if (!params.containsKey('ref_data_hla_slice_bed')) params.ref_data_hla_slice_bed = null
if (!params.containsKey('ref_data_panel_data_path')) params.ref_data_panel_data_path = null
if (!params.containsKey('ref_data_virusbreakenddb_path')) params.ref_data_virusbreakenddb_path = null

// Additionally set selected parameters to non-null 'unset' sentinel values ('', 0, -1) to avoid passing null values as inputs
if (!params.containsKey('fastp_umi_location')) params.fastp_umi_location = ''
if (!params.containsKey('fastp_umi_length')) params.fastp_umi_length = 0
if (!params.containsKey('fastp_umi_skip')) params.fastp_umi_skip = -1
if (!params.containsKey('markdups_umi_duplex_delim')) params.markdups_umi_duplex_delim = ''

}

Expand Down Expand Up @@ -227,6 +249,36 @@ class WorkflowMain {

}

// UMI parameters

def fastp_umi_args_set_any = params.fastp_umi_location || params.fastp_umi_length || params.fastp_umi_skip >= 0
if (fastp_umi_args_set_any && !params.fastp_umi) {
log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
" Detected use of fastp UMI parameters but fastp UMI processing has not been enabled.\n" +
" Please review your configuration and set the fastp_umi flag or otherwise adjust\n" +
" accordingly.\n" +
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Nextflow.exit(1)
}

def fastp_umi_args_set_all = params.fastp_umi_location && params.fastp_umi_length && params.fastp_umi_skip >= 0
if (params.fastp_umi && !fastp_umi_args_set_all) {
log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
" Refusing to run fastp UMI processing without having any UMI params configured.\n" +
" Please review your configuration and appropriately set all fastp_umi_* parameters.\n" +
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Nextflow.exit(1)
}

if (params.markdups_umi_duplex_delim && params.markdups_umi === false) {
log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
" Detected use of MarkDups UMI parameters but MarkDups UMI processing has not been\n" +
" enabled. Please review your configuration and set the markdups_umi flag or\n" +
" otherwise adjust accordingly.\n" +
"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Nextflow.exit(1)
}

}

public static getRunConfig(params, inputs, log) {
Expand Down
13 changes: 10 additions & 3 deletions modules/local/fastp/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ process FASTP {
input:
tuple val(meta), path(reads_fwd), path(reads_rev)
val max_fastq_records
val umi_location
val umi_length
val umi_skip

output:
tuple val(meta), path('*_R1.fastp.fastq.gz'), path('*_R2.fastp.fastq.gz'), emit: fastq
Expand All @@ -22,8 +24,13 @@ process FASTP {
script:
def args = task.ext.args ?: ''

def umi_extraction = umi_length > 0 ? '--umi --umi_loc per_read --umi_len ' + umi_length : ''
def split_by_lines_arg = max_fastq_records ? "--split_by_lines ${4 * max_fastq_records}" : ''
def split_by_lines_arg = max_fastq_records > 0 ? "--split_by_lines ${4 * max_fastq_records}" : ''

def umi_args_list = []
if (umi_location) umi_args_list.add("--umi_loc ${umi_location}")
if (umi_length) umi_args_list.add("--umi_len ${umi_length}")
if (umi_skip >= 0) umi_args_list.add("--umi_skip ${umi_skip}")
def umi_args = umi_args_list ? '--umi ' + umi_args_list.join(' ') : ''

"""
fastp \\
Expand All @@ -34,7 +41,7 @@ process FASTP {
--disable_length_filtering \\
--disable_adapter_trimming \\
--disable_trim_poly_g \\
${umi_extraction} \\
${umi_args} \\
${split_by_lines_arg} \\
--out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\
--out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz
Expand Down
10 changes: 8 additions & 2 deletions modules/local/fastp/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,16 @@ input:
pattern: "*.{fastq.gz}"
- max_fastq_records:
type: integer
description: Maximum number of reads per file
description: Maximum number of reads per file (optional)
- umi_location:
type: string
description: UMI location type (optional)
- umi_length:
type: integer
description: UMI length for UMI extraction
description: UMI length (optional)
- umi_skip:
type: integer
description: UMI base skip (optional)
output:
- meta:
type: map
Expand Down
22 changes: 10 additions & 12 deletions modules/local/markdups/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ process MARKDUPS {
path genome_fai
path genome_dict
path unmap_regions
val has_umis
val umi_enable
val umi_duplex_delim

output:
Expand All @@ -30,15 +30,12 @@ process MARKDUPS {

def xmx_mod = task.ext.xmx_mod ?: 0.95

def umi_flags
if (has_umis) {
umi_flags = '-umi_enabled'
if (umi_duplex_delim) {
umi_flags = "${umi_flags} -umi_duplex -umi_duplex_delim ${umi_duplex_delim}"
}
} else {
umi_flags = '-form_consensus'
}
def form_consensus_arg = umi_enable ? '' : '-form_consensus'

def umi_args_list = []
if (umi_enable) umi_args_list.add('-umi_enabled')
if (umi_duplex_delim) umi_args_list.add("-umi_duplex -umi_duplex_delim ${umi_duplex_delim}")
def umi_args = umi_args_list ? umi_args_list.join(' ') : ''

"""
markdups \\
Expand All @@ -51,7 +48,8 @@ process MARKDUPS {
-sample ${meta.sample_id} \\
-input_bam ${bams.join(',')} \\
\\
${umi_flags} \\
${form_consensus_arg} \\
${umi_args} \\
\\
-unmap_regions ${unmap_regions} \\
-ref_genome ${genome_fasta} \\
Expand All @@ -76,7 +74,7 @@ process MARKDUPS {
touch ${meta.sample_id}.markdups.bam.bai
touch ${meta.sample_id}.duplicate_freq.tsv
if [[ -n "${has_umis}" ]]; then
if [[ -n "${umi_enable}" ]]; then
touch ${meta.sample_id}.umi_coord_freq.tsv
touch ${meta.sample_id}.umi_edit_distance.tsv
touch ${meta.sample_id}.umi_nucleotide_freq.tsv
Expand Down
4 changes: 2 additions & 2 deletions modules/local/markdups/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ input:
type: file
description: Unmapped regions file
pattern: "*.{tsv}"
- has_umis:
- umi_enable:
type: boolean
description: Flag indicating presence of UMIs in reads
description: Flag to enable UMI processing
- umi_duplex_delim:
type: string
description: UMI duplex delimiter
Expand Down
11 changes: 7 additions & 4 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ params {

// Read processing and alignment options
max_fastq_records = 10000000
umi_length = 0
umi_duplex_delim = '+'
fastp_umi = false
markdups_umi = false

// Process configuration
processes_manual = false
Expand All @@ -31,7 +31,6 @@ params {

// Reference genome information; iGenomes is effectively disabled but retained for linting
genome = null
force_genome = false
igenomes_base = 's3://ngi-igenomes/igenomes/'
igenomes_ignore = true
hmf_genomes_base = 'https://pub-cf6ba01919994c3cbd354659947f74d8.r2.dev/genomes'
Expand Down Expand Up @@ -273,7 +272,7 @@ plugins {
validation {
// NOTE(SW): entries here generally have conditional defaults or are for internal use only
defaultIgnoreParams = [
"genomes",
'genomes',
'hmf_data_paths',
'panel_data_paths',
'ref_data_genome_fasta',
Expand All @@ -294,6 +293,10 @@ validation {
'ref_data_hmf_data_path',
'ref_data_panel_data_path',
'ref_data_virusbreakenddb_path',
'fastp_umi_length',
'fastp_umi_location',
'fastp_umi_skip',
'markdups_umi_duplex_delim',
]

lenientMode = true
Expand Down
31 changes: 28 additions & 3 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,39 @@
"max_fastq_records": {
"type": "integer",
"description": "When positive, will use fastp to split fastq files so that each resultant fastq file has no more than max_fastq_records records. When nonpositive, fastq files are not split; they are passed as-is to the aligner unless fastp UMI processing is enabled.",
"default": 10000000,
"fa_icon": "fas fa-cog"
},
"umi_length": {
"fastp_umi": {
"type": "boolean",
"description": "Enable fastp UMI processing.",
"default": false,
"fa_icon": "fas fa-cog"
},
"fastp_umi_location": {
"type": "string",
"description": "fastp UMI location parameter (--umi_loc).",
"fa_icon": "fas fa-cog"
},
"fastp_umi_length": {
"type": "integer",
"description": "When positive, will use fastp to extract UMIs of this length from fastq files.",
"description": "fastp UMI length parameter (--umi_len)",
"default": 0,
"fa_icon": "fas fa-cog"
},
"fastp_umi_skip": {
"type": "integer",
"description": "fastp UMI skip parameter (--umi_skip)",
"default": -1,
"fa_icon": "fas fa-cog"
},
"markdups_umi": {
"type": "boolean",
"description": "Enable MarkDups UMI processing.",
"default": false,
"fa_icon": "fas fa-cog"
},
"umi_duplex_delim": {
"markdups_umi_duplex_delim": {
"type": "string",
"description": "UMI duplex delimiter as used by MarkDups.",
"fa_icon": "fas fa-cog"
Expand Down
19 changes: 12 additions & 7 deletions subworkflows/local/read_alignment_dna/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@ include { FASTP } from '../../../modules/local/fastp/main'
workflow READ_ALIGNMENT_DNA {
take:
// Sample data
ch_inputs // channel: [mandatory] [ meta ]
ch_inputs // channel: [mandatory] [ meta ]

// Reference data
genome_fasta // channel: [mandatory] /path/to/genome_fasta
genome_bwamem2_index // channel: [mandatory] /path/to/genome_bwa-mem2_index_dir/
genome_fasta // channel: [mandatory] /path/to/genome_fasta
genome_bwamem2_index // channel: [mandatory] /path/to/genome_bwa-mem2_index_dir/

// Params
max_fastq_records // numeric: [mandatory] max number of FASTQ records per split
umi_length // numeric: [optional] UMI length for extraction from fastq
max_fastq_records // numeric: [optional] max number of FASTQ records per split
umi_enable // boolean: [mandatory] enable UMI processing
umi_location // string: [optional] fastp UMI location argument (--umi_loc)
umi_length // numeric: [optional] fastp UMI length argument (--umi_len)
umi_skip // numeric: [optional] fastp UMI skip argument (--umi_skip)

main:
// Channel for version.yml files
Expand Down Expand Up @@ -74,13 +77,15 @@ workflow READ_ALIGNMENT_DNA {
// Split FASTQ into chunks if requested for distributed processing
// channel: [ meta_fastq_ready, fastq_fwd, fastq_rev ]
ch_fastqs_ready = Channel.empty()
if (max_fastq_records > 0 || umi_length > 0) {
if (max_fastq_records > 0 || umi_enable) {

// Run process
FASTP(
ch_fastq_inputs,
max_fastq_records,
umi_location,
umi_length,
umi_skip,
)

ch_versions = ch_versions.mix(FASTP.out.versions)
Expand Down Expand Up @@ -118,7 +123,7 @@ workflow READ_ALIGNMENT_DNA {
} else {

// Select appropriate source
ch_fastq_source = umi_length > 0 ? FASTP.out.fastq : ch_fastq_inputs
ch_fastq_source = umi_enable ? FASTP.out.fastq : ch_fastq_inputs

ch_fastqs_ready = ch_fastq_source
.map { meta_fastq, fastq_fwd, fastq_rev ->
Expand Down
4 changes: 2 additions & 2 deletions subworkflows/local/read_processing/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ workflow READ_PROCESSING {
unmap_regions // channel: [mandatory] /path/to/unmap_regions

// Params
has_umis // boolean: [mandatory] UMI processing flag
umi_enable // boolean: [mandatory] enable UMI processing
umi_duplex_delim // string: [optional] UMI duplex delimiter

main:
Expand Down Expand Up @@ -90,7 +90,7 @@ workflow READ_PROCESSING {
genome_fai,
genome_dict,
unmap_regions,
has_umis,
umi_enable,
umi_duplex_delim,
)

Expand Down
Loading

0 comments on commit 0d4426a

Please sign in to comment.