From 0d4426a46fbdea099668d5b025e4f9d9241792e0 Mon Sep 17 00:00:00 2001
From: Stephen Watts <hello@stephen.ac>
Date: Mon, 14 Oct 2024 10:48:10 +1100
Subject: [PATCH] Expand UMI handling

---
 .nf-core.yml                                  |  4 ++
 lib/WorkflowMain.groovy                       | 68 ++++++++++++++++---
 modules/local/fastp/main.nf                   | 13 +++-
 modules/local/fastp/meta.yml                  | 10 ++-
 modules/local/markdups/main.nf                | 22 +++---
 modules/local/markdups/meta.yml               |  4 +-
 nextflow.config                               | 11 +--
 nextflow_schema.json                          | 31 ++++++++-
 subworkflows/local/read_alignment_dna/main.nf | 19 ++++--
 subworkflows/local/read_processing/main.nf    |  4 +-
 workflows/targeted.nf                         | 11 +--
 workflows/wgts.nf                             |  9 ++-
 12 files changed, 155 insertions(+), 51 deletions(-)

diff --git a/.nf-core.yml b/.nf-core.yml
index 7453b9e9..5aee5ec3 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -17,6 +17,10 @@ lint:
     - lib/Utils.groovy
     - lib/WorkflowMain.groovy
     - lib/WorkflowOncoanalyser.groovy
+  nextflow_config:
+    - config_defaults:
+      - params.fastp_umi_length
+      - params.fastp_umi_skip
 bump_version: null
 org_path: null
 update: null
diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy
index 5f144e67..4b32c452 100755
--- a/lib/WorkflowMain.groovy
+++ b/lib/WorkflowMain.groovy
@@ -37,7 +37,7 @@ class WorkflowMain {
             }
         }
 
-        if (!params.containsKey('ref_hmf_data_path')) {
+        if (!params.containsKey('ref_data_hmf_data_path')) {
             if (params.genome_version.toString() == '37') {
                 params.ref_data_hmf_data_path = Constants.HMF_DATA_37_PATH
             } else if (params.genome_version.toString() == '38') {
@@ -65,7 +65,6 @@ class WorkflowMain {
         if (run_mode === Constants.RunMode.TARGETED) {
 
             // Attempt to set default panel data path; make no assumption on valid 'panel' value
-
             if (params.containsKey('panel')) {
                 if (params.panel == 'tso500' && params.genome_version.toString() == '37') {
                     params.ref_data_panel_data_path = Constants.TSO500_PANEL_37_PATH
@@ -73,6 +72,23 @@ class WorkflowMain {
                     params.ref_data_panel_data_path = Constants.TSO500_PANEL_38_PATH
                 }
             }
+
+            // When fastp UMI is enabled, MarkDups UMI should be as well
+            if (params.fastp_umi && (!params.containsKey('markdups_umi') || !params.markdups_umi)) {
+                params.markdups_umi = true
+            }
+
+            // Set the MarkDups UMI duplex delimiter to '_' when the following conditions are met:
+            //   - both fastp and MarkDups UMI processing enabled
+            //   - fastp is using a duplex UMI location type (per_index or per_read)
+            //   - no MarkDups duplex delimiter has been set
+            def fastp_and_markdups_umi = params.fastp_umi && params.markdups_umi
+            def fastp_duplex_location = params.containsKey('fastp_umi_location') && (params.fastp_umi_location == 'per_index' || params.fastp_umi_location == 'per_read')
+            def no_umi_duplex_delim = !params.containsKey('markdups_umi_duplex_delim') || !params.markdups_umi_duplex_delim
+            if (fastp_and_markdups_umi && fastp_duplex_location && no_umi_duplex_delim) {
+                params.markdups_umi_duplex_delim = '_'
+            }
+
         }
 
         def stages = Processes.getRunStages(
@@ -93,12 +109,18 @@ class WorkflowMain {
         }
 
         // Final point to set any default to avoid access to undefined parameters during nf-validation
-        if (!params.containsKey('panel')) { params.panel = null }
-        if (!params.containsKey('ref_data_genome_alt')) { params.ref_data_genome_alt = null }
-        if (!params.containsKey('ref_data_genome_gtf')) { params.ref_data_genome_gtf = null }
-        if (!params.containsKey('ref_data_hla_slice_bed')) { params.ref_data_hla_slice_bed = null }
-        if (!params.containsKey('ref_data_panel_data_path')) { params.ref_data_panel_data_path = null }
-        if (!params.containsKey('ref_data_virusbreakenddb_path')) { params.ref_data_virusbreakenddb_path = null }
+        if (!params.containsKey('panel')) params.panel = null
+        if (!params.containsKey('ref_data_genome_alt')) params.ref_data_genome_alt = null
+        if (!params.containsKey('ref_data_genome_gtf')) params.ref_data_genome_gtf = null
+        if (!params.containsKey('ref_data_hla_slice_bed')) params.ref_data_hla_slice_bed = null
+        if (!params.containsKey('ref_data_panel_data_path')) params.ref_data_panel_data_path = null
+        if (!params.containsKey('ref_data_virusbreakenddb_path')) params.ref_data_virusbreakenddb_path = null
+
+        // Additionally default selected optional parameters to 'unset' sentinel values ('', 0, -1) to avoid passing null values as inputs
+        if (!params.containsKey('fastp_umi_location')) params.fastp_umi_location = ''
+        if (!params.containsKey('fastp_umi_length')) params.fastp_umi_length = 0
+        if (!params.containsKey('fastp_umi_skip')) params.fastp_umi_skip = -1
+        if (!params.containsKey('markdups_umi_duplex_delim')) params.markdups_umi_duplex_delim = ''
 
     }
 
@@ -227,6 +249,36 @@ class WorkflowMain {
 
         }
 
+        // UMI parameters
+
+        def fastp_umi_args_set_any = params.fastp_umi_location || params.fastp_umi_length || params.fastp_umi_skip >= 0
+        if (fastp_umi_args_set_any && !params.fastp_umi) {
+            log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+                "  Detected use of fastp UMI parameters but fastp UMI processing has not been enabled.\n" +
+                "  Please review your configuration and set the fastp_umi flag or otherwise adjust\n" +
+                "  accordingly.\n" +
+                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+            Nextflow.exit(1)
+        }
+
+        def fastp_umi_args_set_all = params.fastp_umi_location && params.fastp_umi_length && params.fastp_umi_skip >= 0
+        if (params.fastp_umi && !fastp_umi_args_set_all) {  // enabled but not fully configured
+            log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+                "  Refusing to run fastp UMI processing without having all UMI params configured.\n" +
+                "  Please review your configuration and appropriately set all fastp_umi_* parameters.\n" +
+                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+            Nextflow.exit(1)
+        }
+
+        if (params.markdups_umi_duplex_delim && params.markdups_umi === false) {
+            log.error "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" +
+                "  Detected use of MarkDups UMI parameters but MarkDups UMI processing has not been\n" +
+                "  enabled. Please review your configuration and set the markdups_umi flag or\n" +
+                "  otherwise adjust accordingly.\n" +
+                "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+            Nextflow.exit(1)
+        }
+
     }
 
     public static getRunConfig(params, inputs, log) {
diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf
index 95a14791..8c950944 100644
--- a/modules/local/fastp/main.nf
+++ b/modules/local/fastp/main.nf
@@ -10,7 +10,9 @@ process FASTP {
     input:
     tuple val(meta), path(reads_fwd), path(reads_rev)
     val max_fastq_records
+    val umi_location
     val umi_length
+    val umi_skip
 
     output:
     tuple val(meta), path('*_R1.fastp.fastq.gz'), path('*_R2.fastp.fastq.gz'), emit: fastq
@@ -22,8 +24,13 @@ process FASTP {
     script:
     def args = task.ext.args ?: ''
 
-    def umi_extraction = umi_length > 0 ? '--umi --umi_loc per_read --umi_len ' + umi_length : ''
-    def split_by_lines_arg = max_fastq_records ? "--split_by_lines ${4 * max_fastq_records}" : ''
+    def split_by_lines_arg = max_fastq_records > 0 ? "--split_by_lines ${4 * max_fastq_records}" : ''
+
+    def umi_args_list = []
+    if (umi_location) umi_args_list.add("--umi_loc ${umi_location}")
+    if (umi_length) umi_args_list.add("--umi_len ${umi_length}")
+    if (umi_skip >= 0) umi_args_list.add("--umi_skip ${umi_skip}")
+    def umi_args = umi_args_list ? '--umi ' + umi_args_list.join(' ') : ''
 
     """
     fastp \\
@@ -34,7 +41,7 @@ process FASTP {
         --disable_length_filtering \\
         --disable_adapter_trimming \\
         --disable_trim_poly_g \\
-        ${umi_extraction} \\
+        ${umi_args} \\
         ${split_by_lines_arg} \\
         --out1 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R1.fastp.fastq.gz \\
         --out2 ${meta.sample_id}_${meta.library_id}_${meta.lane}_R2.fastp.fastq.gz
diff --git a/modules/local/fastp/meta.yml b/modules/local/fastp/meta.yml
index 5184ceb2..434d3cc4 100644
--- a/modules/local/fastp/meta.yml
+++ b/modules/local/fastp/meta.yml
@@ -27,10 +27,16 @@ input:
       pattern: "*.{fastq.gz}"
   - max_fastq_records:
       type: integer
-      description: Maximum number of reads per file
+      description: Maximum number of reads per file (optional)
+  - umi_location:
+      type: string
+      description: UMI location type (optional)
   - umi_length:
       type: integer
-      description: UMI length for UMI extraction
+      description: UMI length (optional)
+  - umi_skip:
+      type: integer
+      description: UMI base skip (optional)
 output:
   - meta:
       type: map
diff --git a/modules/local/markdups/main.nf b/modules/local/markdups/main.nf
index 12f82843..2845b3d4 100644
--- a/modules/local/markdups/main.nf
+++ b/modules/local/markdups/main.nf
@@ -14,7 +14,7 @@ process MARKDUPS {
     path genome_fai
     path genome_dict
     path unmap_regions
-    val has_umis
+    val umi_enable
     val umi_duplex_delim
 
     output:
@@ -30,15 +30,12 @@ process MARKDUPS {
 
     def xmx_mod = task.ext.xmx_mod ?: 0.95
 
-    def umi_flags
-    if (has_umis) {
-        umi_flags = '-umi_enabled'
-        if (umi_duplex_delim) {
-            umi_flags = "${umi_flags} -umi_duplex -umi_duplex_delim ${umi_duplex_delim}"
-        }
-    } else {
-        umi_flags = '-form_consensus'
-    }
+    def form_consensus_arg = umi_enable ? '' : '-form_consensus'
+
+    def umi_args_list = []
+    if (umi_enable) umi_args_list.add('-umi_enabled')
+    if (umi_duplex_delim) umi_args_list.add("-umi_duplex -umi_duplex_delim ${umi_duplex_delim}")
+    def umi_args = umi_args_list ? umi_args_list.join(' ') : ''
 
     """
     markdups \\
@@ -51,7 +48,8 @@ process MARKDUPS {
         -sample ${meta.sample_id} \\
         -input_bam ${bams.join(',')} \\
         \\
-        ${umi_flags} \\
+        ${form_consensus_arg} \\
+        ${umi_args} \\
         \\
         -unmap_regions ${unmap_regions} \\
         -ref_genome ${genome_fasta} \\
@@ -76,7 +74,7 @@ process MARKDUPS {
     touch ${meta.sample_id}.markdups.bam.bai
     touch ${meta.sample_id}.duplicate_freq.tsv
 
-    if [[ -n "${has_umis}" ]]; then
+    if [[ "${umi_enable}" == "true" ]]; then
         touch ${meta.sample_id}.umi_coord_freq.tsv
         touch ${meta.sample_id}.umi_edit_distance.tsv
         touch ${meta.sample_id}.umi_nucleotide_freq.tsv
diff --git a/modules/local/markdups/meta.yml b/modules/local/markdups/meta.yml
index be21f189..3d2fc144 100644
--- a/modules/local/markdups/meta.yml
+++ b/modules/local/markdups/meta.yml
@@ -40,9 +40,9 @@ input:
       type: file
       description: Unmapped regions file
       pattern: "*.{tsv}"
-  - has_umis:
+  - umi_enable:
       type: boolean
-      description: Flag indicating presence of UMIs in reads
+      description: Flag to enable UMI processing
   - umi_duplex_delim:
       type: string
       description: UMI duplex delimiter
diff --git a/nextflow.config b/nextflow.config
index e32eb2ca..fdb47d5c 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -21,8 +21,8 @@ params {
 
     // Read processing and alignment options
     max_fastq_records = 10000000
-    umi_length        = 0
-    umi_duplex_delim  = '+'
+    fastp_umi         = false
+    markdups_umi      = false
 
     // Process configuration
     processes_manual  = false
@@ -31,7 +31,6 @@ params {
 
     // Reference genome information; iGenomes is effectively disabled but retained for linting
     genome           = null
-    force_genome     = false
     igenomes_base    = 's3://ngi-igenomes/igenomes/'
     igenomes_ignore  = true
     hmf_genomes_base = 'https://pub-cf6ba01919994c3cbd354659947f74d8.r2.dev/genomes'
@@ -273,7 +272,7 @@ plugins {
 validation {
     // NOTE(SW): entries here are generally have conditional defaults or are for internal use only
     defaultIgnoreParams = [
-        "genomes",
+        'genomes',
         'hmf_data_paths',
         'panel_data_paths',
         'ref_data_genome_fasta',
@@ -294,6 +293,10 @@ validation {
         'ref_data_hmf_data_path',
         'ref_data_panel_data_path',
         'ref_data_virusbreakenddb_path',
+        'fastp_umi_length',
+        'fastp_umi_location',
+        'fastp_umi_skip',
+        'markdups_umi_duplex_delim',
     ]
 
     lenientMode = true
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 380cacb1..79126c59 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -97,14 +97,39 @@
                 "max_fastq_records": {
                     "type": "integer",
                     "description": "When positive, will use fastp to split fastq files so that each resultant fastq file has no more than max_fastq_records records. When nonpositive, fastp is not used and the provided fastq files are passed as-is to the aligner.",
+                    "default": 10000000,
                     "fa_icon": "fas fa-cog"
                 },
-                "umi_length": {
+                "fastp_umi": {
+                    "type": "boolean",
+                    "description": "Enable fastp UMI processing.",
+                    "default": false,
+                    "fa_icon": "fas fa-cog"
+                },
+                "fastp_umi_location": {
+                    "type": "string",
+                    "description": "fastp UMI location parameter (--umi_loc).",
+                    "fa_icon": "fas fa-cog"
+                },
+                "fastp_umi_length": {
                     "type": "integer",
-                    "description": "When positive, will use fastp to extract UMIs of this length from  fastq files.",
+                    "description": "fastp UMI length parameter (--umi_len).",
+                    "default": 0,
+                    "fa_icon": "fas fa-cog"
+                },
+                "fastp_umi_skip": {
+                    "type": "integer",
+                    "description": "fastp UMI skip parameter (--umi_skip).",
+                    "default": -1,
+                    "fa_icon": "fas fa-cog"
+                },
+                "markdups_umi": {
+                    "type": "boolean",
+                    "description": "Enable MarkDups UMI processing.",
+                    "default": false,
                     "fa_icon": "fas fa-cog"
                 },
-                "umi_duplex_delim": {
+                "markdups_umi_duplex_delim": {
                     "type": "string",
                     "description": "UMI duplex delimiter as used by MarkDups.",
                     "fa_icon": "fas fa-cog"
diff --git a/subworkflows/local/read_alignment_dna/main.nf b/subworkflows/local/read_alignment_dna/main.nf
index e564b29a..ac656df8 100644
--- a/subworkflows/local/read_alignment_dna/main.nf
+++ b/subworkflows/local/read_alignment_dna/main.nf
@@ -11,15 +11,18 @@ include { FASTP          } from '../../../modules/local/fastp/main'
 workflow READ_ALIGNMENT_DNA {
     take:
     // Sample data
-    ch_inputs              // channel: [mandatory] [ meta ]
+    ch_inputs            // channel: [mandatory] [ meta ]
 
     // Reference data
-    genome_fasta           // channel: [mandatory] /path/to/genome_fasta
-    genome_bwamem2_index   // channel: [mandatory] /path/to/genome_bwa-mem2_index_dir/
+    genome_fasta         // channel: [mandatory] /path/to/genome_fasta
+    genome_bwamem2_index // channel: [mandatory] /path/to/genome_bwa-mem2_index_dir/
 
     // Params
-    max_fastq_records      // numeric: [mandatory] max number of FASTQ records per split
-    umi_length             // numeric: [optional] UMI length for extraction from fastq
+    max_fastq_records    // numeric: [optional]  max number of FASTQ records per split
+    umi_enable           // boolean: [mandatory] enable UMI processing
+    umi_location         //  string: [optional]  fastp UMI location argument (--umi_loc)
+    umi_length           // numeric: [optional]  fastp UMI length argument (--umi_len)
+    umi_skip             // numeric: [optional]  fastp UMI skip argument (--umi_skip)
 
     main:
     // Channel for version.yml files
@@ -74,13 +77,15 @@ workflow READ_ALIGNMENT_DNA {
     // Split FASTQ into chunks if requested for distributed processing
     // channel: [ meta_fastq_ready, fastq_fwd, fastq_fwd ]
     ch_fastqs_ready = Channel.empty()
-    if (max_fastq_records > 0 || umi_length > 0) {
+    if (max_fastq_records > 0 || umi_enable) {
 
         // Run process
         FASTP(
             ch_fastq_inputs,
             max_fastq_records,
+            umi_location,
             umi_length,
+            umi_skip,
         )
 
         ch_versions = ch_versions.mix(FASTP.out.versions)
@@ -118,7 +123,7 @@ workflow READ_ALIGNMENT_DNA {
     } else {
 
         // Select appropriate source
-        ch_fastq_source = umi_length > 0 ? FASTP.out.fastq : ch_fastq_inputs
+        ch_fastq_source = umi_enable ? FASTP.out.fastq : ch_fastq_inputs
 
         ch_fastqs_ready = ch_fastq_source
             .map { meta_fastq, fastq_fwd, fastq_rev ->
diff --git a/subworkflows/local/read_processing/main.nf b/subworkflows/local/read_processing/main.nf
index 6ee44a77..82e95282 100644
--- a/subworkflows/local/read_processing/main.nf
+++ b/subworkflows/local/read_processing/main.nf
@@ -22,7 +22,7 @@ workflow READ_PROCESSING {
     unmap_regions    // channel: [mandatory] /path/to/unmap_regions
 
     // Params
-    has_umis         // boolean: [mandatory] UMI processing flag
+    umi_enable       // boolean: [mandatory] enable UMI processing
     umi_duplex_delim // string:  [optional] UMI duplex delimiter
 
     main:
@@ -90,7 +90,7 @@ workflow READ_PROCESSING {
         genome_fai,
         genome_dict,
         unmap_regions,
-        has_umis,
+        umi_enable,
         umi_duplex_delim,
     )
 
diff --git a/workflows/targeted.nf b/workflows/targeted.nf
index 97ab2b04..29edfc8b 100644
--- a/workflows/targeted.nf
+++ b/workflows/targeted.nf
@@ -118,7 +118,10 @@ workflow TARGETED {
             ref_data.genome_fasta,
             ref_data.genome_bwamem2_index,
             params.max_fastq_records,
-            params.umi_length
+            params.fastp_umi,
+            params.fastp_umi_location,
+            params.fastp_umi_length,
+            params.fastp_umi_skip,
         )
 
         READ_ALIGNMENT_RNA(
@@ -151,8 +154,6 @@ workflow TARGETED {
     ch_process_dna_normal_out = Channel.empty()
     if (run_config.stages.markdups) {
 
-        has_umis = run_config.panel.equalsIgnoreCase('tso500') || params.umi_duplex_delim != '' || params.umi_length > 0
-
         READ_PROCESSING(
             ch_inputs,
             ch_align_dna_tumor_out,
@@ -162,8 +163,8 @@ workflow TARGETED {
             ref_data.genome_fai,
             ref_data.genome_dict,
             hmf_data.unmap_regions,
-            has_umis,
-            params.umi_duplex_delim,
+            params.markdups_umi,
+            params.markdups_umi_duplex_delim,
         )
 
         ch_versions = ch_versions.mix(READ_PROCESSING.out.versions)
diff --git a/workflows/wgts.nf b/workflows/wgts.nf
index fad9ddc6..48c99fd6 100644
--- a/workflows/wgts.nf
+++ b/workflows/wgts.nf
@@ -123,7 +123,10 @@ workflow WGTS {
             ref_data.genome_fasta,
             ref_data.genome_bwamem2_index,
             params.max_fastq_records,
-            0,  // disabled for now
+            false,  // umi_enable
+            '',  // umi_location
+            0,  // umi_length
+            -1,  // umi_skip
         )
 
         READ_ALIGNMENT_RNA(
@@ -165,8 +168,8 @@ workflow WGTS {
             ref_data.genome_fai,
             ref_data.genome_dict,
             hmf_data.unmap_regions,
-            false,  // has_umis
-            '',  // no duplex UMI delimiter
+            false,  // umi_enable
+            '',  // umi_duplex_delim
         )
 
         ch_versions = ch_versions.mix(READ_PROCESSING.out.versions)