From 48a75fc42c2c1293bf464d9d06684cb40f1cedec Mon Sep 17 00:00:00 2001
From: Ted Brookings <ted@fulcrumgenomics.com>
Date: Thu, 7 Sep 2023 14:39:47 -0400
Subject: [PATCH] Add options filterUmisWithN and allowUmisWithDifferentLengths

* filterUmisWithN defaults to true (current behavior); if false, treat Ns
  like other bases.
* allowUmisWithDifferentLengths defaults to false (current behavior); if true,
  treat UMIs with different lengths as mismatches.

fix name
---
 .../umi/CallMolecularConsensusReads.scala     |  4 +-
 .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 53 ++++++++++++-------
 2 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala b/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala
index 5e3e1099d..2ef24105c 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/CallMolecularConsensusReads.scala
@@ -72,9 +72,9 @@ import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
     |calls each end of a pair independently, and does not jointly call bases that overlap within a pair.  Insertion or
     |deletion errors in the reads are not considered in the consensus model.
     |
-    |The consensus reads produced are unaligned, due to the difficulty and error-prone nature of inferring the conesensus
+    |The consensus reads produced are unaligned, due to the difficulty and error-prone nature of inferring the consensus
     |alignment.  Consensus reads should therefore be aligned after, which should not be too expensive as likely there
-    |are far fewer consensus reads than input raw raws.  Please see how best to use this tool within the best-practice
+    |are far fewer consensus reads than input raw reads.  Please see how best to use this tool within the best-practice
     |pipeline: https://github.com/fulcrumgenomics/fgbio/blob/main/docs/best-practice-consensus-pipeline.md
     |
     |Particular attention should be paid to setting the `--min-reads` parameter as this can have a dramatic effect on
diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index 1559b9f7c..6719b1611 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -43,7 +43,6 @@ import htsjdk.samtools._
 import htsjdk.samtools.util.SequenceUtil
 
 import java.util.concurrent.atomic.AtomicLong
-import scala.collection.immutable.IndexedSeq
 import scala.collection.mutable.ListBuffer
 import scala.collection.{BufferedIterator, Iterator, mutable}
 
@@ -210,7 +209,7 @@ object GroupReadsByUmi {
     * Class that implements the directed adjacency graph method from umi_tools.
     * See: https://github.com/CGATOxford/UMI-tools
     */
-  private[umi] class AdjacencyUmiAssigner(val maxMismatches: Int) extends UmiAssigner {
+  private[umi] class AdjacencyUmiAssigner(val maxMismatches: Int, val allowUmisWithDifferentLengths: Boolean) extends UmiAssigner {
     /** Represents a node in the adjacency graph; equality is just by UMI sequence. */
     class Node(val umi: Umi, val count: Long, val children: mutable.Buffer[Node] = mutable.Buffer()) {
       /** Gets the full set of descendants from this node. */
@@ -235,16 +234,22 @@ object GroupReadsByUmi {
 
     /** Returns whether or not a pair of UMIs match closely enough to be considered adjacent in the graph. */
     protected def matches(lhs: Umi, rhs: Umi): Boolean = {
-      require(lhs.length == rhs.length, s"UMIs of different length detected: $lhs vs. $rhs")
-      var idx = 0
-      var mismatches = 0
-      val len = lhs.length
-      while (idx < len && mismatches <= this.maxMismatches) {
-        if (lhs(idx) != rhs(idx)) mismatches += 1
-        idx += 1
-      }
+      (if (allowUmisWithDifferentLengths) { // parenthesized so `&&` applies to both branches, not only to `else`
+        lhs.length == rhs.length // UMIs of different lengths are never adjacent
+      } else {
+        require(lhs.length == rhs.length, s"UMIs of different length detected: $lhs vs. $rhs")
+        true
+      }) && {
+        var idx = 0
+        var mismatches = 0
+        val len = lhs.length
+        while (idx < len && mismatches <= this.maxMismatches) { // stop scanning once the limit is exceeded
+          if (lhs(idx) != rhs(idx)) mismatches += 1
+          idx += 1
+        }
 
-      mismatches <= maxMismatches
+        mismatches <= maxMismatches
+      }
     }
 
     /** Assigns IDs to each UMI based on the root to which is it mapped. */
@@ -271,7 +276,6 @@ object GroupReadsByUmi {
         val nextRoot = remaining.remove(0)
         roots += nextRoot
         val working = mutable.Buffer[Node](nextRoot)
-
         while (working.nonEmpty) {
           val root = working.remove(0)
           val (hits, misses) = remaining.partition(other => root.count >= 2 * other.count - 1 && matches(root.umi, other.umi))
@@ -292,7 +296,7 @@ object GroupReadsByUmi {
     *
     * @param maxMismatches the maximum number of mismatches between UMIs
     */
-  class PairedUmiAssigner(maxMismatches: Int) extends AdjacencyUmiAssigner(maxMismatches) {
+  class PairedUmiAssigner(maxMismatches: Int, allowUmisWithDifferentLengths: Boolean) extends AdjacencyUmiAssigner(maxMismatches, allowUmisWithDifferentLengths) {
     /** String that is prefixed onto the UMI from the read with that maps to a lower coordinate in the genome.. */
     private[umi] val lowerReadUmiPrefix: String = ("a" * (maxMismatches+1)) + ":"
 
@@ -402,27 +406,32 @@ case class TagFamilySizeMetric(family_size: Int,
 
 /** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/
 sealed trait Strategy extends EnumEntry {
-  def newStrategy(edits: Int): UmiAssigner
+  def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner
 }
 object Strategy extends FgBioEnum[Strategy] {
   def values: IndexedSeq[Strategy] = findValues
   /** Strategy to only reads with identical UMI sequences are grouped together. */
   case object Identity extends Strategy {
-    def newStrategy(edits: Int = 0): UmiAssigner = {
+    def newStrategy(edits: Int = 0, allowUmisWithDifferentLengths: Boolean): UmiAssigner = {
       require(edits == 0, "Edits should be zero when using the identity UMI assigner.")
       new IdentityUmiAssigner
     }
   }
+
   /** Strategy to cluster reads into groups based on mismatches between reads in clusters. */
-  case object Edit extends Strategy { def newStrategy(edits: Int): UmiAssigner = new SimpleErrorUmiAssigner(edits) }
+  case object Edit extends Strategy { def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner = new SimpleErrorUmiAssigner(edits) }
   /** Strategy based on the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
     * that allows for errors between UMIs but only when there is a count gradient.
     */
-  case object Adjacency extends Strategy { def newStrategy(edits: Int): UmiAssigner = new AdjacencyUmiAssigner(edits) }
+  case object Adjacency extends Strategy {
+    def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner = new AdjacencyUmiAssigner(edits, allowUmisWithDifferentLengths)
+  }
   /** Strategy similar to the [[Adjacency]] strategy similar to adjacency but for methods that produce template with a
     * pair of UMIs such that a read with A-B is related to but not identical to a read with B-A.
     */
-  case object Paired extends Strategy { def newStrategy(edits: Int): UmiAssigner = new PairedUmiAssigner(edits)}
+  case object Paired extends Strategy {
+    def newStrategy(edits: Int, allowUmisWithDifferentLengths: Boolean): UmiAssigner = new PairedUmiAssigner(edits, allowUmisWithDifferentLengths)
+  }
 }
 
 @clp(group=ClpGroups.Umi, description =
@@ -510,6 +519,8 @@ class GroupReadsByUmi
                          |otherwise discard reads with UMIs shorter than this length and allow for differing UMI lengths.
                          |""")
     val minUmiLength: Option[Int] = None,
+ @arg(flag='N', doc="Filter UMIs with N bases.") val filterUmisWithN: Boolean = true,
+ @arg(flag='a', doc="Allow UMIs with different lengths") val allowUmisWithDifferentLengths: Boolean = false,
  @arg(flag='x', doc= """
                          |DEPRECATED: this option will be removed in future versions and inter-contig reads will be
                          |automatically processed.""")
@@ -519,7 +530,7 @@ class GroupReadsByUmi
 
   require(this.minUmiLength.forall(_ => this.strategy != Strategy.Paired), "Paired strategy cannot be used with --min-umi-length")
 
-  private val assigner = strategy.newStrategy(this.edits)
+  private val assigner = strategy.newStrategy(this.edits, this.allowUmisWithDifferentLengths)
 
   // Give values to unset parameters that are different in duplicate marking mode
   private val _minMapQ = this.minMapQ.getOrElse(if (this.markDuplicates) 0 else 1)
@@ -578,7 +589,9 @@ class GroupReadsByUmi
       .filter(r => (r.mapped || (r.paired && r.mateMapped))                         || { filteredPoorAlignment += 1; false })
       .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
       .filter(r => mapqOk(r, this._minMapQ)                                         || { filteredPoorAlignment += 1; false })
-      .filter(r => !r.get[String](rawTag).exists(_.contains('N'))                   || { filteredNsInUmi += 1; false })
+      .filter(
+        r => !(filterUmisWithN && r.get[String](rawTag).exists(_.contains('N'))  ) || { filteredNsInUmi += 1; false }
+      )
       .filter { r =>
         this.minUmiLength.forall { l =>
           r.get[String](this.rawTag).forall { umi =>