Allow the PairedAssigner to use empty UMIs

fulcrumgenomics · Dec 30, 2023 · b1caef3 · b1caef3
1 parent 9c475fd
commit b1caef3
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 14 deletions.
diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala
@@ -31,6 +31,7 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
 import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
 import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
 import com.fulcrumgenomics.sopt.{arg, clp}
+import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
 import com.fulcrumgenomics.util.Metric.{Count, Proportion}
 import com.fulcrumgenomics.util._
 import htsjdk.samtools.util.{Interval, IntervalList, Murmur3, OverlapDetector}
@@ -389,8 +390,8 @@ class CollectDuplexSeqMetrics
     val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences
     val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences
 
-    ab.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
-    ba.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
+    ab.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
+    ba.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
 
     val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis =>
       val consensus = this.consensusBuilder.callConsensus(umis)
@@ -530,7 +531,7 @@ class CollectDuplexSeqMetrics
     val uniqueTotal      = metrics.map(_.unique_observations).sum.toDouble
 
     metrics.foreach { m =>
-      val Array(umi1, umi2)                   = m.umi.split('-')
+      val Array(umi1, umi2)                   = UmiSeparatorPattern.split(m.umi, -1)
       m.fraction_raw_observations             = m.raw_observations / rawTotal
       m.fraction_unique_observations          = m.unique_observations / uniqueTotal
       m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations

diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -34,6 +34,7 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
 import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
 import com.fulcrumgenomics.sopt.{arg, clp}
 import com.fulcrumgenomics.umi.GroupReadsByUmi._
+import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
 import com.fulcrumgenomics.util.Metric.{Count, Proportion}
 import com.fulcrumgenomics.util.Sequences.countMismatches
 import com.fulcrumgenomics.util._
@@ -42,8 +43,8 @@ import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy
 import htsjdk.samtools._
 import htsjdk.samtools.util.SequenceUtil
 
+import java.util.concurrent.Executors
 import java.util.concurrent.atomic.AtomicLong
-import java.util.concurrent.{Executors, ForkJoinPool}
 import scala.collection.immutable.IndexedSeq
 import scala.collection.mutable.ListBuffer
 import scala.collection.parallel.ExecutionContextTaskSupport
@@ -529,14 +530,14 @@ object Strategy extends FgBioEnum[Strategy] {
     |                  reads per UMI, but breaks down at very high coverage of UMIs.
     |3. **adjacency**: a version of the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
     |                  that allows for errors between UMIs but only when there is a count gradient.
-    |4. **paired**:    similar to adjacency but for methods that produce template with a pair of UMIs
-    |                  such that a read with A-B is related to but not identical to a read with B-A.
-    |                  Expects the pair of UMIs to be stored in a single tag, separated by a hyphen
-    |                  (e.g. `ACGT-CCGG`).  The molecular IDs produced have more structure than for single
-    |                  UMI strategies, and are of the form `{base}/{AB|BA}`. E.g. two UMI pairs would be
-    |                  mapped as follows AAAA-GGGG -> 1/AB, GGGG-AAAA -> 1/BA.
+    |4. **paired**:    similar to adjacency but for methods that produce template such that a read with A-B is related
+    |                  to but not identical to a read with B-A. Expects the UMI sequences to be stored in a single SAM
+    |                  tag separated by a hyphen (e.g. `ACGT-CCGG`) and allows for one of the two UMIs to be absent
+    |                  (e.g. `ACGT-` or `-ACGT`). The molecular IDs produced have more structure than for single
+    |                  UMI strategies and are of the form `{base}/{A|B}`. E.g. two UMI pairs would be mapped as
+    |                  follows AAAA-GGGG -> 1/A, GGGG-AAAA -> 1/B.
     |
-    |`edit`, `adjacency` and `paired` make use of the `--edits` parameter to control the matching of
+    |Strategies `edit`, `adjacency`, and `paired` make use of the `--edits` parameter to control the matching of
     |non-identical UMIs.
     |
     |By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
@@ -831,8 +832,8 @@ class GroupReadsByUmi
         val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd
         val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd
         val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand)))
-        val umis = umi.split('-')
-        require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments: $umi")
+        val umis = UmiSeparatorPattern.split(umi, -1) // Split and ensure we return empty strings for missing UMIs.
+        require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi")
 
         if (r1Lower) paired.lowerReadUmiPrefix  + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1)
         else         paired.higherReadUmiPrefix + ":" + umis(0) + "-" + paired.lowerReadUmiPrefix  + ":" + umis(1)

diff --git a/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala
@@ -60,7 +60,10 @@ private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toB
     require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!")
 
     if (sequences.length == 1) sequences.head else {
-      require(sequences.forall(_.length == sequences.head.length), "Sequences must all have the same length")
+      require(
+        sequences.forall(_.length == sequences.head.length),
+        s"Sequences must all have the same length. Found ${sequences.mkString(", ")}"
+      )
       val buffer = new StringBuilder
       val firstRead  = sequences.head
       val readLength = firstRead.length

diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala
@@ -27,8 +27,13 @@ package com.fulcrumgenomics.umi
 
 import com.fulcrumgenomics.bam.api.SamRecord
 
+import java.util.regex.Pattern
+
 object Umis {
 
+  /** The separator pattern for concatenated UMIs. For example "ACGT-GTAA". */
+  val UmiSeparatorPattern: Pattern = Pattern.compile("-")
+
   /** Copies the UMI sequence from the read name.
     *
     * The read name is split by the given name delimiter, and the last field is assumed to be the UMI sequence.  The UMI

diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala
@@ -343,4 +343,41 @@ class CollectDuplexSeqMetricsTest extends UnitSpec {
     metrics.find(_.umi == "AAA-TTT") shouldBe defined
     metrics.find(_.umi == "AAA-TTT").foreach(m => m.unique_observations shouldBe 1)
   }
+
+  it should "count UMIs even if one of the UMIs on a 'molecule end' is an empty sequence" in collector(duplex = true).foreach { c =>
+    val builder = new SamBuilder(readLength = 10)
+    builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-CCC", MI -> "1/A"))
+    builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "CCC-", MI -> "1/B"))
+    builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "AAA-", MI -> "2/A"))
+    builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-AAA", MI -> "2/B"))
+    builder.addPair(start1 = 300, start2 = 400, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-GGG", MI -> "3/A"))
+    builder.addPair(start1 = 900, start2 = 800, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "TTT-", MI -> "4/A"))
+    builder.addPair(start1 = 300, start2 = 400, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-CGC", MI -> "5/A"))
+    builder.addPair(start1 = 900, start2 = 800, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "TAT-", MI -> "6/A"))
+
+    builder.toSeq.groupBy(r => r[String](MI).takeWhile(_ != '/')).values
+      .map(rs => rs.groupBy(r => r[String](MI)).values.toSeq).toSeq
+      .foreach(group => c.updateUmiMetrics(group))
+
+    val metrics = c.duplexUmiMetrics(c.umiMetrics)
+    metrics should have size 6
+
+    metrics.find(_.umi == "-CCC") shouldBe defined
+    metrics.find(_.umi == "-CCC").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "AAA-") shouldBe defined
+    metrics.find(_.umi == "AAA-").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "-GGG") shouldBe defined
+    metrics.find(_.umi == "-GGG").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "TTT-") shouldBe defined
+    metrics.find(_.umi == "TTT-").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "CGC-") shouldBe defined
+    metrics.find(_.umi == "CGC-").foreach(m => m.unique_observations shouldBe 1)
+
+    metrics.find(_.umi == "-TAT") shouldBe defined
+    metrics.find(_.umi == "-TAT").foreach(m => m.unique_observations shouldBe 1)
+  }
 }
diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -200,6 +200,24 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
   }
 
+  it should "correctly assign a/b for paired UMI prefixes when the UMI for one end of the source molecule is absent" in {
+    val tool = new GroupReadsByUmi(rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 0, allowInterContig = true)
+    val builder = new SamBuilder(readLength = 100)
+    val templates = Seq(
+      // This should be a::AAA-b:: since contig1 is lower
+      builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
+      // This should be b::AAA-a:: since contig2 is lower
+      builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
+      // This should be a::-b::TTT since contig1 is lower
+      builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
+      // This should be b::-a::TTT since contig2 is lower
+      builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
+    ).map { pair => Template(r1 = pair.headOption, r2 = pair.lastOption) }
+    val expected = n("a::AAA-b::", 1) ++ n("b::AAA-a::", 1) ++  n("a::-b::TTT", 1) ++ n("b::-a::TTT", 1)
+    val umiForRead = PrivateMethod[String](Symbol("umiForRead"))
+    templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
+  }
+
   "GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
     val builder = new SamBuilder(readLength=100, sort=None)
     val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
@@ -420,6 +438,37 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     an[Exception] should be thrownBy tool.execute()
   }
 
+  Seq(
+    Seq("right", "ACT-", "-ACT"),
+    Seq("left", "-ACT", "ACT-"),
+  ).foreach { case Seq(orientation, leftUmi, rightUmi) =>
+    it should s"succeed when run in paired mode but the $orientation end of the source molecule does not have a UMI" in {
+      val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
+      builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+      builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+      builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+      builder.addPair(name = "a04", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
+
+      builder.addPair(name = "b01", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+      builder.addPair(name = "b02", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+      builder.addPair(name = "b03", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+      builder.addPair(name = "b04", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
+
+      val in  = builder.toTempFile()
+      val out = Files.createTempFile("umi_grouped.", ".sam")
+      new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 1).execute()
+
+      val recs = readBamRecs(out)
+      val aIds = recs.filter(_.name.startsWith("a")).map(r => r[String]("MI")).distinct
+      val bIds = recs.filter(_.name.startsWith("b")).map(r => r[String]("MI")).distinct
+
+      aIds should have size 1
+      bIds should have size 1
+      aIds.head shouldBe "0/A"
+      bIds.head shouldBe "0/B"
+    }
+  }
+
   it should "fail when the raw tag is not present" in {
     val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
     builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus)