Skip to content

Commit

Permalink
Allow the PairedAssigner to use empty UMIs
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval committed Dec 30, 2023
1 parent 9c475fd commit b1caef3
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util._
import htsjdk.samtools.util.{Interval, IntervalList, Murmur3, OverlapDetector}
Expand Down Expand Up @@ -389,8 +390,8 @@ class CollectDuplexSeqMetrics
val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences
val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences

ab.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
ab.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }

val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis =>
val consensus = this.consensusBuilder.callConsensus(umis)
Expand Down Expand Up @@ -530,7 +531,7 @@ class CollectDuplexSeqMetrics
val uniqueTotal = metrics.map(_.unique_observations).sum.toDouble

metrics.foreach { m =>
val Array(umi1, umi2) = m.umi.split('-')
val Array(umi1, umi2) = UmiSeparatorPattern.split(m.umi, -1)
m.fraction_raw_observations = m.raw_observations / rawTotal
m.fraction_unique_observations = m.unique_observations / uniqueTotal
m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations
Expand Down
21 changes: 11 additions & 10 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.GroupReadsByUmi._
import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util.Sequences.countMismatches
import com.fulcrumgenomics.util._
Expand All @@ -42,8 +43,8 @@ import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy
import htsjdk.samtools._
import htsjdk.samtools.util.SequenceUtil

import java.util.concurrent.Executors
import java.util.concurrent.atomic.AtomicLong
import java.util.concurrent.{Executors, ForkJoinPool}
import scala.collection.immutable.IndexedSeq
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.ExecutionContextTaskSupport
Expand Down Expand Up @@ -529,14 +530,14 @@ object Strategy extends FgBioEnum[Strategy] {
| reads per UMI, but breaks down at very high coverage of UMIs.
|3. **adjacency**: a version of the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755)
| that allows for errors between UMIs but only when there is a count gradient.
|4. **paired**: similar to adjacency but for methods that produce template with a pair of UMIs
| such that a read with A-B is related to but not identical to a read with B-A.
| Expects the pair of UMIs to be stored in a single tag, separated by a hyphen
| (e.g. `ACGT-CCGG`). The molecular IDs produced have more structure than for single
| UMI strategies, and are of the form `{base}/{AB|BA}`. E.g. two UMI pairs would be
| mapped as follows AAAA-GGGG -> 1/AB, GGGG-AAAA -> 1/BA.
|4. **paired**: similar to adjacency but for methods that produce template such that a read with A-B is related
| to but not identical to a read with B-A. Expects the UMI sequences to be stored in a single SAM
| tag separated by a hyphen (e.g. `ACGT-CCGG`) and allows for one of the two UMIs to be absent
| (e.g. `ACGT-` or `-ACGT`). The molecular IDs produced have more structure than for single
| UMI strategies and are of the form `{base}/{A|B}`. E.g. two UMI pairs would be mapped as
| follows AAAA-GGGG -> 1/A, GGGG-AAAA -> 1/B.
|
|`edit`, `adjacency` and `paired` make use of the `--edits` parameter to control the matching of
|Strategies `edit`, `adjacency`, and `paired` make use of the `--edits` parameter to control the matching of
|non-identical UMIs.
|
|By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI
Expand Down Expand Up @@ -831,8 +832,8 @@ class GroupReadsByUmi
val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd
val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd
val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand)))
val umis = umi.split('-')
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments: $umi")
val umis = UmiSeparatorPattern.split(umi, -1) // Split and ensure we return empty strings for missing UMIs.
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi")

if (r1Lower) paired.lowerReadUmiPrefix + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1)
else paired.higherReadUmiPrefix + ":" + umis(0) + "-" + paired.lowerReadUmiPrefix + ":" + umis(1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toB
require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!")

if (sequences.length == 1) sequences.head else {
require(sequences.forall(_.length == sequences.head.length), "Sequences must all have the same length")
require(
sequences.forall(_.length == sequences.head.length),
s"Sequences must all have the same length. Found ${sequences.mkString(", ")}"
)
val buffer = new StringBuilder
val firstRead = sequences.head
val readLength = firstRead.length
Expand Down
5 changes: 5 additions & 0 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,13 @@ package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord

import java.util.regex.Pattern

object Umis {

/** The separator pattern for concatenated UMIs. For example "ACGT-GTAA". */
val UmiSeparatorPattern: Pattern = Pattern.compile("-")

/** Copies the UMI sequence from the read name.
*
* The read name is split by the given name delimiter, and the last field is assumed to be the UMI sequence. The UMI
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,4 +343,41 @@ class CollectDuplexSeqMetricsTest extends UnitSpec {
metrics.find(_.umi == "AAA-TTT") shouldBe defined
metrics.find(_.umi == "AAA-TTT").foreach(m => m.unique_observations shouldBe 1)
}

it should "count UMIs even if one of the UMIs on a 'molecule end' is an empty sequence" in collector(duplex = true).foreach { c =>
val builder = new SamBuilder(readLength = 10)
builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-CCC", MI -> "1/A"))
builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "CCC-", MI -> "1/B"))
builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "AAA-", MI -> "2/A"))
builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-AAA", MI -> "2/B"))
builder.addPair(start1 = 300, start2 = 400, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-GGG", MI -> "3/A"))
builder.addPair(start1 = 900, start2 = 800, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "TTT-", MI -> "4/A"))
builder.addPair(start1 = 300, start2 = 400, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-CGC", MI -> "5/A"))
builder.addPair(start1 = 900, start2 = 800, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "TAT-", MI -> "6/A"))

builder.toSeq.groupBy(r => r[String](MI).takeWhile(_ != '/')).values
.map(rs => rs.groupBy(r => r[String](MI)).values.toSeq).toSeq
.foreach(group => c.updateUmiMetrics(group))

val metrics = c.duplexUmiMetrics(c.umiMetrics)
metrics should have size 6

metrics.find(_.umi == "-CCC") shouldBe defined
metrics.find(_.umi == "-CCC").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "AAA-") shouldBe defined
metrics.find(_.umi == "AAA-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "-GGG") shouldBe defined
metrics.find(_.umi == "-GGG").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "TTT-") shouldBe defined
metrics.find(_.umi == "TTT-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "CGC-") shouldBe defined
metrics.find(_.umi == "CGC-").foreach(m => m.unique_observations shouldBe 1)

metrics.find(_.umi == "-TAT") shouldBe defined
metrics.find(_.umi == "-TAT").foreach(m => m.unique_observations shouldBe 1)
}
}
49 changes: 49 additions & 0 deletions src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,24 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

it should "correctly assign a/b for paired UMI prefixes when the UMI for one end of the source molecule is absent" in {
val tool = new GroupReadsByUmi(rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 0, allowInterContig = true)
val builder = new SamBuilder(readLength = 100)
val templates = Seq(
// This should be a::AAA-b:: since contig1 is lower
builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
// This should be b::AAA-a:: since contig2 is lower
builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")),
// This should be a::-b::TTT since contig1 is lower
builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
// This should be b::-a::TTT since contig2 is lower
builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")),
).map { pair => Template(r1 = pair.headOption, r2 = pair.lastOption) }
val expected = n("a::AAA-b::", 1) ++ n("b::AAA-a::", 1) ++ n("a::-b::TTT", 1) ++ n("b::-a::TTT", 1)
val umiForRead = PrivateMethod[String](Symbol("umiForRead"))
templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected
}

"GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in {
val builder = new SamBuilder(readLength=100, sort=None)
val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S")
Expand Down Expand Up @@ -420,6 +438,37 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
an[Exception] should be thrownBy tool.execute()
}

Seq(
Seq("right", "ACT-", "-ACT"),
Seq("left", "-ACT", "ACT-"),
).foreach { case Seq(orientation, leftUmi, rightUmi) =>
it should s"succeed when run in paired mode but the $orientation end of the source molecule does not have a UMI" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))
builder.addPair(name = "a04", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi))

builder.addPair(name = "b01", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b02", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b03", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))
builder.addPair(name = "b04", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 1).execute()

val recs = readBamRecs(out)
val aIds = recs.filter(_.name.startsWith("a")).map(r => r[String]("MI")).distinct
val bIds = recs.filter(_.name.startsWith("b")).map(r => r[String]("MI")).distinct

aIds should have size 1
bIds should have size 1
aIds.head shouldBe "0/A"
bIds.head shouldBe "0/B"
}
}

it should "fail when the raw tag is not present" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus)
Expand Down

0 comments on commit b1caef3

Please sign in to comment.