diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index ae0034e93..361265640 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -31,6 +31,7 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.collection.BetterBufferedIterator import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter} import com.fulcrumgenomics.sopt.{arg, clp} +import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern import com.fulcrumgenomics.util.Metric.{Count, Proportion} import com.fulcrumgenomics.util._ import htsjdk.samtools.util.{Interval, IntervalList, Murmur3, OverlapDetector} @@ -389,8 +390,8 @@ class CollectDuplexSeqMetrics val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences - ab.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 } - ba.iterator.map(r => r[String](this.umiTag).split('-')).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 } + ab.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 } + ba.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 } val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis => val consensus = this.consensusBuilder.callConsensus(umis) @@ -530,7 +531,7 @@ class CollectDuplexSeqMetrics val uniqueTotal = metrics.map(_.unique_observations).sum.toDouble metrics.foreach { m => - val Array(umi1, umi2) = m.umi.split('-') + val Array(umi1, umi2) = UmiSeparatorPattern.split(m.umi, -1) m.fraction_raw_observations = m.raw_observations / rawTotal m.fraction_unique_observations = m.unique_observations / uniqueTotal m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 2a0520ef1..7d4223046 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -34,6 +34,7 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter} import com.fulcrumgenomics.sopt.{arg, clp} import com.fulcrumgenomics.umi.GroupReadsByUmi._ +import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern import com.fulcrumgenomics.util.Metric.{Count, Proportion} import com.fulcrumgenomics.util.Sequences.countMismatches import com.fulcrumgenomics.util._ @@ -42,8 +43,8 @@ import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy import htsjdk.samtools._ import htsjdk.samtools.util.SequenceUtil +import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicLong -import java.util.concurrent.{Executors, ForkJoinPool} import scala.collection.immutable.IndexedSeq import scala.collection.mutable.ListBuffer import scala.collection.parallel.ExecutionContextTaskSupport @@ -529,14 +530,14 @@ object Strategy extends FgBioEnum[Strategy] { | reads per UMI, but breaks down at very high coverage of UMIs. |3. **adjacency**: a version of the directed adjacency method described in [umi_tools](http://dx.doi.org/10.1101/051755) | that allows for errors between UMIs but only when there is a count gradient. - |4. **paired**: similar to adjacency but for methods that produce template with a pair of UMIs - | such that a read with A-B is related to but not identical to a read with B-A. - | Expects the pair of UMIs to be stored in a single tag, separated by a hyphen - | (e.g. `ACGT-CCGG`). The molecular IDs produced have more structure than for single - | UMI strategies, and are of the form `{base}/{AB|BA}`. E.g. two UMI pairs would be - | mapped as follows AAAA-GGGG -> 1/AB, GGGG-AAAA -> 1/BA. + |4. **paired**: similar to adjacency but for methods that produce template such that a read with A-B is related + | to but not identical to a read with B-A. Expects the UMI sequences to be stored in a single SAM + | tag separated by a hyphen (e.g. `ACGT-CCGG`) and allows for one of the two UMIs to be absent + | (e.g. `ACGT-` or `-ACGT`). The molecular IDs produced have more structure than for single + | UMI strategies and are of the form `{base}/{A|B}`. E.g. two UMI pairs would be mapped as + | follows AAAA-GGGG -> 1/A, GGGG-AAAA -> 1/B. | - |`edit`, `adjacency` and `paired` make use of the `--edits` parameter to control the matching of + |Strategies `edit`, `adjacency`, and `paired` make use of the `--edits` parameter to control the matching of |non-identical UMIs. | |By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI @@ -831,8 +832,8 @@ class GroupReadsByUmi val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand))) - val umis = umi.split('-') - require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments: $umi") + val umis = UmiSeparatorPattern.split(umi, -1) // Split and ensure we return empty strings for missing UMIs. + require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi") if (r1Lower) paired.lowerReadUmiPrefix + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1) else paired.higherReadUmiPrefix + ":" + umis(0) + "-" + paired.lowerReadUmiPrefix + ":" + umis(1) diff --git a/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala b/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala index 3e2d4935d..aba449d6f 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/SimpleConsensusCaller.scala @@ -60,7 +60,10 @@ private[umi] class SimpleConsensusCaller(val errorRatePreLabeling: Byte = 90.toB require(sequences.nonEmpty, "Can't call consensus on an empty set of sequences!") if (sequences.length == 1) sequences.head else { - require(sequences.forall(_.length == sequences.head.length), "Sequences must all have the same length") + require( + sequences.forall(_.length == sequences.head.length), + s"Sequences must all have the same length. Found ${sequences.mkString(", ")}" + ) val buffer = new StringBuilder val firstRead = sequences.head val readLength = firstRead.length diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 2fbd4cfe9..c55cdd8b1 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -27,8 +27,13 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord +import java.util.regex.Pattern + object Umis { + /** The separator pattern for concatenated UMIs. For example "ACGT-GTAA". */ + val UmiSeparatorPattern: Pattern = Pattern.compile("-") + /** Copies the UMI sequence from the read name. * * The read name is split by the given name delimiter, and the last field is assumed to be the UMI sequence. The UMI diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index a019e595d..d9d0ed16f 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -343,4 +343,41 @@ class CollectDuplexSeqMetricsTest extends UnitSpec { metrics.find(_.umi == "AAA-TTT") shouldBe defined metrics.find(_.umi == "AAA-TTT").foreach(m => m.unique_observations shouldBe 1) } + + it should "count UMIs even if one of the UMIs on a 'molecule end' is an empty sequence" in collector(duplex = true).foreach { c => + val builder = new SamBuilder(readLength = 10) + builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-CCC", MI -> "1/A")) + builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "CCC-", MI -> "1/B")) + builder.addPair(start1 = 100, start2 = 200, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "AAA-", MI -> "2/A")) + builder.addPair(start1 = 200, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-AAA", MI -> "2/B")) + builder.addPair(start1 = 300, start2 = 400, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "-GGG", MI -> "3/A")) + builder.addPair(start1 = 900, start2 = 800, strand1 = Plus, strand2 = Minus, attrs = Map(RX -> "TTT-", MI -> "4/A")) + builder.addPair(start1 = 300, start2 = 400, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "-CGC", MI -> "5/A")) + builder.addPair(start1 = 900, start2 = 800, strand1 = Minus, strand2 = Plus, attrs = Map(RX -> "TAT-", MI -> "6/A")) + + builder.toSeq.groupBy(r => r[String](MI).takeWhile(_ != '/')).values + .map(rs => rs.groupBy(r => r[String](MI)).values.toSeq).toSeq + .foreach(group => c.updateUmiMetrics(group)) + + val metrics = c.duplexUmiMetrics(c.umiMetrics) + metrics should have size 6 + + metrics.find(_.umi == "-CCC") shouldBe defined + metrics.find(_.umi == "-CCC").foreach(m => m.unique_observations shouldBe 1) + + metrics.find(_.umi == "AAA-") shouldBe defined + metrics.find(_.umi == "AAA-").foreach(m => m.unique_observations shouldBe 1) + + metrics.find(_.umi == "-GGG") shouldBe defined + metrics.find(_.umi == "-GGG").foreach(m => m.unique_observations shouldBe 1) + + metrics.find(_.umi == "TTT-") shouldBe defined + metrics.find(_.umi == "TTT-").foreach(m => m.unique_observations shouldBe 1) + + metrics.find(_.umi == "CGC-") shouldBe defined + metrics.find(_.umi == "CGC-").foreach(m => m.unique_observations shouldBe 1) + + metrics.find(_.umi == "-TAT") shouldBe defined + metrics.find(_.umi == "-TAT").foreach(m => m.unique_observations shouldBe 1) + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 77a5dc8f6..dffb0d227 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -200,6 +200,24 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected } + it should "correctly assign a/b for paired UMI prefixes when the UMI for one end of the source molecule is absent" in { + val tool = new GroupReadsByUmi(rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 0, allowInterContig = true) + val builder = new SamBuilder(readLength = 100) + val templates = Seq( + // This should be a::AAA-b:: since contig1 is lower + builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")), + // This should be b::AAA-a:: since contig2 is lower + builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "AAA-")), + // This should be a::-b::TTT since contig1 is lower + builder.addPair(contig = 1, contig2 = Some(2), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")), + // This should be b::-a::TTT since contig2 is lower + builder.addPair(contig = 2, contig2 = Some(1), start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "-TTT")), + ).map { pair => Template(r1 = pair.headOption, r2 = pair.lastOption) } + val expected = n("a::AAA-b::", 1) ++ n("b::AAA-a::", 1) ++ n("a::-b::TTT", 1) ++ n("b::-a::TTT", 1) + val umiForRead = PrivateMethod[String](Symbol("umiForRead")) + templates.map(t => tool invokePrivate umiForRead(t)) should contain theSameElementsInOrderAs expected + } + "GroupReads.ReadInfo" should "extract the same ReadEnds from a Template as from an R1 with mate cigar" in { val builder = new SamBuilder(readLength=100, sort=None) val Seq(r1, r2) = builder.addPair(contig=2, contig2=Some(1), start1=300, start2=400, cigar1="10S90M", cigar2="90M10S") @@ -420,6 +438,37 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT an[Exception] should be thrownBy tool.execute() } + Seq( + Seq("right", "ACT-", "-ACT"), + Seq("left", "-ACT", "ACT-"), + ).foreach { case Seq(orientation, leftUmi, rightUmi) => + it should s"succeed when run in paired mode but the $orientation end of the source molecule does not have a UMI" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi)) + builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi)) + builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi)) + builder.addPair(name = "a04", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> leftUmi)) + + builder.addPair(name = "b01", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi)) + builder.addPair(name = "b02", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi)) + builder.addPair(name = "b03", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi)) + builder.addPair(name = "b04", start1 = 300, start2 = 100, strand1 = Minus, strand2 = Plus, attrs = Map("RX" -> rightUmi)) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", strategy = Strategy.Paired, edits = 1).execute() + + val recs = readBamRecs(out) + val aIds = recs.filter(_.name.startsWith("a")).map(r => r[String]("MI")).distinct + val bIds = recs.filter(_.name.startsWith("b")).map(r => r[String]("MI")).distinct + + aIds should have size 1 + bIds should have size 1 + aIds.head shouldBe "0/A" + bIds.head shouldBe "0/B" + } + } + it should "fail when the raw tag is not present" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus)