Skip to content

Commit

Permalink
Add some explicit assumption checks in DuplexConsensusCaller.
Browse files Browse the repository at this point in the history
1. That AB-R1s and BA-R2s should be on the same strand, and the
same for AB-R2s and BA-R1s.
2. That UmiConsensusCaller.filterToMostCommonAlignment should only
receive reads from the same strand.
  • Loading branch information
nh13 committed May 8, 2017
1 parent 86caa01 commit c8ae7fa
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 15 deletions.
23 changes: 23 additions & 0 deletions src/main/scala/com/fulcrumgenomics/umi/DuplexConsensusCaller.scala
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,29 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
val (_, abR1s, abR2s) = subGroupRecords(ab)
val (_, baR1s, baR2s) = subGroupRecords(ba)

// Get all the alignments to one end of the source molecule
val singleStrand1 = abR1s ++ baR2s
val singleStrand2 = abR2s ++ baR1s

// The orientation of AB and BA reads should be:
// AB R1: + AB R2: -
// BA R1: - BA R2: +
// or vice versa (AB-R1: -, AB-R2: +, BA-R1: +, BA-R2: -)
// Therefore, AB-R1s and BA-R2s should be on the same strand, and the same for AB-R2s and BA-R1s
// Check for this explicitly here.
if (singleStrand1.nonEmpty) {
val ss1Flag = singleStrand1.head.getReadNegativeStrandFlag
val ss1MI = singleStrand1.head.getStringAttribute(ConsensusTags.MolecularId)
require(singleStrand1.forall(_.getReadNegativeStrandFlag == ss1Flag),
s"Not all AB-R1s and BA-R2s were on the same strand for molecule with id: $ss1MI")
}
if (singleStrand2.nonEmpty) {
val ss2Flag = singleStrand2.head.getReadNegativeStrandFlag
val ss2MI = singleStrand2.head.getStringAttribute(ConsensusTags.MolecularId)
require(singleStrand2.forall(_.getReadNegativeStrandFlag == ss2Flag),
s"Not all AB-R2s and BA-R1s were on the same strand for molecule with id: $ss2MI")
}

// Filter by common indel pattern with AB and BA together
val filteredXs = filterToMostCommonAlignment(abR1s ++ baR2s)
val filteredYs = filterToMostCommonAlignment(abR2s ++ baR1s)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,10 @@ trait UmiConsensusCaller[C <: SimpleRead] {
* NOTE: filtered out reads are sent to the [[rejectRecords()]] method and do not need further handling
*/
protected[umi] def filterToMostCommonAlignment(recs: Seq[SAMRecord]): Seq[SAMRecord] = {
if (recs.nonEmpty) {
require(recs.forall(r => r.getReadNegativeStrandFlag == recs.head.getReadNegativeStrandFlag),
"Not all records were on the same strand.")
}
val groups = recs.groupBy { r =>
val builder = new mutable.StringBuilder
val elems = r.getCigar.getCigarElements.iterator().bufferBetter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ package com.fulcrumgenomics.umi
import java.nio.file.Paths

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.testing.SamRecordSetBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamRecordSetBuilder, UnitSpec}
import htsjdk.samtools.SamReaderFactory

Expand Down Expand Up @@ -58,14 +59,24 @@ class CallDuplexConsensusReadsTest extends UnitSpec {
checkClpAnnotations[CallDuplexConsensusReads]
}

// Verifies that the tool rejects duplex input whose AB-R1s and BA-R2s disagree on strand.
// Neither pair sets strand1/strand2 explicitly, so both the AB and the BA pair use the builder's
// default orientation. NOTE(review): presumably R1 forward / R2 reverse - confirm against
// SamRecordSetBuilder.addPair. With identical orientations for AB and BA, AB-R1 and BA-R2 end up
// on opposite strands, which is the condition this test expects to be reported as an error.
it should "fail if AB-R1s are not on the same strand as BA-R2s" in {
  val builder = new SamRecordSetBuilder(readLength=10)
  builder.addPair(name="ab1", start1=100, start2=200, attrs=Map(MI -> "1/A")).foreach { _.setReadString("AAAAAAAAAA") }
  builder.addPair(name="ba1", start1=200, start2=100, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }

  val in  = builder.toTempFile()
  val out = makeTempFile("duplex.", ".bam")
  // Running the tool end-to-end must throw rather than silently emit a consensus.
  an[Exception] should be thrownBy new CallDuplexConsensusReads(input=in, output=out, readGroupId="ZZ").execute()
}

it should "run successfully and create consensus reads" in {
val builder = new SamRecordSetBuilder(readLength=10)
builder.addPair(name="ab1", start1=100, start2=100, attrs=Map(MI -> "1/A")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ab2", start1=100, start2=100, attrs=Map(MI -> "1/A")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ab3", start1=100, start2=100, attrs=Map(MI -> "1/A")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ba1", start1=100, start2=100, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ba2", start1=100, start2=100, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ba3", start1=100, start2=100, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ba1", start1=100, start2=100, strand1=Minus, strand2=Plus, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ba2", start1=100, start2=100, strand1=Minus, strand2=Plus, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }
builder.addPair(name="ba3", start1=100, start2=100, strand1=Minus, strand2=Plus, attrs=Map(MI -> "1/B")).foreach { _.setReadString("AAAAAAAAAA") }

val in = builder.toTempFile()
val out = makeTempFile("duplex.", ".bam")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
package com.fulcrumgenomics.umi

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.testing.SamRecordSetBuilder.Minus
import com.fulcrumgenomics.testing.SamRecordSetBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamRecordSetBuilder, UnitSpec}
import com.fulcrumgenomics.umi.UmiConsensusCaller.SourceRead
import com.fulcrumgenomics.umi.VanillaUmiConsensusCallerOptions._
Expand Down Expand Up @@ -398,6 +398,20 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues {
consensus.getFloatAttribute(ConsensusTags.PerRead.RawReadErrorRate) shouldBe 0.toFloat
}

// The first read has a no-call (N) as its last base, which leads to an N at that position in the
// consensus. The error count at that position is still expected to be 1 (the single T among the
// three remaining G/G/T bases), and the depth there is expected to be 3 rather than 4.
it should "calculate the # of errors relative to the most likely consensus call, even when the final call is an N" in {
  val caller = cc(cco(minReads=4))
  val reads  = Seq("GATTACAN", "GATTACAG", "GATTACAG", "GATTACAT").map { bases =>
    src(bases, Array(20,20,20,20,20,20,20,20))
  }
  val consensus = caller.consensusCall(reads).value

  consensus.baseString shouldBe "GATTACAN"
  consensus.depths should contain theSameElementsInOrderAs Seq(4,4,4,4,4,4,4,3)
  consensus.errors should contain theSameElementsInOrderAs Seq(0,0,0,0,0,0,0,1)
}

"VanillaUmiConsensusCaller.filterToMostCommonAlignment" should "return all reads when all cigars are 50M" in {
val builder = new SamRecordSetBuilder(readLength=50)
(1 to 10).foreach { i => builder.addFrag(start=100, cigar="50M") }
Expand Down Expand Up @@ -441,18 +455,13 @@ class VanillaUmiConsensusCallerTest extends UnitSpec with OptionValues {
recs.map(_.getCigarString).distinct.sorted shouldBe Seq("25M1D25M", "5S20M1D25M", "5S20M1D20M5H", "25M1D20M5S").sorted
}

it should "calculate the # of errors relative to the most likely consensus call, even when the final call is an N" in {
// NB: missing last base on the first read, which causes an N no-call, but errors should still be 1/3
val call = cc(cco(minReads=4)).consensusCall(Seq(
src("GATTACAN", Array(20,20,20,20,20,20,20,20)),
src("GATTACAG", Array(20,20,20,20,20,20,20,20)),
src("GATTACAG", Array(20,20,20,20,20,20,20,20)),
src("GATTACAT", Array(20,20,20,20,20,20,20,20))
)).value

call.baseString shouldBe "GATTACAN"
call.depths should contain theSameElementsInOrderAs Seq(4,4,4,4,4,4,4,3)
call.errors should contain theSameElementsInOrderAs Seq(0,0,0,0,0,0,0,1)
// Verifies that filterToMostCommonAlignment rejects input containing reads from both strands.
it should "throw an exception if reads were given from opposite strands" in {
  val builder = new SamRecordSetBuilder(readLength=50)
  // All fragments share the same start and cigar; only the strand alternates (even i => Plus,
  // odd i => Minus), so the mixed strands are the only thing that can trigger the failure.
  (1 to 5).foreach { i => builder.addFrag(start=100, cigar="25M1D25M", strand = if (i % 2 == 0) Plus else Minus) }

  an[Exception] should be thrownBy cc().filterToMostCommonAlignment(builder.toSeq)
}

"VanillaConsensusCaller.toSourceRead" should "mask bases that are below the quality threshold" in {
Expand Down

0 comments on commit c8ae7fa

Please sign in to comment.