From 06065c68d601513c56e45d065c1a0109e8596df2 Mon Sep 17 00:00:00 2001 From: Ted Brookings Date: Fri, 29 Mar 2024 16:32:51 -0400 Subject: [PATCH 01/10] Add DownsampleVcf tool --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 281 ++++++++++ .../vcf/DownsampleVcfTest.scala | 487 ++++++++++++++++++ 2 files changed, 768 insertions(+) create mode 100644 src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala create mode 100644 src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala new file mode 100644 index 000000000..04fe4a087 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -0,0 +1,281 @@ +package com.fulcrumgenomics.vcf + +import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.commons.io.Io +import com.fulcrumgenomics.commons.util.LazyLogging +import com.fulcrumgenomics.fasta.SequenceDictionary +import com.fulcrumgenomics.sopt.{arg, clp} +import com.fulcrumgenomics.util.{Metric, ProgressLogger} +import com.fulcrumgenomics.vcf.api.Allele.NoCallAllele +import com.fulcrumgenomics.vcf.api.{Allele, Genotype, Variant, VcfCount, VcfFieldType, VcfFormatHeader, VcfHeader, VcfSource, VcfWriter} +import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} +import com.fulcrumgenomics.vcf.DownsampleVcf.{downsampleAndRegenotype, winnowVariants} + +import scala.math.log10 +import scala.util.Random + +object DownsampleVcf extends LazyLogging { + /** Removes variants that are within a specified distance from a previous variant + * The end position of the current variant is compared with the start position of the following variant + * + * @param variants an iterator of the variants to process + * @param windowSize the interval (exclusive) in which to check for additional variants. + * windowSize considers the distance between the end position of a variant + * with the start position of the following variant + * @param dict a sequencing dictionary to get contig ordering + * @return a new iterator of variants with just the variant entries we want to keep + */ + def winnowVariants(variants: Iterator[Variant], windowSize: Int, dict: SequenceDictionary): Iterator[Variant] = { + require(windowSize >= 0, f"the windowSize ($windowSize) is negative") + new Iterator[Variant] { + private val iter = variants.bufferBetter + + def hasNext: Boolean = iter.hasNext + + def isInOrder(current: Variant, next: Variant, currentIndex: Int, nextIndex: Int): Boolean = { + (currentIndex < nextIndex) || (currentIndex == nextIndex && current.end <= next.pos) + } + + def next(): Variant = { + val current = iter.next() + val currentIndex = dict(current.chrom).index + iter.dropWhile { next: Variant => + val nextIndex = dict(next.chrom).index + require( + isInOrder(current, next, currentIndex, nextIndex), + f"variants out of order; ${current.chrom}:${current.pos} > ${next.chrom}:${next.pos}") + + currentIndex == nextIndex && next.pos - current.end < windowSize + } + current + } + } + } + + /** Downsamples variants using Allele Depths + * + * @param oldAds an indexed seq of the original allele depths + * @param proportion the proportion to use for downsampling, + * calculated using total base count from the index and a target base count + * @return a new IndexedSeq of allele depths of the same length as `oldAds` + */ + def downsampleADs(oldAds: IndexedSeq[Int], proportion: Double, random: Random): IndexedSeq[Int] = { + require(proportion <= 1, f"proportion must be less than 1: proportion = ${proportion}") + oldAds.map(s => Range(0, s).iterator.map(_ => random.nextDouble()).count(_ < proportion)) + } + + /** + * Does the downsampling on a Variant + * @param variant the variant with the genotype to downsample + * @param proportions a map of downsampling target proportions for each sample + * @param random random number generator for downsampling + * @param epsilon the error rate for genotyping + * @return a new variant with updated genotypes + */ + // Returns a new variant that has downsampled ADs, recomputed PLs and updated genotypes + def downsampleAndRegenotype(variant: Variant, proportions: Map[String, Double], random: Random, epsilon: Double = 0.01): Variant = { + try { + variant.copy(genotypes = variant.genotypes.map { case (sample, gt) => + val proportion = proportions(sample) + sample -> downsampleAndRegenotype(gt = gt, proportion = proportion, random = random, epsilon = epsilon) + }) + } catch { + case e: MatchError => throw new Exception( + "processing " + variant.id + " at " + variant.chrom + ":" + variant.pos + "-" + variant.end, e + ) + } + } + + /** + * Does the downsampling on a Genotype + * @param gt the genotype to downsample + * @param proportion the proportion to use for downsampling allele depths + * @param random random number generator for downsampling + * @param epsilon the error rate for genotyping + * @return a new Genotype with updated allele depths, PLs and genotype + */ + def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = { + val oldAds = gt[IndexedSeq[Int]]("AD") + val newAds = downsampleADs(oldAds, proportion, random) + val Seq(aa, ab, bb) = computePls(newAds) + val Seq(alleleA, alleleB) = gt.alleles.toSeq + + val calls = { + if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele) + else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA) + else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB) + else IndexedSeq(alleleA, alleleB) + } + gt.copy(attrs = Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls = calls) + } + + /** + * Compute the genotype likelihoods given the allele depths. + * @param ads The allele depths to generate likelihoods from + * @return a list of three likelihoods + */ + def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = { + val likelihoods = Likelihoods(ads(0), ads(1)) + IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt) + } + + + object Likelihoods { + /** Computes the likelihoods for each possible genotype. + * + * @param alleleDepthA the reference allele depth + * @param alleleDepthB the alternate allele depth + * @param epsilon the error rate for genotyping + * @return a new `Likelihood` that has the likelihoods of AA, AB, and BB + */ + def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = { + val aGivenAA = log10(1 - epsilon) + val aGivenBB = log10(epsilon) + val aGivenAB = log10((1 - epsilon) / 2) + + val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10 + val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10 + val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10 + + val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB) + + Likelihoods( + aa = rawGlAA - minGL, + ab = rawGlAB - minGL, + bb = rawGlBB - minGL + ) + } + } + + /** Stores the log10(likelihoods) for all possible bi-allelic genotypes. + * + * @param aa likelihood of AA + * @param ab likelihood of AB + * @param bb likelihood of BB + */ + case class Likelihoods(aa: Double, ab: Double, bb: Double) { + def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt) + } +} + + @clp(group = ClpGroups.VcfOrBcf, description = + """ + |DownsampleVcf takes a vcf file and metadata with sequencing info and + |1. winnows the vcf to remove variants within a specified distance to each other, + |2. downsamples the variants using the provided allele depths and target base count by + | re-computing/downsampling the allele depths for the new target base count + | and re-computing the genotypes based on the new allele depths + |and writes a new downsampled vcf file. + |For single-sample VCFs, the metadata file can be omitted, and instead you can specify originalBases. + """) + class DownsampleVcf + (@arg(flag = 'i', doc = "The vcf to downsample.") val input: PathToVcf, + @arg(flag = 'm', doc = "Index file with bases per sample.") val metadata: Option[FilePath] = None, + @arg(flag = 'b', doc = "Original number of bases (for single-sample VCF)") val originalBases: Option[Double] = None, + @arg(flag = 'n', doc = "Target number of bases to downsample to.") val downsampleToBases: Double, + @arg(flag = 'o', doc = "Output file name.") val output: PathToVcf, + @arg(flag = 'w', doc = "Winnowing window size.") val windowSize: Int = 150, + @arg(flag = 'e', doc = "Error rate for genotyping.") val epsilon: Double = 0.01, + @arg(flag = 'c', doc = "True to write out no-calls.") val writeNoCall: Boolean = false) + extends FgBioTool { + Io.assertReadable(input) + Io.assertReadable(metadata) + Io.assertCanWriteFile(output) + require(downsampleToBases > 0, "target base count must be greater than zero") + require(windowSize >= 0, "window size must be greater than or equal to zero") + require(0 <= epsilon && epsilon <= 1, "epsilon/error rate must be between 0 and 1") + originalBases match { + case Some(x) => + require(x > 0, "originalBases must be greater than zero") + require(metadata.isEmpty, "Must pass either originalBases (for single-sample VCF) or metadata, not both") + case None => + require(metadata.isDefined, "Must pass either originalBases (for single-sample VCF) or metadata, not both") + } + + override def execute(): Unit = { + val vcf = VcfSource(input) + val progress = ProgressLogger(logger, noun = "variants") + val proportions = ( + originalBases match { + case Some(x) => + require(vcf.header.samples.length == 1, "--original-bases requires a single-sample VCF") + LazyList(vcf.header.samples.head -> math.min(downsampleToBases / x, 1.0)) + case _ => + Sample.read(metadata.getOrElse(throw new RuntimeException)) + .filter(s => vcf.header.samples.contains(s.SAMPLE_NAME)) + .map(sample => sample.SAMPLE_NAME -> math.min(downsampleToBases / sample.BASE_COUNT.toDouble, 1.0)) + } + ).toMap + proportions.foreach { case (s, p) => logger.info(f"Downsampling $s with proportion ${p}%.4f") } + + val winnowed = if (windowSize > 0) winnowVariants(vcf.iterator, windowSize = windowSize, dict = vcf.header.dict) else vcf.iterator + val outputVcf = VcfWriter(path = output, header = buildOutputHeader(vcf.header)) + + val random = new Random(42) + winnowed.foreach { v => + val ds = downsampleAndRegenotype(v, proportions = proportions, random = random, epsilon = epsilon) + if (writeNoCall) { + outputVcf += ds + progress.record(ds) + } + else if (!ds.gts.forall(g => g.isNoCall)) { + outputVcf += ds + progress.record(ds) + } + } + + progress.logLast() + vcf.safelyClose() + outputVcf.close() + } + + def buildOutputHeader(in: VcfHeader): VcfHeader = { + val fmts = Seq.newBuilder[VcfFormatHeader] + fmts ++= in.formats + + if (!in.format.contains("AD")) { + fmts += VcfFormatHeader(id="AD", count=VcfCount.OnePerAllele, kind=VcfFieldType.Integer, description="Per allele depths.") + } + + if (!in.format.contains("DP")) { + fmts += VcfFormatHeader(id="DP", count=VcfCount.Fixed(1), kind=VcfFieldType.Integer, description="Total depth across alleles.") + } + + if (!in.format.contains("PL")) { + fmts += VcfFormatHeader(id="PL", count=VcfCount.OnePerGenotype, kind=VcfFieldType.Integer, description="Per genotype phred scaled likelihoods.") + } + + in.copy(formats = fmts.result()) + } + } + +object Sample { + /** Load a set of samples from the 1KG metadata file. */ + def read(path: FilePath): Seq[Sample] = { + val lines = Io.readLines(path).dropWhile(_.startsWith("##")).map(line => line.dropWhile(_ == '#')) + Metric.read[Sample](lines=lines) + } +} + +case class Sample(ENA_FILE_PATH: String = ".", + MD5SUM: String = ".", + RUN_ID: String = ".", + STUDY_ID: String = ".", + STUDY_NAME: String = ".", + CENTER_NAME: String = ".", + SUBMISSION_ID: String = ".", + SUBMISSION_DATE: String = ".", + SAMPLE_ID: String = ".", + SAMPLE_NAME: String, + POPULATION: String = ".", + EXPERIMENT_ID: String = ".", + INSTRUMENT_PLATFORM: String = ".", + INSTRUMENT_MODEL: String = ".", + LIBRARY_NAME: String = ".", + RUN_NAME: String = ".", + INSERT_SIZE: String = ".", + LIBRARY_LAYOUT: String = ".", + PAIRED_FASTQ: String = ".", + READ_COUNT: String = ".", + BASE_COUNT: Long, + ANALYSIS_GROUP: String = ".") extends Metric diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala new file mode 100644 index 000000000..c3e8b7aba --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -0,0 +1,487 @@ +package com.fulcrumgenomics.vcf + +import com.fulcrumgenomics.testing.VcfBuilder +import com.fulcrumgenomics.testing.VcfBuilder.Gt +import com.fulcrumgenomics.util.Metric +import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele +import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant} +import com.fulcrumgenomics.testing.UnitSpec +import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype} + +import scala.util.Random + +class DownsampleVcfTest extends UnitSpec { + + private val dict = VcfBuilder(Seq.empty).header.dict + + private def winnow(variants: Iterator[Variant], window: Int = 10): Iterator[Variant] = { + DownsampleVcf.winnowVariants(variants, window, dict) + } + + "DownsampleVcf.winnowVariants" should "return an empty iterator when given an empty iterator" in { + winnow(Iterator.empty, window = 0) shouldBe Symbol("empty") + } + + it should "return a single variant when given a single variant" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","G")) + winnow(builder.iterator, window=0).toSeq should contain theSameElementsInOrderAs builder.iterator.toSeq + } + + it should "throw an exception if the window is negative" in { + val builder = VcfBuilder(Seq.empty) + val ex = intercept[Exception] { winnow(builder.iterator, window= -2).toSeq } + ex.getMessage should include ("negative") + } + + it should "keep both variants if distance is one more than the boundary" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","G")) + builder.add(pos=14, alleles=Seq("G","C")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs builder.iterator.toSeq + } + + it should "keep both variants if distance is the boundary" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","G")) + builder.add(pos=13, alleles=Seq("G","C")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs builder.iterator.toSeq + } + + it should "keep the first variant if distance is within specified window" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","G")) + builder.add(pos=9, alleles=Seq("G","C")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs Iterator(builder.iterator.next()).toSeq + } + + it should "keep both variants if genomic loci is in range, but on different chromosomes" in { + val builder = VcfBuilder(Seq.empty) + builder.add(chrom="chr1", pos=3, alleles=Seq("A","G")) + builder.add(chrom="chr2", pos=9, alleles=Seq("G","C")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs builder.iterator.toSeq + } + + it should "keep the first variant if three variants are within the specified window" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","G")) + builder.add(pos=9, alleles=Seq("G","C")) + builder.add(pos=12, alleles=Seq("G","T")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs Iterator(builder.iterator.next()).toSeq + } + + it should "remove the middle variant if variants a and b are within window, b and c are within window, and a and c are far enough apart" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","G")) + builder.add(pos=11, alleles=Seq("G","C")) + builder.add(pos=14, alleles=Seq("G","T")) + val answer = builder.iterator.filterNot(_.pos==11) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs answer.toSeq + } + + it should "throw an exception if the chromosomes are out of order" in { + val builder = VcfBuilder(Seq.empty) + builder.add(chrom="chr2", pos=3, alleles=Seq("A","G")) + builder.add(chrom="chr1", pos=9, alleles=Seq("G","C")) + val ex = intercept[Exception] { winnow(builder.iterator.toSeq.reverseIterator, window=10).toSeq } + ex.getMessage should include ("variants out of order") + } + + it should "throw an exception if the positions within a chromosome are out of order" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=9, alleles=Seq("A","G")) + builder.add(pos=3, alleles=Seq("G","C")) + val ex = intercept[Exception] { winnow(builder.iterator.toSeq.reverseIterator, window=10).toSeq } + ex.getMessage should include ("variants out of order") + } + + it should "keep the first variant if there are duplicate variants" in { + // VcfBuilder doesn't support duplicate variants + val builder = VcfBuilder(Seq.empty) + builder.add(pos=9, alleles=Seq("A","G")) + val builder2 = VcfBuilder(Seq.empty) + builder2.add(pos=9, alleles=Seq("A","G")) + val builder3 = Iterator(builder.iterator.next()) ++ Iterator(builder2.iterator.next()) + winnow(builder3.iterator, window=10).toSeq should contain theSameElementsInOrderAs Iterator(builder.iterator.next()).toSeq + } + + it should "keep the first variant if there are variants at the same contig/position" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=9, alleles=Seq("A","G")) + val builder2 = VcfBuilder(Seq.empty) + builder2.add(pos=9, alleles=Seq("A","T")) + val builder3 = Iterator(builder.iterator.next()) ++ Iterator(builder2.iterator.next()) + winnow(builder3.iterator, window=10).toSeq should contain theSameElementsInOrderAs Iterator(builder.iterator.next()).toSeq + } + + it should "keep both variants if there is an insertion and the following SNP is outside the window" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","AGGG")) + builder.add(pos=14, alleles=Seq("C","T")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs builder.toSeq + } + + it should "keep the first variant if there is an insertion and the following SNP is within the window" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("A","AGGG")) + builder.add(pos=12, alleles=Seq("C","T")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs Iterator(builder.iterator.next()).toSeq + } + + it should "keep both variants if there is a deletion and the following SNP is outside the window" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("AGGG","A")) + builder.add(pos=16, alleles=Seq("C","G")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs builder.toSeq + } + + it should "keep the first variant if there is a deletion and the following SNP is within the window" in { + val builder = VcfBuilder(Seq.empty) + builder.add(pos=3, alleles=Seq("AGGG","A")) + builder.add(pos=15, alleles=Seq("C","G")) + winnow(builder.iterator, window=10).toSeq should contain theSameElementsInOrderAs Iterator(builder.iterator.next()).toSeq + } + + /* + Testing DownsampleVcf.downsampleADs + */ + + val random = new Random(60) + + "DownsampleVcf.downsampleADs" should "return the same # of ADs as was given" in { + val ad = IndexedSeq(1, 2) + val expected = DownsampleVcf.downsampleADs(ad, proportion = 0.1, random) + expected.length shouldBe ad.length + } + + it should "downsample to roughly 1% of the previous allele number" in { + val ad = IndexedSeq(10000, 25000) + val expected = DownsampleVcf.downsampleADs(ad, proportion = 0.1, random) + expected should contain theSameElementsInOrderAs Seq(1003, 2523) + } + + it should "return 0 if the input allele depths are 0" in { + val ad = IndexedSeq(0, 0) + val expected = DownsampleVcf.downsampleADs(ad, proportion = 0.1, random) + expected should contain theSameElementsInOrderAs Seq(0,0) + } + + it should "return a small number or 0 if the input ADs are small" in { + val ad = IndexedSeq(1, 10) + val expected = DownsampleVcf.downsampleADs(ad, proportion = 0.1, random) + expected should contain theSameElementsInOrderAs Seq(0,1) + } + + it should "return the same values if the proportion is 1" in { + val ad = IndexedSeq(100, 100) + val expected = DownsampleVcf.downsampleADs(ad, proportion = 1, random) + expected should contain theSameElementsInOrderAs Seq(100, 100) + } + + it should "throw an exception if the proportion is greater than 1" in { + val ad = IndexedSeq(100, 200) + val ex = intercept[Exception] { DownsampleVcf.downsampleADs(ad, proportion = 1.1, random) } + ex.getMessage should include ("proportion must be less than 1") + } + + "DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in { + val ads = IndexedSeq[Int](0, 100) + val expected = IndexedSeq(1996, 301, 0) + val newlikelihoods = computePls(ads) + newlikelihoods should contain theSameElementsInOrderAs expected + } + + /* + Testing DownsampleVcf.Likelihoods + */ + + "DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in { + val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0) + val expected = IndexedSeq[Int](0, 0, 0) + likelihood.pls.length shouldBe expected.length + likelihood.pls should contain theSameElementsInOrderAs expected + } + + it should "return a likelihood of 0 for AA if there are only ref alleles observed" in { + val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0) + val expected = IndexedSeq[Int](0, 30, 200) + likelihood.pls should contain theSameElementsInOrderAs expected + } + + it should "return a likelihood of 0 for BB if there are only alt alleles observed" in { + val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10) + val expected = IndexedSeq[Int](200, 30, 0) + likelihood.pls should contain theSameElementsInOrderAs expected + } + + it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in { + val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5) + val expected = IndexedSeq[Int](70, 0, 70) + likelihood.pls should contain theSameElementsInOrderAs expected + } + + it should "return a likelihood of 0 for AA if the AD A >> AD B" in { + val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2) + likelihood.pls(0) == 0 + } + + it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in { + val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17) + likelihood.pls(1) == 0 + } + + it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in { + val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30) + likelihood.pls(2) == 0 + } + + it should "return correct values when there are very few reads" in { + Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0) + Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20) + Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14) + Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0) + Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11) + } + + + /* + testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes + */ + private def makeGt(ref: String, alt: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = { + Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))), + sample=sample, + calls=IndexedSeq[Allele](Allele(ref), Allele(alt)), + attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1)))) + } + + + "DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in { + val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("."), Allele(".")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + it should "return two ref alleles if the ref AD is much larger than the alt AD" in { + val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(100,0)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("A")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + it should "return two alt alleles if the alt AD is greater than the ref AD" in { + val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,100)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("T"), Allele("T")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + it should "return two alt alleles if ref and alt AD > 0 but ref << alt" in { + val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(30,200)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("T"), Allele("T")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + it should "return het if ref and alt ADs are similar but ref < alt" in { + val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(190,200)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + + it should "return a ref and alt allele if the ref and alt ADs are the same" in { + val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(100,100)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + /* + testing DownsampleVcf.downsampleAndRegenotype on Variant + */ + + private def makeVariant(ref: String, alt: String, sample: String = "test", ads: IndexedSeq[Int]): Variant = { + Variant(chrom="1", + pos=10, + alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)), + genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample)) + ) + } + + "DownsampleVcf.downsampleAndRegenotype(Variant)" should "return no call alleles if depths are 0" in { + val variant = makeVariant(ref="A", alt="T", ads=IndexedSeq(0,0)) + val newVariant = DownsampleVcf.downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42)) + val expected = IndexedSeq(Allele("."), Allele(".")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + it should "return ref alleles if ref AD > 0 and alt is 0" in { + val variant = makeVariant(ref="A", alt="T", ads=IndexedSeq(100,0)) + val newVariant = DownsampleVcf.downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42)) + val expected = IndexedSeq(Allele("A"), Allele("A")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + it should "return ref if ref and alt > 0 but ref > alt" in { + val variant = makeVariant(ref="A", alt="T", ads=IndexedSeq(200,20)) + val newVariant = DownsampleVcf.downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42)) + val expected = IndexedSeq(Allele("A"), Allele("A")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + it should "return alts if alt AD > 0 and ref AD = 0" in { + val variant = makeVariant(ref="A", alt="T", ads=IndexedSeq(0,100)) + val newVariant = DownsampleVcf.downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42)) + val expected = IndexedSeq(Allele("T"), Allele("T")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + it should "return het if ref and alt ADs are the same" in { + val variant = makeVariant(ref="A", alt="T", ads=IndexedSeq(500,500)) + val newVariant = DownsampleVcf.downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42)) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + private val sample = "test1" + private val builder = VcfBuilder(samples=Seq(sample)) + builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="0/1", ad=Seq(1000,1000), pl=Seq(13936,0,13936)))) // should stay + builder.add(chrom="chr1", pos=200, id="2", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="0/0", ad=Seq(1000,0), pl=Seq(0,3010,19956)))) // should be removed + builder.add(chrom="chr1", pos=400, id="3", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="1/1", ad=Seq(0,1000), pl=Seq(19956,3010,0)))) // should stay + builder.add(chrom="chr1", pos=600, id="4", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="1/1", ad=Seq(0,3), pl=Seq(60,9,0)))) // should be removed + builder.add(chrom="chr1", pos=800, id="5", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="1/1", ad=Seq(20,1000), pl=Seq(19557,2671,0)))) // should stay + builder.add(chrom="chr1", pos=1000, id="6", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="1/1", ad=Seq(1000,10), pl=Seq(0,2841,19757)))) // should stay + builder.add(chrom="chr1", pos=1200, id="7", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="1/1", ad=Seq(1,2), pl=Seq(31,0,11)))) // should be removed + builder.add(chrom="chr1", pos=1360, id="8", alleles=Seq("A", "C"), info=Map(), + gts=Seq(Gt(sample=sample, gt="1/1", ad=Seq(800,900), pl=Seq(12843,0,10848)))) // should stay + val inVcf = builder.toTempFile() + // Make the metadata file + val metadata = makeTempFile("metadata", ".txt") + Metric.write(metadata, Seq(Sample(SAMPLE_NAME = sample, BASE_COUNT = 100))) + + "DownsampleVcf" should "write a new vcf with downsampled genotypes when provided a vcf" in { + List(true, false).foreach( + use_metdata => { + // Construct the input VCF + val outVcf = makeTempFile("out", ".vcf.gz") + if(use_metdata) { + new DownsampleVcf(input = inVcf, output = outVcf, metadata = Some(metadata), downsampleToBases = 1).execute() + } else { + new DownsampleVcf(input = inVcf, output = outVcf, originalBases = Some(100), downsampleToBases = 1).execute() + } + + val vs = readVcfRecs(outVcf) + vs should have length 5 + + val ad0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("AD") + ad0(0) < 110 shouldBe true; + ad0(1) < 110 shouldBe true + val pl0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("PL") + pl0(1) shouldBe 0 + + val ad1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("AD") + ad1(0) shouldBe 0; + ad1(1) < 110 shouldBe true + val pl1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("PL") + pl1(2) shouldBe 0 + + val ad2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("AD") + ad2(0) < 30 shouldBe true; + ad2(1) < 110 shouldBe true + val pl2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("PL") + pl2(2) shouldBe 0 + + val ad3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("AD") + ad3(0) < 110 shouldBe true; + ad3(1) shouldBe 0; + val pl3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("PL") + pl3(0) shouldBe 0 + + val ad4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("AD") + ad4(0) < 100 shouldBe true; + ad4(1) < 100 shouldBe true; + ad4(0) > 1 shouldBe true; + ad4(1) > 2 shouldBe true; + val pl4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("PL") + pl4(1) shouldBe 0 + } + ) + + } + + "DownsampleVcf" should "write a new vcf with downsampled genotypes when provided a vcf, keeping nocalls" in { + // Construct the input VCF + List(true, false).foreach( + use_metdata => { + // Construct the input VCF + val outVcf = makeTempFile("out", ".vcf.gz") + if (use_metdata) { + new DownsampleVcf(input = inVcf, output = outVcf, metadata = Some(metadata), downsampleToBases = 1, writeNoCall = true).execute() + } else { + new DownsampleVcf(input = inVcf, output = outVcf, originalBases = Some(100), downsampleToBases = 1, writeNoCall = true).execute() + } + + val vs = readVcfRecs(outVcf) + vs should have length 7 + + val ad0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("AD") + ad0(0) < 110 shouldBe true; + ad0(1) < 110 shouldBe true + val pl0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("PL") + pl0(1) shouldBe 0 + + val ad1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("AD") + ad1(0) shouldBe 0; + ad1(1) < 110 shouldBe true + val pl1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("PL") + pl1(2) shouldBe 0 + + val ad2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("AD") + ad2(0) shouldBe 0; + ad2(1) shouldBe 0 + val pl2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("PL") + pl2(0) shouldBe 0; + pl2(1) shouldBe 0; + pl2(2) shouldBe 0 + + val ad3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("AD") + ad3(0) < 30 shouldBe true; + ad3(1) < 110 shouldBe true + val pl3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("PL") + pl3(2) shouldBe 0 + + val ad4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("AD") + ad4(0) < 110 shouldBe true; + ad4(1) shouldBe 0; + val pl4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("PL") + pl4(0) shouldBe 0 + + val ad5 = vs(5).genotypes("test1")[IndexedSeq[Int]]("AD") + ad5(0) shouldBe 0; + ad5(1) shouldBe 0 + val pl5 = vs(5).genotypes("test1")[IndexedSeq[Int]]("PL") + pl5(0) shouldBe 0; + pl5(1) shouldBe 0; + pl5(2) shouldBe 0 + + val ad6 = vs(6).genotypes("test1")[IndexedSeq[Int]]("AD") + ad6(0) < 100 shouldBe true; + ad6(1) < 100 shouldBe true; + ad6(0) > 1 shouldBe true; + ad6(1) > 2 shouldBe true; + val pl6 = vs(6).genotypes("test1")[IndexedSeq[Int]]("PL") + pl6(1) shouldBe 0 + } + ) + } +} + From 9d1aa063b3c140a6f433415fc3f484e3aa0f559d Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 2 Apr 2024 16:55:36 -0700 Subject: [PATCH 02/10] make changes suggested in PR --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 260 ++++++++++-------- 1 file changed, 151 insertions(+), 109 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index 04fe4a087..b2c40eb0c 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -15,9 +15,8 @@ import scala.math.log10 import scala.util.Random object DownsampleVcf extends LazyLogging { - /** Removes variants that are within a specified distance from a previous variant - * The end position of the current variant is compared with the start position of the following variant - * + /** Removes variants that are within a specified distance from a previous variant. + * The end position of the current variant is compared with the start position of the following variant. * @param variants an iterator of the variants to process * @param windowSize the interval (exclusive) in which to check for additional variants. * windowSize considers the distance between the end position of a variant @@ -32,7 +31,7 @@ object DownsampleVcf extends LazyLogging { def hasNext: Boolean = iter.hasNext - def isInOrder(current: Variant, next: Variant, currentIndex: Int, nextIndex: Int): Boolean = { + private def isInOrder(current: Variant, next: Variant, currentIndex: Int, nextIndex: Int): Boolean = { (currentIndex < nextIndex) || (currentIndex == nextIndex && current.end <= next.pos) } @@ -52,32 +51,33 @@ object DownsampleVcf extends LazyLogging { } } - /** Downsamples variants using Allele Depths - * + /** Downsamples variants by randomly sampling the total allele depths at the given proportion. * @param oldAds an indexed seq of the original allele depths * @param proportion the proportion to use for downsampling, * calculated using total base count from the index and a target base count * @return a new IndexedSeq of allele depths of the same length as `oldAds` */ - def downsampleADs(oldAds: IndexedSeq[Int], proportion: Double, random: Random): IndexedSeq[Int] = { + def downsampleADs(oldAds: IterableOnce[Int], proportion: Double, random: Random): IndexedSeq[Int] = { require(proportion <= 1, f"proportion must be less than 1: proportion = ${proportion}") - oldAds.map(s => Range(0, s).iterator.map(_ => random.nextDouble()).count(_ < proportion)) + oldAds.iterator.toIndexedSeq.map(s => Range(0, s).iterator.map(_ => random.nextDouble()).count(_ < proportion)) } /** - * Does the downsampling on a Variant - * @param variant the variant with the genotype to downsample - * @param proportions a map of downsampling target proportions for each sample + * Re-genotypes a variant for each sample after downsampling the allele counts based on the given + * per-sample proportions. + * @param variant the variant to downsample and re-genotype + * @param proportions proportion to downsample the allele counts for each sample prior to re-genotyping * @param random random number generator for downsampling - * @param epsilon the error rate for genotyping - * @return a new variant with updated genotypes + * @param epsilon the sequencing error rate for genotyping + * @return a new variant with updated genotypes, downsampled ADs, and recomputed PLs */ - // Returns a new variant that has downsampled ADs, recomputed PLs and updated genotypes - def downsampleAndRegenotype(variant: Variant, proportions: Map[String, Double], random: Random, epsilon: Double = 0.01): Variant = { + def downsampleAndRegenotype(variant: Variant, + proportions: Map[String, Double], + random: Random, epsilon: Double=0.01): Variant = { try { - variant.copy(genotypes = variant.genotypes.map { case (sample, gt) => + variant.copy(genotypes=variant.genotypes.map { case (sample, gt) => val proportion = proportions(sample) - sample -> downsampleAndRegenotype(gt = gt, proportion = proportion, random = random, epsilon = epsilon) + sample -> downsampleAndRegenotype(gt=gt, proportion=proportion, random=random, epsilon=epsilon) }) } catch { case e: MatchError => throw new Exception( @@ -87,15 +87,15 @@ object DownsampleVcf extends LazyLogging { } /** - * Does the downsampling on a Genotype + * Re-genotypes a sample after downsampling the allele counts based on the given proportion. * @param gt the genotype to downsample - * @param proportion the proportion to use for downsampling allele depths + * @param proportion proportion to downsample the allele count prior to re-genotyping * @param random random number generator for downsampling - * @param epsilon the error rate for genotyping - * @return a new Genotype with updated allele depths, PLs and genotype + * @param epsilon the sequencing error rate for genotyping + * @return a new Genotype with updated allele depths, PLs, and genotype */ def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = { - val oldAds = gt[IndexedSeq[Int]]("AD") + val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}")) val newAds = downsampleADs(oldAds, proportion, random) val Seq(aa, ab, bb) = computePls(newAds) val Seq(alleleA, alleleB) = gt.alleles.toSeq @@ -106,20 +106,21 @@ object DownsampleVcf extends LazyLogging { else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB) else IndexedSeq(alleleA, alleleB) } - gt.copy(attrs = Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls = calls) + gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls) } /** - * Compute the genotype likelihoods given the allele depths. - * @param ads The allele depths to generate likelihoods from - * @return a list of three likelihoods + * Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e. + * two allele depths). + * @param ads The input depths for the two alleles A and B. + * @return a list of three likelihoods for the alleles AA, AB, and BB. */ def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = { + require(ads.length == 2, "there must be exactly two allele depths") val likelihoods = Likelihoods(ads(0), ads(1)) IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt) } - object Likelihoods { /** Computes the likelihoods for each possible genotype. * @@ -128,7 +129,7 @@ object DownsampleVcf extends LazyLogging { * @param epsilon the error rate for genotyping * @return a new `Likelihood` that has the likelihoods of AA, AB, and BB */ - def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = { + def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = { val aGivenAA = log10(1 - epsilon) val aGivenBB = log10(epsilon) val aGivenAB = log10((1 - epsilon) / 2) @@ -154,102 +155,143 @@ object DownsampleVcf extends LazyLogging { * @param bb likelihood of BB */ case class Likelihoods(aa: Double, ab: Double, bb: Double) { + /** + * Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag). + * @return a list of phred-scaled likelihooodS for AA, AB, BB. + */ def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt) } } - @clp(group = ClpGroups.VcfOrBcf, description = - """ - |DownsampleVcf takes a vcf file and metadata with sequencing info and - |1. winnows the vcf to remove variants within a specified distance to each other, - |2. downsamples the variants using the provided allele depths and target base count by - | re-computing/downsampling the allele depths for the new target base count - | and re-computing the genotypes based on the new allele depths - |and writes a new downsampled vcf file. - |For single-sample VCFs, the metadata file can be omitted, and instead you can specify originalBases. - """) - class DownsampleVcf - (@arg(flag = 'i', doc = "The vcf to downsample.") val input: PathToVcf, - @arg(flag = 'm', doc = "Index file with bases per sample.") val metadata: Option[FilePath] = None, - @arg(flag = 'b', doc = "Original number of bases (for single-sample VCF)") val originalBases: Option[Double] = None, - @arg(flag = 'n', doc = "Target number of bases to downsample to.") val downsampleToBases: Double, - @arg(flag = 'o', doc = "Output file name.") val output: PathToVcf, - @arg(flag = 'w', doc = "Winnowing window size.") val windowSize: Int = 150, - @arg(flag = 'e', doc = "Error rate for genotyping.") val epsilon: Double = 0.01, - @arg(flag = 'c', doc = "True to write out no-calls.") val writeNoCall: Boolean = false) - extends FgBioTool { - Io.assertReadable(input) - Io.assertReadable(metadata) - Io.assertCanWriteFile(output) - require(downsampleToBases > 0, "target base count must be greater than zero") - require(windowSize >= 0, "window size must be greater than or equal to zero") - require(0 <= epsilon && epsilon <= 1, "epsilon/error rate must be between 0 and 1") - originalBases match { - case Some(x) => - require(x > 0, "originalBases must be greater than zero") - require(metadata.isEmpty, "Must pass either originalBases (for single-sample VCF) or metadata, not both") - case None => - require(metadata.isDefined, "Must pass either originalBases (for single-sample VCF) or metadata, not both") +@clp(group=ClpGroups.VcfOrBcf, description = + """ + |Re-genotypes a VCF after downsampling the allele counts. + | + |The input VCF must have at least one sample. + | + |If the input VCF contains a single sample, the downsampling target may be specified as a + |proportion of the original read depth using `--proportion=(0..1)`, or as the combination of + |the original and target _number of sequenced bases_ (`--originalBases` and + |`--downsampleToBases`). For multi-sample VCFs, the downsampling target must be specified using + |`--downsampleToBases`, and a metadata file with the total number of sequenced bases per sample + |is required as well. The metadata file must follow the + |[[https://www.internationalgenome.org/category/meta-data/] 1000 Genomes index format], but the + |only required columns are `SAMPLE_NAME` and `BASE_COUNT`. A propportion for each sample is + |calculated by dividing the _target number of sequenced bases_ by the _original number of + |sequenced bases_. + | + |The tool first (optionally) winnows the VCF file to remove variants within a distance to each + |other specified by `--window-size` (the default value of `0` disables winnowing). Next, each + |sample at each variant is examined independently. The allele depths per-genotype are randoml + |downsampled given the proportion. The downsampled allele depths are then used to re-compute + |allele likelhoods and produce a new genotype. + | + |The tool outputs a downsampled VCF file with the winnowed variants removed, and with the + |genotype calls and `DP`, `AD`, and `PL` tags updated for each sample at each retained variant. +""") +class DownsampleVcf +(@arg(flag='i', doc="The vcf to downsample.") val input: PathToVcf, + @arg(flag='p', doc="Proportion of bases to retain (for single-sample VCF).") val proportion: Option[Double] = None, + @arg(flag='b', doc="Original number of bases (for single-sample VCF).") val originalBases: Option[Double] = None, + @arg(flag='m', doc="Index file with bases per sample.") val metadata: Option[FilePath] = None, + @arg(flag='n', doc="Target number of bases to downsample to.") val downsampleToBases: Option[Double], + @arg(flag='o', doc="Output file name.") val output: PathToVcf, + @arg(flag='w', doc="Winnowing window size.") val windowSize: Int = 0, + @arg(flag='e', doc="Sequencing Error rate for genotyping.") val epsilon: Double = 0.01, + @arg(flag='c', doc="True to write out no-calls.") val writeNoCall: Boolean = false, + @arg(flag='s', doc="Random seed value.") val seed: Int = 42, + ) extends FgBioTool { + Io.assertReadable(input) + Io.assertCanWriteFile(output) + require(windowSize >= 0, "window size must be greater than or equal to zero") + require(0 <= epsilon && epsilon <= 1, "epsilon/error rate must be between 0 and 1") + (proportion, originalBases, metadata, downsampleToBases) match { + case (Some(x), None, None, None) => + require(x > 0, "proportion must be greater than 0") + require(x < 1, "proportion must be less than 1") + case (None, Some(original), None, Some(target)) => + require(original > 0, "originalBases must be greater than zero") + require(target > 0, "target base count must be greater than zero") + case (None, None, Some(metadata), Some(target)) => + Io.assertReadable(metadata) + require(target > 0, "target base count must be greater than zero") + case (None, _, _, None) => + throw new IllegalArgumentException( + "exactly one of proportion or downsampleToBases must be specified" + ) + case _ => + throw new IllegalArgumentException( + "exactly one of proportion, originalBases, or metadata must be specified" + ) + } + + override def execute(): Unit = { + val vcf = VcfSource(input) + val proportions = ( + (proportion, originalBases, metadata, downsampleToBases) match { + case (Some(x), None, None, None) => + require(vcf.header.samples.length == 1, "--original-bases requires a single-sample VCF") + LazyList(vcf.header.samples.head -> x) + case (None, Some(original), None, Some(target)) => + require(vcf.header.samples.length == 1, "--original-bases requires a single-sample VCF") + LazyList(vcf.header.samples.head -> math.min(target / original, 1.0)) + case (None, None, Some(metadata), Some(target)) => + Sample.read(metadata) + .filter(s => vcf.header.samples.contains(s.SAMPLE_NAME)) + .map(sample => sample.SAMPLE_NAME -> math.min(target / sample.BASE_COUNT.toDouble, 1.0)) + case _ => + throw new RuntimeException("unexpected parameter combination") + } + ).toMap + proportions.foreach { case (s, p) => logger.info(f"Downsampling $s with proportion ${p}%.4f") } + + val inputProgress = ProgressLogger(logger, noun="variants read") + val inputVariants = ProgressLogger.ProgressLoggingIterator(vcf.iterator).progress(inputProgress) + val winnowed = if (windowSize > 0) { + val winnowed = winnowVariants(inputVariants, windowSize=windowSize, dict=vcf.header.dict) + val winnowedProgress = ProgressLogger(logger, noun="variants retained") + ProgressLogger.ProgressLoggingIterator(winnowed).progress(winnowedProgress) + } else { + inputVariants } + val outputVcf = VcfWriter(path=output, header=buildOutputHeader(vcf.header)) - override def execute(): Unit = { - val vcf = VcfSource(input) - val progress = ProgressLogger(logger, noun = "variants") - val proportions = ( - originalBases match { - case Some(x) => - require(vcf.header.samples.length == 1, "--original-bases requires a single-sample VCF") - LazyList(vcf.header.samples.head -> math.min(downsampleToBases / x, 1.0)) - case _ => - Sample.read(metadata.getOrElse(throw new RuntimeException)) - .filter(s => vcf.header.samples.contains(s.SAMPLE_NAME)) - .map(sample => sample.SAMPLE_NAME -> math.min(downsampleToBases / sample.BASE_COUNT.toDouble, 1.0)) - } - ).toMap - proportions.foreach { case (s, p) => logger.info(f"Downsampling $s with proportion ${p}%.4f") } - - val winnowed = if (windowSize > 0) winnowVariants(vcf.iterator, windowSize = windowSize, dict = vcf.header.dict) else vcf.iterator - val outputVcf = VcfWriter(path = output, header = buildOutputHeader(vcf.header)) - - val random = new Random(42) - winnowed.foreach { v => - val ds = downsampleAndRegenotype(v, proportions = proportions, random = random, epsilon = epsilon) - if (writeNoCall) { - outputVcf += ds - progress.record(ds) - } - else if (!ds.gts.forall(g => g.isNoCall)) { - outputVcf += ds - progress.record(ds) - } + val progress = ProgressLogger(logger, noun="variants written") + val random = new Random(seed) + winnowed.foreach { v => + val ds = downsampleAndRegenotype(v, proportions=proportions, random=random, epsilon=epsilon) + if (writeNoCall || !ds.gts.forall(g => g.isNoCall)) { + outputVcf += ds + progress.record(ds) } - - progress.logLast() - vcf.safelyClose() - outputVcf.close() } + + progress.logLast() + vcf.safelyClose() + outputVcf.close() + } - def buildOutputHeader(in: VcfHeader): VcfHeader = { - val fmts = Seq.newBuilder[VcfFormatHeader] - fmts ++= in.formats + private def buildOutputHeader(in: VcfHeader): VcfHeader = { + val fmts = Seq.newBuilder[VcfFormatHeader] + fmts ++= in.formats - if (!in.format.contains("AD")) { - fmts += VcfFormatHeader(id="AD", count=VcfCount.OnePerAllele, kind=VcfFieldType.Integer, description="Per allele depths.") - } - - if (!in.format.contains("DP")) { - fmts += VcfFormatHeader(id="DP", count=VcfCount.Fixed(1), kind=VcfFieldType.Integer, description="Total depth across alleles.") - } + if (!in.format.contains("AD")) { + fmts += VcfFormatHeader(id="AD", count=VcfCount.OnePerAllele, kind=VcfFieldType.Integer, description="Per allele depths.") + } - if (!in.format.contains("PL")) { - fmts += VcfFormatHeader(id="PL", count=VcfCount.OnePerGenotype, kind=VcfFieldType.Integer, description="Per genotype phred scaled likelihoods.") - } + if (!in.format.contains("DP")) { + fmts += VcfFormatHeader(id="DP", count=VcfCount.Fixed(1), kind=VcfFieldType.Integer, description="Total depth across alleles.") + } - in.copy(formats = fmts.result()) + if (!in.format.contains("PL")) { + fmts += VcfFormatHeader(id="PL", count=VcfCount.OnePerGenotype, kind=VcfFieldType.Integer, description="Per genotype phred scaled likelihoods.") } + + in.copy(formats=fmts.result()) } +} -object Sample { +private object Sample { /** Load a set of samples from the 1KG metadata file. */ def read(path: FilePath): Seq[Sample] = { val lines = Io.readLines(path).dropWhile(_.startsWith("##")).map(line => line.dropWhile(_ == '#')) From 9ccb1e60e0068e31934b4b887775e559303e57a4 Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 2 Apr 2024 17:17:39 -0700 Subject: [PATCH 03/10] fix tests --- README.md | 27 +++++++++++-------- .../vcf/DownsampleVcfTest.scala | 26 +++++++++++++++--- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d95b5fd62..cf6865dac 100644 --- a/README.md +++ b/README.md @@ -14,16 +14,21 @@ A set of tools to analyze genomic data with a focus on Next Generation Sequencin Detailed user documentation is available on the [project website](http://fulcrumgenomics.github.io/fgbio/) including [tool usage](http://fulcrumgenomics.github.io/fgbio/tools/latest) and [documentation of metrics produced](http://fulcrumgenomics.github.io/fgbio/metrics/latest). Detailed developer documentation can be found [here](http://javadoc.io/doc/com.fulcrumgenomics/fgbio_2.13). - * [Goals](#goals) - * [Overview](#overview) - * [List of tools](#list-of-tools) - * [Building](#building) - * [Command line](#command-line) - * [Include fgbio in your project](#include-fgbio-in-your-project) - * [Contributing](#contributing) - * [Authors](#authors) - * [License](#license) - * [Sponsorship](#sponsorship) +- [fgbio](#fgbio) +- [Goals](#goals) + - [Overview](#overview) + - [List of tools](#list-of-tools) + - [Building](#building) + - [Cloning the Repository](#cloning-the-repository) + - [Running the build](#running-the-build) + - [Command line](#command-line) + - [Include fgbio in your project](#include-fgbio-in-your-project) + - [Contributing](#contributing) + - [Authors](#authors) + - [License](#license) + - [Sponsorship](#sponsorship) + - [Become a sponsor](#become-a-sponsor) + - [Sponsors](#sponsors) @@ -76,7 +81,7 @@ Below we highlight a few tools that you may find useful. [Git LFS](https://git-lfs.github.com/) is used to store large files used in testing fgbio. In order to compile and run tests it is necessary to [install git lfs](https://git-lfs.github.com/). To retrieve the large files either: 1. Clone the repository _after_ installing git lfs, or -2. In a previously cloned repository run `git lfs pull` once +2. In a previously cloned repository run the follwing once: `git lfs install && git lfs pull` After initial setup regular git commands (e.g. `pull`, `fetch`, `push`) will also operate on large files and no special handling is needed. diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index c3e8b7aba..1af287e18 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -374,9 +374,17 @@ class DownsampleVcfTest extends UnitSpec { // Construct the input VCF val outVcf = makeTempFile("out", ".vcf.gz") if(use_metdata) { - new DownsampleVcf(input = inVcf, output = outVcf, metadata = Some(metadata), downsampleToBases = 1).execute() + new DownsampleVcf(input=inVcf, + output=outVcf, + metadata=Some(metadata), + downsampleToBases=Some(1), + windowSize=150).execute() } else { - new DownsampleVcf(input = inVcf, output = outVcf, originalBases = Some(100), downsampleToBases = 1).execute() + new DownsampleVcf(input=inVcf, + output=outVcf, + originalBases=Some(100), + downsampleToBases=Some(1), + windowSize=150).execute() } val vs = readVcfRecs(outVcf) @@ -425,9 +433,19 @@ class DownsampleVcfTest extends UnitSpec { // Construct the input VCF val outVcf = makeTempFile("out", ".vcf.gz") if (use_metdata) { - new DownsampleVcf(input = inVcf, output = outVcf, metadata = Some(metadata), downsampleToBases = 1, writeNoCall = true).execute() + new DownsampleVcf(input=inVcf, + output=outVcf, + metadata=Some(metadata), + downsampleToBases=Some(1), + writeNoCall=true, + windowSize=150).execute() } else { - new DownsampleVcf(input = inVcf, output = outVcf, originalBases = Some(100), downsampleToBases = 1, writeNoCall = true).execute() + new DownsampleVcf(input=inVcf, + output=outVcf, + originalBases=Some(100), + downsampleToBases=Some(1), + writeNoCall=true, + windowSize=150).execute() } val vs = readVcfRecs(outVcf) From f6052afffd74190f803148b5266085f4aab54242 Mon Sep 17 00:00:00 2001 From: jdidion Date: Wed, 3 Apr 2024 15:49:02 -0700 Subject: [PATCH 04/10] test proportion argument --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 2 +- .../vcf/DownsampleVcfTest.scala | 73 +++++++++++-------- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index b2c40eb0c..3f4ce49b2 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -194,7 +194,7 @@ class DownsampleVcf @arg(flag='p', doc="Proportion of bases to retain (for single-sample VCF).") val proportion: Option[Double] = None, @arg(flag='b', doc="Original number of bases (for single-sample VCF).") val originalBases: Option[Double] = None, @arg(flag='m', doc="Index file with bases per sample.") val metadata: Option[FilePath] = None, - @arg(flag='n', doc="Target number of bases to downsample to.") val downsampleToBases: Option[Double], + @arg(flag='n', doc="Target number of bases to downsample to.") val downsampleToBases: Option[Double] = None, @arg(flag='o', doc="Output file name.") val output: PathToVcf, @arg(flag='w', doc="Winnowing window size.") val windowSize: Int = 0, @arg(flag='e', doc="Sequencing Error rate for genotyping.") val epsilon: Double = 0.01, diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 1af287e18..089c0dc89 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -369,22 +369,28 @@ class DownsampleVcfTest extends UnitSpec { Metric.write(metadata, Seq(Sample(SAMPLE_NAME = sample, BASE_COUNT = 100))) "DownsampleVcf" should "write a new vcf with downsampled genotypes when provided a vcf" in { - List(true, false).foreach( - use_metdata => { + List("proportion", "number", "metadata").foreach( + kind => { // Construct the input VCF val outVcf = makeTempFile("out", ".vcf.gz") - if(use_metdata) { - new DownsampleVcf(input=inVcf, - output=outVcf, - metadata=Some(metadata), - downsampleToBases=Some(1), - windowSize=150).execute() - } else { - new DownsampleVcf(input=inVcf, - output=outVcf, - originalBases=Some(100), - downsampleToBases=Some(1), - windowSize=150).execute() + kind match { + case "proportion" => + new DownsampleVcf(input=inVcf, + output=outVcf, + proportion=Some(0.01), + windowSize=150).execute() + case "number" => + new DownsampleVcf(input=inVcf, + output=outVcf, + originalBases=Some(100), + downsampleToBases=Some(1), + windowSize=150).execute() + case "metadata" => + new DownsampleVcf(input=inVcf, + output=outVcf, + metadata=Some(metadata), + downsampleToBases=Some(1), + windowSize=150).execute() } val vs = readVcfRecs(outVcf) @@ -428,24 +434,31 @@ class DownsampleVcfTest extends UnitSpec { "DownsampleVcf" should "write a new vcf with downsampled genotypes when provided a vcf, keeping nocalls" in { // Construct the input VCF - List(true, false).foreach( - use_metdata => { + List("proportion", "number", "metadata").foreach( + kind => { // Construct the input VCF val outVcf = makeTempFile("out", ".vcf.gz") - if (use_metdata) { - new DownsampleVcf(input=inVcf, - output=outVcf, - metadata=Some(metadata), - downsampleToBases=Some(1), - writeNoCall=true, - windowSize=150).execute() - } else { - new DownsampleVcf(input=inVcf, - output=outVcf, - originalBases=Some(100), - downsampleToBases=Some(1), - writeNoCall=true, - windowSize=150).execute() + kind match { + case "proportion" => + new DownsampleVcf(input=inVcf, + output=outVcf, + proportion=Some(0.01), + writeNoCall=true, + windowSize=150).execute() + case "number" => + new DownsampleVcf(input=inVcf, + output=outVcf, + originalBases=Some(100), + downsampleToBases=Some(1), + writeNoCall=true, + windowSize=150).execute() + case "metadata" => + new DownsampleVcf(input=inVcf, + output=outVcf, + metadata=Some(metadata), + downsampleToBases=Some(1), + writeNoCall=true, + windowSize=150).execute() } val vs = readVcfRecs(outVcf) From 6eb17620f94cb573acc9bee1346c99e4fb97b686 Mon Sep 17 00:00:00 2001 From: jdidion Date: Wed, 3 Apr 2024 16:27:06 -0700 Subject: [PATCH 05/10] test non-winnowing --- .../vcf/DownsampleVcfTest.scala | 105 ++++++++++-------- 1 file changed, 60 insertions(+), 45 deletions(-) diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 089c0dc89..7b4b8ec24 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -369,66 +369,81 @@ class DownsampleVcfTest extends UnitSpec { Metric.write(metadata, Seq(Sample(SAMPLE_NAME = sample, BASE_COUNT = 100))) "DownsampleVcf" should "write a new vcf with downsampled genotypes when provided a vcf" in { - List("proportion", "number", "metadata").foreach( - kind => { + List("proportion", "number", "metadata").foreach(kind => { + List(true, false).foreach(winnow => { // Construct the input VCF val outVcf = makeTempFile("out", ".vcf.gz") + val windowSize = if (winnow) { 150 } else { 0 } kind match { case "proportion" => new DownsampleVcf(input=inVcf, output=outVcf, proportion=Some(0.01), - windowSize=150).execute() + windowSize=windowSize).execute() case "number" => new DownsampleVcf(input=inVcf, output=outVcf, originalBases=Some(100), downsampleToBases=Some(1), - windowSize=150).execute() + windowSize=windowSize).execute() case "metadata" => new DownsampleVcf(input=inVcf, output=outVcf, metadata=Some(metadata), downsampleToBases=Some(1), - windowSize=150).execute() + windowSize=windowSize).execute() } val vs = readVcfRecs(outVcf) - vs should have length 5 + val expectedLength = if (winnow) { 5 } else { 6 } + vs should have length expectedLength val ad0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("AD") - ad0(0) < 110 shouldBe true; + ad0(0) < 110 shouldBe true ad0(1) < 110 shouldBe true val pl0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("PL") pl0(1) shouldBe 0 + + val offset = if (winnow) { + 0 + } else { + val ad1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("AD") + ad1(0) shouldBe 8 + ad1(1) < 110 shouldBe true + val pl1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("PL") + pl1(2) shouldBe 160 + 1 + } - val ad1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("AD") - ad1(0) shouldBe 0; - ad1(1) < 110 shouldBe true - val pl1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("PL") - pl1(2) shouldBe 0 - - val ad2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("AD") - ad2(0) < 30 shouldBe true; + val ad2 = vs(1 + offset).genotypes("test1")[IndexedSeq[Int]]("AD") + ad2(0) shouldBe 0 ad2(1) < 110 shouldBe true - val pl2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("PL") + val pl2 = vs(1 + offset).genotypes("test1")[IndexedSeq[Int]]("PL") pl2(2) shouldBe 0 - val ad3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("AD") - ad3(0) < 110 shouldBe true; - ad3(1) shouldBe 0; - val pl3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("PL") - pl3(0) shouldBe 0 + val ad3 = vs(2 + offset).genotypes("test1")[IndexedSeq[Int]]("AD") + ad3(0) < 30 shouldBe true + ad3(1) < 110 shouldBe true + val pl3 = vs(2 + offset).genotypes("test1")[IndexedSeq[Int]]("PL") + pl3(2) shouldBe 0 - val ad4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("AD") - ad4(0) < 100 shouldBe true; - ad4(1) < 100 shouldBe true; - ad4(0) > 1 shouldBe true; - ad4(1) > 2 shouldBe true; - val pl4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("PL") - pl4(1) shouldBe 0 - } - ) + val ad4 = vs(3 + offset).genotypes("test1")[IndexedSeq[Int]]("AD") + ad4(0) < 110 shouldBe true + // changes due to random number generator + val expectedAD41 = if (winnow) { 0 } else { 1 } + ad4(1) shouldBe expectedAD41 + val pl4 = vs(3 + offset).genotypes("test1")[IndexedSeq[Int]]("PL") + pl4(0) shouldBe 0 + + val ad5 = vs(4 + offset).genotypes("test1")[IndexedSeq[Int]]("AD") + ad5(0) < 100 shouldBe true + ad5(1) < 100 shouldBe true + ad5(0) > 1 shouldBe true + ad5(1) > 2 shouldBe true + val pl5 = vs(4 + offset).genotypes("test1")[IndexedSeq[Int]]("PL") + pl5(1) shouldBe 0 + }) + }) } @@ -465,50 +480,50 @@ class DownsampleVcfTest extends UnitSpec { vs should have length 7 val ad0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("AD") - ad0(0) < 110 shouldBe true; + ad0(0) < 110 shouldBe true ad0(1) < 110 shouldBe true val pl0 = vs(0).genotypes("test1")[IndexedSeq[Int]]("PL") pl0(1) shouldBe 0 val ad1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("AD") - ad1(0) shouldBe 0; + ad1(0) shouldBe 0 ad1(1) < 110 shouldBe true val pl1 = vs(1).genotypes("test1")[IndexedSeq[Int]]("PL") pl1(2) shouldBe 0 val ad2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("AD") - ad2(0) shouldBe 0; + ad2(0) shouldBe 0 ad2(1) shouldBe 0 val pl2 = vs(2).genotypes("test1")[IndexedSeq[Int]]("PL") - pl2(0) shouldBe 0; - pl2(1) shouldBe 0; + pl2(0) shouldBe 0 + pl2(1) shouldBe 0 pl2(2) shouldBe 0 val ad3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("AD") - ad3(0) < 30 shouldBe true; + ad3(0) < 30 shouldBe true ad3(1) < 110 shouldBe true val pl3 = vs(3).genotypes("test1")[IndexedSeq[Int]]("PL") pl3(2) shouldBe 0 val ad4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("AD") - ad4(0) < 110 shouldBe true; - ad4(1) shouldBe 0; + ad4(0) < 110 shouldBe true + ad4(1) shouldBe 0 val pl4 = vs(4).genotypes("test1")[IndexedSeq[Int]]("PL") pl4(0) shouldBe 0 val ad5 = vs(5).genotypes("test1")[IndexedSeq[Int]]("AD") - ad5(0) shouldBe 0; + ad5(0) shouldBe 0 ad5(1) shouldBe 0 val pl5 = vs(5).genotypes("test1")[IndexedSeq[Int]]("PL") - pl5(0) shouldBe 0; - pl5(1) shouldBe 0; + pl5(0) shouldBe 0 + pl5(1) shouldBe 0 pl5(2) shouldBe 0 val ad6 = vs(6).genotypes("test1")[IndexedSeq[Int]]("AD") - ad6(0) < 100 shouldBe true; - ad6(1) < 100 shouldBe true; - ad6(0) > 1 shouldBe true; - ad6(1) > 2 shouldBe true; + ad6(0) < 100 shouldBe true + ad6(1) < 100 shouldBe true + ad6(0) > 1 shouldBe true + ad6(1) > 2 shouldBe true val pl6 = vs(6).genotypes("test1")[IndexedSeq[Int]]("PL") pl6(1) shouldBe 0 } From 9a2d510c635dba89c1b55a5453eb01cc577fd6e7 Mon Sep 17 00:00:00 2001 From: jdidion Date: Wed, 3 Apr 2024 16:42:57 -0700 Subject: [PATCH 06/10] add more tests --- .../vcf/DownsampleVcfTest.scala | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 7b4b8ec24..cfa0230fc 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -529,5 +529,33 @@ class DownsampleVcfTest extends UnitSpec { } ) } + + "DownsampleVcf" should "fail with invalid parameter combinations" in { + assertThrows[IllegalArgumentException] { + new DownsampleVcf(input=inVcf, + output=inVcf, + windowSize=150).execute() + } + assertThrows[IllegalArgumentException] { + new DownsampleVcf(input=inVcf, + output=inVcf, + proportion=Some(0.1), + downsampleToBases=Some(100), + windowSize=150).execute() + } + assertThrows[IllegalArgumentException] { + new DownsampleVcf(input=inVcf, + output=inVcf, + proportion=Some(0.1), + originalBases=Some(100), + windowSize=150).execute() + } + assertThrows[IllegalArgumentException] { + new DownsampleVcf(input=inVcf, + output=inVcf, + originalBases=Some(100), + windowSize=150).execute() + } + } } From 8f7a4ebdabacef280c38fcb95e92f9ac20fb360a Mon Sep 17 00:00:00 2001 From: jdidion Date: Wed, 3 Apr 2024 15:39:34 -0700 Subject: [PATCH 07/10] add tests --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 111 ++++++++++++------ .../vcf/DownsampleVcfTest.scala | 90 +++++++++++--- 2 files changed, 145 insertions(+), 56 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index 3f4ce49b2..310bfe4c6 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -97,69 +97,102 @@ object DownsampleVcf extends LazyLogging { def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = { val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}")) val newAds = downsampleADs(oldAds, proportion, random) - val Seq(aa, ab, bb) = computePls(newAds) - val Seq(alleleA, alleleB) = gt.alleles.toSeq - - val calls = { - if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele) - else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA) - else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB) - else IndexedSeq(alleleA, alleleB) - } - gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls) - } - - /** - * Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e. - * two allele depths). - * @param ads The input depths for the two alleles A and B. - * @return a list of three likelihoods for the alleles AA, AB, and BB. - */ - def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = { - require(ads.length == 2, "there must be exactly two allele depths") - val likelihoods = Likelihoods(ads(0), ads(1)) - IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt) + val likelihoods = Likelihoods(newAds) + val pls = likelihoods.pls + val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq) + gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls) } object Likelihoods { - /** Computes the likelihoods for each possible genotype. - * + /** Computes the likelihoods for each possible biallelic genotype. * @param alleleDepthA the reference allele depth * @param alleleDepthB the alternate allele depth * @param epsilon the error rate for genotyping * @return a new `Likelihood` that has the likelihoods of AA, AB, and BB */ - def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = { + def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = { val aGivenAA = log10(1 - epsilon) val aGivenBB = log10(epsilon) val aGivenAB = log10((1 - epsilon) / 2) - val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10 - val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10 - val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10 + val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) + val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) + val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) - val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB) + Likelihoods(2, IndexedSeq(rawGlAA, rawGlAB, rawGlBB)) + } + /** Computes the likelihoods for each possible multiallelic genotype. + * @param alleleDepths the sequence of allele depths in the order specified in the VCF + * @param epsilon the error rate for genotyping + * @return a new `Likelihood` that has the likelihoods of all possible genotypes in the order + * specified in VFC spec for the GL/PL tags. + */ + def multiallelic(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { + val numAlleles = alleleDepths.length + // probabilities associated with each possible genotype for a pair of alleles + val probs: Array[Double] = Array( + math.log10(epsilon), + math.log10((1 - epsilon) / 2), + math.log10(1 - epsilon) + ) + // raw genotype log-likelihoods Likelihoods( - aa = rawGlAA - minGL, - ab = rawGlAB - minGL, - bb = rawGlBB - minGL + numAlleles, + (0 until numAlleles).flatMap(b => + (0 to b).map(a => + (0 until numAlleles).map(allele => + probs(Array(a, b).count(_ == allele)) * alleleDepths(allele) + ).sum + ) + ) ) } + + def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { + require(alleleDepths.length >= 2, "at least two alleles are required to calculate genotype likelihoods") + if (alleleDepths.length > 2) multiallelic(alleleDepths, epsilon) + else biallelic(alleleDepths(0), alleleDepths(1), epsilon) + } } - /** Stores the log10(likelihoods) for all possible bi-allelic genotypes. - * - * @param aa likelihood of AA - * @param ab likelihood of AB - * @param bb likelihood of BB + /** Stores the log10(likelihoods) for all possible genotypes. + * @param numAlleles the number of alleles the variant has + * @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec */ - case class Likelihoods(aa: Double, ab: Double, bb: Double) { + case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) { /** * Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag). * @return a list of phred-scaled likelihooodS for AA, AB, BB. */ - def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt) + def pls: IndexedSeq[Int] = { + // subtract the min value so the smallest GL is 0, then multiply by -10 and convert to + // Int to make it PHRED-scale + val rawPL = genotypeLikelihoods.map(gl => gl * -10) + val minPL = rawPL.min + rawPL.map(pl => (pl - minPL).round.toInt) + } + + def mostLikelyGenotype: Option[(Int, Int)] = { + val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0) + minIndexes.length match { + case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0") + case 1 => { + val genotypes = + for (b <- 0 until numAlleles; a <- 0 to b) + yield (a, b) + Some(genotypes(minIndexes.head._2)) + } + case _ => None // if multiple genotypes are most likely, don't make a call + } + } + + def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = { + mostLikelyGenotype match { + case None => IndexedSeq(NoCallAllele, NoCallAllele) + case Some((a, b)) => IndexedSeq(alleles(a), alleles(b)) + } + } } } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index cfa0230fc..79979ed66 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -6,7 +6,7 @@ import com.fulcrumgenomics.util.Metric import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant} import com.fulcrumgenomics.testing.UnitSpec -import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype} +import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, downsampleAndRegenotype} import scala.util.Random @@ -187,7 +187,7 @@ class DownsampleVcfTest extends UnitSpec { "DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in { val ads = IndexedSeq[Int](0, 100) val expected = IndexedSeq(1996, 301, 0) - val newlikelihoods = computePls(ads) + val newlikelihoods = Likelihoods(ads).pls newlikelihoods should contain theSameElementsInOrderAs expected } @@ -196,51 +196,57 @@ class DownsampleVcfTest extends UnitSpec { */ "DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in { - val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0) + val likelihood = Likelihoods(IndexedSeq(0, 0)) val expected = IndexedSeq[Int](0, 0, 0) likelihood.pls.length shouldBe expected.length likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for AA if there are only ref alleles observed" in { - val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0) + val likelihood = Likelihoods(IndexedSeq(10, 0)) val expected = IndexedSeq[Int](0, 30, 200) likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for BB if there are only alt alleles observed" in { - val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10) + val likelihood = Likelihoods(IndexedSeq(0, 10)) val expected = IndexedSeq[Int](200, 30, 0) likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in { - val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5) + val likelihood = Likelihoods(IndexedSeq(5, 5)) val expected = IndexedSeq[Int](70, 0, 70) likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for AA if the AD A >> AD B" in { - val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2) + val likelihood = Likelihoods(IndexedSeq(15, 2)) likelihood.pls(0) == 0 } it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in { - val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17) + val likelihood = Likelihoods(IndexedSeq(15, 17)) likelihood.pls(1) == 0 } it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in { - val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30) + val likelihood = Likelihoods(IndexedSeq(3, 30)) likelihood.pls(2) == 0 } it should "return correct values when there are very few reads" in { - Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0) - Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20) - Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14) - Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0) - Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11) + Likelihoods(IndexedSeq(0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0) + Likelihoods(IndexedSeq(1, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20) + Likelihoods(IndexedSeq(1, 1)).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14) + Likelihoods(IndexedSeq(0, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0) + Likelihoods(IndexedSeq(1, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11) + } + + it should "return correct values for multi-allelic variants" in { + Likelihoods(IndexedSeq(0, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0, 0, 0, 0) + Likelihoods(IndexedSeq(10, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 30, 200, 30, 200, 200) + Likelihoods(IndexedSeq(10, 10, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(139, 0, 139, 169, 169, 339) } @@ -251,10 +257,10 @@ class DownsampleVcfTest extends UnitSpec { Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))), sample=sample, calls=IndexedSeq[Allele](Allele(ref), Allele(alt)), - attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1)))) + attrs=Map("AD" -> ads, "PL" -> Likelihoods(ads)) + ) } - "DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in { val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0)) val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01) @@ -298,6 +304,30 @@ class DownsampleVcfTest extends UnitSpec { newGeno.calls should contain theSameElementsInOrderAs expected } + /* + testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes + */ + private def makeTriallelicGt(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = { + val likelihoods = Likelihoods(ads) + val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2))) + val calls = likelihoods.mostLikelyCall(alleles.toSeq) + Genotype(alleles, sample=sample, calls=calls, attrs=Map("AD" -> ads, "PL" -> likelihoods.pls)) + } + + it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in { + val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in { + val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("T"), Allele("G")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + /* testing DownsampleVcf.downsampleAndRegenotype on Variant */ @@ -306,7 +336,7 @@ class DownsampleVcfTest extends UnitSpec { Variant(chrom="1", pos=10, alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)), - genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample)) + genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample=sample)) ) } @@ -345,6 +375,32 @@ class DownsampleVcfTest extends UnitSpec { newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected } + /* + testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes + */ + private def makeTriallelicVariant(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Variant = { + val likelihoods = Likelihoods(ads) + val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2))) + Variant(chrom="1", + pos=10, + alleles=alleles, + genotypes=Map(sample -> makeTriallelicGt(ref=ref, alt1=alt1, alt2=alt2, ads=ads, sample=sample))) + } + + it should "return ref,alt1 for a tri-allelic variant if those alleles have the highest depth" in { + val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0)) + val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + it should "return alt1,alt2 for a tri-allelic variant if those alleles have the highest depth" in { + val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100)) + val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("T"), Allele("G")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + private val sample = "test1" private val builder = VcfBuilder(samples=Seq(sample)) builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(), From 81e39d109ca6a1166317fbee390e20e98b1aafac Mon Sep 17 00:00:00 2001 From: jdidion Date: Thu, 4 Apr 2024 10:13:32 -0700 Subject: [PATCH 08/10] add more tests --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 29 ++++++++++++------- .../vcf/DownsampleVcfTest.scala | 17 +++++++++++ 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index 310bfe4c6..b02c0b24e 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -13,6 +13,7 @@ import com.fulcrumgenomics.vcf.DownsampleVcf.{downsampleAndRegenotype, winnowVar import scala.math.log10 import scala.util.Random +import scala.tools.nsc.doc.html.HtmlTags object DownsampleVcf extends LazyLogging { /** Removes variants that are within a specified distance from a previous variant. @@ -103,6 +104,16 @@ object DownsampleVcf extends LazyLogging { gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls) } + /**Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2) + * subtracting from each the min value so the smallest value is 0, and 3) rounding to the + * nearest integer. + */ + def logToPhredLikelihoods(logLikelihoods: IndexedSeq[Double]): IndexedSeq[Int] = { + val rawPL = logLikelihoods.map(gl => gl * -10) + val minPL = rawPL.min + rawPL.map(pl => (pl - minPL).round.toInt) + } + object Likelihoods { /** Computes the likelihoods for each possible biallelic genotype. * @param alleleDepthA the reference allele depth @@ -122,13 +133,14 @@ object DownsampleVcf extends LazyLogging { Likelihoods(2, IndexedSeq(rawGlAA, rawGlAB, rawGlBB)) } - /** Computes the likelihoods for each possible multiallelic genotype. + /** Computes the likelihoods for each possible genotype given a sequence of read depths for any + * number of alleles. * @param alleleDepths the sequence of allele depths in the order specified in the VCF * @param epsilon the error rate for genotyping - * @return a new `Likelihood` that has the likelihoods of all possible genotypes in the order - * specified in VFC spec for the GL/PL tags. + * @return a new `Likelihood` that has the log likelihoods of all possible genotypes in the + * order specified in VFC spec for the GL/PL tags. */ - def multiallelic(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { + def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { val numAlleles = alleleDepths.length // probabilities associated with each possible genotype for a pair of alleles val probs: Array[Double] = Array( @@ -151,8 +163,7 @@ object DownsampleVcf extends LazyLogging { def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { require(alleleDepths.length >= 2, "at least two alleles are required to calculate genotype likelihoods") - if (alleleDepths.length > 2) multiallelic(alleleDepths, epsilon) - else biallelic(alleleDepths(0), alleleDepths(1), epsilon) + generalized(alleleDepths, epsilon) } } @@ -166,11 +177,7 @@ object DownsampleVcf extends LazyLogging { * @return a list of phred-scaled likelihooodS for AA, AB, BB. */ def pls: IndexedSeq[Int] = { - // subtract the min value so the smallest GL is 0, then multiply by -10 and convert to - // Int to make it PHRED-scale - val rawPL = genotypeLikelihoods.map(gl => gl * -10) - val minPL = rawPL.min - rawPL.map(pl => (pl - minPL).round.toInt) + logToPhredLikelihoods(genotypeLikelihoods) } def mostLikelyGenotype: Option[(Int, Int)] = { diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 79979ed66..1ea36acf1 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -202,6 +202,23 @@ class DownsampleVcfTest extends UnitSpec { likelihood.pls should contain theSameElementsInOrderAs expected } + it should "return correct results for basic cases" in { + val e = 0.01 + val cases: IndexedSeq[(IndexedSeq[Int], IndexedSeq[Double])] = IndexedSeq( + (IndexedSeq(1, 0), IndexedSeq(1 - e, 0.5, e)), + (IndexedSeq(0, 1), IndexedSeq(e, 0.5, 1 - e)), + (IndexedSeq(1, 1), IndexedSeq((1 - e) * e, 0.25, (1 - e) * e)), + (IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))), + (IndexedSeq(0, 0, 1), IndexedSeq(e, e, e, 0.5, 0.5, 1 - e)), + ) + cases.foreach { case (input, output) => + val likelihood = Likelihoods(input, e) + val logOutput = output.map(p => math.log10(p)) + likelihood.pls.length shouldBe logOutput.length + likelihood.pls should contain theSameElementsInOrderAs DownsampleVcf.logToPhredLikelihoods(logOutput) + } + } + it should "return a likelihood of 0 for AA if there are only ref alleles observed" in { val likelihood = Likelihoods(IndexedSeq(10, 0)) val expected = IndexedSeq[Int](0, 30, 200) From f57f1e8b04e1b40ab001e56cdda336af1918b0f5 Mon Sep 17 00:00:00 2001 From: jdidion Date: Thu, 4 Apr 2024 14:19:45 -0700 Subject: [PATCH 09/10] add more tests, fix some tests --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 26 +++++++++---------- .../vcf/DownsampleVcfTest.scala | 23 +++++++++++++--- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index b02c0b24e..c47e6b5c6 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -104,17 +104,17 @@ object DownsampleVcf extends LazyLogging { gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls) } - /**Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2) - * subtracting from each the min value so the smallest value is 0, and 3) rounding to the - * nearest integer. - */ - def logToPhredLikelihoods(logLikelihoods: IndexedSeq[Double]): IndexedSeq[Int] = { - val rawPL = logLikelihoods.map(gl => gl * -10) - val minPL = rawPL.min - rawPL.map(pl => (pl - minPL).round.toInt) - } - object Likelihoods { + /**Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2) + * subtracting from each the min value so the smallest value is 0, and 3) rounding to the + * nearest integer. + */ + def logToPhredLikelihoods(logLikelihoods: IndexedSeq[Double]): IndexedSeq[Int] = { + val rawPL = logLikelihoods.map(gl => gl * -10) + val minPL = rawPL.min + rawPL.map(pl => (pl - minPL).round.toInt) + } + /** Computes the likelihoods for each possible biallelic genotype. * @param alleleDepthA the reference allele depth * @param alleleDepthB the alternate allele depth @@ -143,7 +143,7 @@ object DownsampleVcf extends LazyLogging { def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { val numAlleles = alleleDepths.length // probabilities associated with each possible genotype for a pair of alleles - val probs: Array[Double] = Array( + val logProbs: Array[Double] = Array( math.log10(epsilon), math.log10((1 - epsilon) / 2), math.log10(1 - epsilon) @@ -154,7 +154,7 @@ object DownsampleVcf extends LazyLogging { (0 until numAlleles).flatMap(b => (0 to b).map(a => (0 until numAlleles).map(allele => - probs(Array(a, b).count(_ == allele)) * alleleDepths(allele) + logProbs(Array(a, b).count(_ == allele)) * alleleDepths(allele) ).sum ) ) @@ -177,7 +177,7 @@ object DownsampleVcf extends LazyLogging { * @return a list of phred-scaled likelihooodS for AA, AB, BB. */ def pls: IndexedSeq[Int] = { - logToPhredLikelihoods(genotypeLikelihoods) + Likelihoods.logToPhredLikelihoods(genotypeLikelihoods) } def mostLikelyGenotype: Option[(Int, Int)] = { diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 1ea36acf1..283843dc0 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -215,7 +215,22 @@ class DownsampleVcfTest extends UnitSpec { val likelihood = Likelihoods(input, e) val logOutput = output.map(p => math.log10(p)) likelihood.pls.length shouldBe logOutput.length - likelihood.pls should contain theSameElementsInOrderAs DownsampleVcf.logToPhredLikelihoods(logOutput) + likelihood.pls should contain theSameElementsInOrderAs DownsampleVcf.Likelihoods.logToPhredLikelihoods(logOutput) + } + } + + it should "return the same results for biallelic and generalized algorithm" in { + val e = 0.01 + val cases: IndexedSeq[(IndexedSeq[Int], IndexedSeq[Double])] = IndexedSeq( + (IndexedSeq(1, 0), IndexedSeq(1 - e, 0.5, e)), + (IndexedSeq(0, 1), IndexedSeq(e, 0.5, 1 - e)), + (IndexedSeq(1, 1), IndexedSeq((1 - e) * e, 0.25, (1 - e) * e)), + (IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))), + ) + cases.foreach { case (input, output) => + val biallelic = DownsampleVcf.Likelihoods.biallelic(input(0), input(1), e) + val generalized = DownsampleVcf.Likelihoods.generalized(input, e) + biallelic.pls should contain theSameElementsInOrderAs generalized.pls } } @@ -239,17 +254,17 @@ class DownsampleVcfTest extends UnitSpec { it should "return a likelihood of 0 for AA if the AD A >> AD B" in { val likelihood = Likelihoods(IndexedSeq(15, 2)) - likelihood.pls(0) == 0 + assert(likelihood.pls(0) == 0) } it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in { val likelihood = Likelihoods(IndexedSeq(15, 17)) - likelihood.pls(1) == 0 + assert(likelihood.pls(1) == 0) } it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in { val likelihood = Likelihoods(IndexedSeq(3, 30)) - likelihood.pls(2) == 0 + assert(likelihood.pls(2) == 0) } it should "return correct values when there are very few reads" in { From 02329721d007bd1001a3fcc8e9b34dc5e04c0ff1 Mon Sep 17 00:00:00 2001 From: jdidion Date: Thu, 4 Apr 2024 14:24:40 -0700 Subject: [PATCH 10/10] cleanup --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 26 +++++++++---------- .../vcf/DownsampleVcfTest.scala | 4 +-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index c47e6b5c6..ddf380db5 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -121,7 +121,7 @@ object DownsampleVcf extends LazyLogging { * @param epsilon the error rate for genotyping * @return a new `Likelihood` that has the likelihoods of AA, AB, and BB */ - def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = { + def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): IndexedSeq[Double] = { val aGivenAA = log10(1 - epsilon) val aGivenBB = log10(epsilon) val aGivenAB = log10((1 - epsilon) / 2) @@ -130,7 +130,7 @@ object DownsampleVcf extends LazyLogging { val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) - Likelihoods(2, IndexedSeq(rawGlAA, rawGlAB, rawGlBB)) + IndexedSeq(rawGlAA, rawGlAB, rawGlBB) } /** Computes the likelihoods for each possible genotype given a sequence of read depths for any @@ -140,7 +140,7 @@ object DownsampleVcf extends LazyLogging { * @return a new `Likelihood` that has the log likelihoods of all possible genotypes in the * order specified in VFC spec for the GL/PL tags. */ - def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { + def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): IndexedSeq[Double] = { val numAlleles = alleleDepths.length // probabilities associated with each possible genotype for a pair of alleles val logProbs: Array[Double] = Array( @@ -148,22 +148,20 @@ object DownsampleVcf extends LazyLogging { math.log10((1 - epsilon) / 2), math.log10(1 - epsilon) ) - // raw genotype log-likelihoods - Likelihoods( - numAlleles, - (0 until numAlleles).flatMap(b => - (0 to b).map(a => - (0 until numAlleles).map(allele => - logProbs(Array(a, b).count(_ == allele)) * alleleDepths(allele) - ).sum - ) + // compute genotype log-likelihoods + (0 until numAlleles).flatMap(b => + (0 to b).map(a => + (0 until numAlleles).map(allele => + logProbs(Array(a, b).count(_ == allele)) * alleleDepths(allele) + ).sum ) ) } def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { - require(alleleDepths.length >= 2, "at least two alleles are required to calculate genotype likelihoods") - generalized(alleleDepths, epsilon) + val numAlleles = alleleDepths.length + require(numAlleles >= 2, "at least two alleles are required to calculate genotype likelihoods") + Likelihoods(numAlleles, generalized(alleleDepths, epsilon)) } } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 283843dc0..d4a884b57 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -228,8 +228,8 @@ class DownsampleVcfTest extends UnitSpec { (IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))), ) cases.foreach { case (input, output) => - val biallelic = DownsampleVcf.Likelihoods.biallelic(input(0), input(1), e) - val generalized = DownsampleVcf.Likelihoods.generalized(input, e) + val biallelic = Likelihoods(2, DownsampleVcf.Likelihoods.biallelic(input(0), input(1), e)) + val generalized = Likelihoods(2, DownsampleVcf.Likelihoods.generalized(input, e)) biallelic.pls should contain theSameElementsInOrderAs generalized.pls } }