diff --git a/src/main/scala/com/fulcrumgenomics/vcf/HapCutToVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/HapCutToVcf.scala index a9f5b4625..fc1108d20 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/HapCutToVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/HapCutToVcf.scala @@ -25,21 +25,18 @@ package com.fulcrumgenomics.vcf -import java.io.{Closeable, InputStream} -import java.util -import java.util.NoSuchElementException - -import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} -import com.fulcrumgenomics.util.Io import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.io.PathUtil import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.sopt._ -import htsjdk.variant.variantcontext._ -import htsjdk.variant.variantcontext.writer.{Options, VariantContextWriter, VariantContextWriterBuilder} +import com.fulcrumgenomics.util.Io +import com.fulcrumgenomics.vcf.HapCut1VcfHeaderLines.{LikelihoodChangeFormatHeaderLine, MaxLikelihoodChangeFormatHeaderLine, MecReductionFormatHeaderLine, PhaseSetFormatHeaderLine, ReadCountFormatHeaderLine} +import com.fulcrumgenomics.vcf.api._ import htsjdk.variant.vcf._ -import scala.jdk.CollectionConverters._ +import java.io.{Closeable, InputStream} +import java.util.NoSuchElementException import scala.io.Source @clp( @@ -102,8 +99,8 @@ class HapCutToVcf @arg( doc="Fix IUPAC codes in the original VCF to be VCF 4.3 spec-compliant (ex 'R' -> 'A'). 
Does not support BCF inputs.") val fixAmbiguousReferenceAlleles: Boolean = false ) extends FgBioTool with LazyLogging { - import HapCutType._ import HapCutToVcf.fixIupacBases + import HapCutType._ Io.assertReadable(vcf) Io.assertReadable(input) @@ -132,11 +129,11 @@ class HapCutToVcf newVcf } - val vcfReader = new VCFFileReader(inputVcf.toFile, false) + val vcfReader = VcfSource(inputVcf) val iterator = new HapCutAndVcfMergingIterator(input, vcfReader, gatkPhasingFormat) val vcfWriter = makeWriter(output, vcfReader, iterator.hapCutType) - iterator.foreach(vcfWriter.add) + vcfWriter.write(iterator) vcfReader.safelyClose() iterator.safelyClose() @@ -144,45 +141,24 @@ class HapCutToVcf } /** Creates a VCF writer, adding extra header lines if the output is the phased VCF. */ - private def makeWriter(path: PathToVcf, vcfReader: VCFFileReader, hapCutType: HapCutType): VariantContextWriter = { - val inputHeader = vcfReader.getFileHeader - val createIndex = PathUtil.extensionOf(path) match { - case Some(".vcf") => false - case Some(".vcf.gz") | Some(".bcf") => true - case _ => throw new IllegalArgumentException(s"Could not determine file type from $path") - } - val builder = new VariantContextWriterBuilder() - .setOutputFile(path.toFile) - .setReferenceDictionary(inputHeader.getSequenceDictionary) - .setOption(Options.WRITE_FULL_FORMAT_FIELD) - .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) - val writer: VariantContextWriter = if (createIndex) { - builder.setOption(Options.INDEX_ON_THE_FLY).build - } - else { - builder.unsetOption(Options.INDEX_ON_THE_FLY).build - } - - // get the header lines in the input header that we wish to skip/replace with our own definitions - val headerLinesToSkip = HeaderLines.formatHeaderKeys(hapCutType).flatMap(key => Option(inputHeader.getFormatHeaderLine(key))) - val headerLines: util.Set[VCFHeaderLine] = new util.HashSet[VCFHeaderLine]( - inputHeader.getMetaDataInSortedOrder.filterNot(headerLinesToSkip.contains).toJavaSet - ) + private 
def makeWriter(path: PathToVcf, vcfReader: VcfSource, hapCutType: HapCutType): VcfWriter = {
  // Builds the output header from the input header and opens a writer on `path`:
  //   - FORMAT lines: the input's lines, minus any whose ID this tool defines itself, followed by the standard
  //     genotype lines and the HapCut-specific lines for `hapCutType`.
  //   - FILTER lines: the input's lines, plus the "not phased" filter when `gatkPhasingFormat` is set.
  val inputHeader = vcfReader.header

  // FORMAT lines whose definitions are owned by this tool.
  val standardLines = HeaderLines.standardHeaderLines()
  val hapCutLines   = HeaderLines.formatHeaderLines(hapCutType)
  val ownedLines    = standardLines ++ hapCutLines

  // Drop every input FORMAT line that shares an ID with a line we are about to add (or with a HapCut key), so that
  // no FORMAT ID ends up declared twice in the output header.
  val ownedIds    = (HeaderLines.formatHeaderKeys(hapCutType) ++ ownedLines.map(_.id)).toSet
  val keptFormats = inputHeader.formats.filterNot(line => ownedIds.contains(line.id))

  // Unphased variants are only filtered in GATK phasing format, so only then is the filter declared.
  val newFilters = {
    if (gatkPhasingFormat) inputHeader.filters :+ HapCut1VcfHeaderLines.NotPhasedFilterHeaderLine
    else inputHeader.filters
  }

  VcfWriter(path, header=inputHeader.copy(formats=keptFormats ++ ownedLines, filters=newFilters))
}
variant)" - val PhaseSetFormatHeaderLine = new VCFFormatHeaderLine(PhaseSetFormatTag, 1, VCFHeaderLineType.Integer, PhaseSetFormatDescription) + val PhaseSetFormatHeaderLine = VcfFormatHeader(PhaseSetFormatTag, VcfCount.Fixed(1), VcfFieldType.Integer, PhaseSetFormatDescription) val NotPhasedFilterName = "NotPhased" val NotPhasedFilterDescription = "The variant was not phased by HapCut." - val NotPhasedFilterHeaderLine = new VCFFilterHeaderLine(NotPhasedFilterName, NotPhasedFilterDescription) + val NotPhasedFilterHeaderLine = VcfFilterHeader(NotPhasedFilterName, NotPhasedFilterDescription) +} + +object StandardHeaderLines extends HeaderLines { + val GenotypeFilterHeaderLine = VcfFormatHeader("FT", VcfCount.Unknown, VcfFieldType.String, "Genotype-level filter") + val GenotypeKeyHeaderLine = VcfFormatHeader("GT", VcfCount.Fixed(1), VcfFieldType.String, "Genotype") + val GenotypeQualityKeyHeaderLine = VcfFormatHeader("GQ", VcfCount.Fixed(1), VcfFieldType.Integer, "Genotype Quality") + val DepthKeyHeaderLine = VcfFormatHeader("DP", VcfCount.Fixed(1), VcfFieldType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered") + val AlleleDepthHeaderLine = VcfFormatHeader("AD", VcfCount.OnePerAllele, VcfFieldType.Integer, "Allelic depths for the ref and alt alleles in the order listed") + val GenotypePLKeyHeaderLine = VcfFormatHeader("PL", VcfCount.OnePerGenotype, VcfFieldType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification") + + /** The VCF header FORMAT lines that will be added for HapCut1 specific-genotype information. 
*/ + val formatHeaderLines: Seq[VcfFormatHeader] = { + Seq(GenotypeFilterHeaderLine, GenotypeKeyHeaderLine, GenotypeQualityKeyHeaderLine, DepthKeyHeaderLine, AlleleDepthHeaderLine, GenotypePLKeyHeaderLine) + } } object HapCut1VcfHeaderLines extends HeaderLines { val ReadCountFormatTag = "RC" val ReadCountFormatDescription = "Counts of calls supporting allele0 and allele1 respectively" - val ReadCountFormatHeaderLine = new VCFFormatHeaderLine(ReadCountFormatTag, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, ReadCountFormatDescription) + val ReadCountFormatHeaderLine = VcfFormatHeader(ReadCountFormatTag, VcfCount.OnePerAltAllele, VcfFieldType.Integer, ReadCountFormatDescription) val LikelihoodChangeFormatTag = "LC" val LikelihoodChangeFormatDescription = "Change in likelihood if this SNP is made homozygous or removed" - val LikelihoodChangeFormatHeaderLine = new VCFFormatHeaderLine(LikelihoodChangeFormatTag, 3, VCFHeaderLineType.Float, LikelihoodChangeFormatDescription) + val LikelihoodChangeFormatHeaderLine = VcfFormatHeader(LikelihoodChangeFormatTag, VcfCount.Fixed(3), VcfFieldType.Float, LikelihoodChangeFormatDescription) val MaxLikelihoodChangeFormatTag = "MLC" val MaxLikelihoodChangeFormatDescription = "Maximum change in likelihood if this SNP is made homozygous or removed" - val MaxLikelihoodChangeFormatHeaderLine = new VCFFormatHeaderLine(MaxLikelihoodChangeFormatTag, 1, VCFHeaderLineType.Float, MaxLikelihoodChangeFormatDescription) + val MaxLikelihoodChangeFormatHeaderLine = VcfFormatHeader(MaxLikelihoodChangeFormatTag, VcfCount.Fixed(1), VcfFieldType.Float, MaxLikelihoodChangeFormatDescription) val MecReductionFormatTag = "RMEC" val MecReductionFormatDescription = "Reduction in MEC score if we remove this variant altogether" - val MecReductionFormatHeaderLine = new VCFFormatHeaderLine(MecReductionFormatTag, 1, VCFHeaderLineType.Float, MecReductionFormatDescription) + val MecReductionFormatHeaderLine = VcfFormatHeader(MecReductionFormatTag, 
VcfCount.Fixed(1), VcfFieldType.Float, MecReductionFormatDescription) /** The VCF header FORMAT keys that will be added for HapCut1 specific-genotype information. */ val formatHeaderKeys: Seq[String] = { @@ -260,7 +254,7 @@ object HapCut1VcfHeaderLines extends HeaderLines { } /** The VCF header FORMAT lines that will be added for HapCut1 specific-genotype information. */ - val formatHeaderLines: Seq[VCFHeaderLine] = { + val formatHeaderLines: Seq[VcfFormatHeader] = { Seq(ReadCountFormatHeaderLine, LikelihoodChangeFormatHeaderLine, MaxLikelihoodChangeFormatHeaderLine, MecReductionFormatHeaderLine, PhaseSetFormatHeaderLine) } } @@ -268,15 +262,15 @@ object HapCut1VcfHeaderLines extends HeaderLines { object HapCut2VcfHeaderLines extends HeaderLines { val PrunedFormatTag = "PR" val PrunedFormatDescription = "1 if HapCut2 pruned this variant when using --discrete_pruning, 0 otherwise" - val PrunedFormatHeaderLine = new VCFFormatHeaderLine(PrunedFormatTag, 1, VCFHeaderLineType.Integer, PrunedFormatDescription) + val PrunedFormatHeaderLine = VcfFormatHeader(PrunedFormatTag, VcfCount.Fixed(1), VcfFieldType.Integer, PrunedFormatDescription) val SwitchErrorFormatTag = "SE" val SwitchErrorFormatDescription = "The confidence (phred-scaled) that there is not a switch error occurring immediately before the SNV" - val SwitchErrorFormatHeaderLine = new VCFFormatHeaderLine(SwitchErrorFormatTag, 1, VCFHeaderLineType.Float, SwitchErrorFormatDescription) + val SwitchErrorFormatHeaderLine = VcfFormatHeader(SwitchErrorFormatTag, VcfCount.Fixed(1), VcfFieldType.Float, SwitchErrorFormatDescription) val NoErrorFormatTag = "NE" val NoErrorFormatDescription = "The confidence (phred-scaled) that the SNV is not a mismatch (single SNV) error." 
- val NoErrorFormatHeaderLine = new VCFFormatHeaderLine(NoErrorFormatTag, 1, VCFHeaderLineType.Float, NoErrorFormatDescription) + val NoErrorFormatHeaderLine = VcfFormatHeader(NoErrorFormatTag, VcfCount.Fixed(1), VcfFieldType.Float, NoErrorFormatDescription) /** The VCF header FORMAT keys that will be added for HapCut2 specific-genotype information. */ val formatHeaderKeys: Seq[String] = { @@ -284,7 +278,7 @@ object HapCut2VcfHeaderLines extends HeaderLines { } /** The VCF header FORMAT lines that will be added for HapCut2 specific-genotype information. */ - val formatHeaderLines: Seq[VCFHeaderLine] = { + val formatHeaderLines: Seq[VcfFormatHeader] = { Seq(PrunedFormatHeaderLine, SwitchErrorFormatHeaderLine, NoErrorFormatHeaderLine, PhaseSetFormatHeaderLine) } } @@ -297,14 +291,14 @@ object HapCut2VcfHeaderLines extends HeaderLines { * @param gatkPhasingFormat true to output in GATK's ReadBackedPhasing format, false if to use the recommendations in the VCF spec. */ private class HapCutAndVcfMergingIterator(hapCutPath: FilePath, - vcfReader: VCFFileReader, + vcfReader: VcfSource, gatkPhasingFormat: Boolean) - extends Iterator[VariantContext] with Closeable { + extends Iterator[Variant] with Closeable { import HapCutType.HapCutType private val sourceIterator = vcfReader.iterator.zipWithIndex.buffered private val hapCutReader = HapCutReader(path=hapCutPath) - private val sampleName = vcfReader.getFileHeader.getSampleNamesInOrder.iterator().next() + private val sampleName = vcfReader.header.samples.head def hasNext: Boolean = { if (sourceIterator.isEmpty && hapCutReader.hasNext) throw new IllegalStateException("HapCut has more phased variants but no more variants in the input") @@ -312,21 +306,24 @@ private class HapCutAndVcfMergingIterator(hapCutPath: FilePath, } /** Returns either the phased variant with added HapCut-specific genotype information (Left), or the original variant (Right). 
/** Returns the next variant from the source VCF.  When HapCut produced a call for the variant, its genotype is
  * replaced with HapCut's; otherwise the source variant is returned after being formatted as unphased.
  */
def next(): Variant = {
  if (!hasNext) throw new NoSuchElementException("Calling next() when hasNext() is false")
  val (variant, variantIndex) = sourceIterator.next()

  if (hapCutReader.hasNext) {
    val HapCutOffsetAndCall(offset, maybeCall) = hapCutReader.next()
    // HapCut offsets are one-based while the zipped source index is zero-based.
    if (offset != variantIndex + 1) throw new IllegalStateException("BUG: calls are out of order")

    maybeCall.fold(formatSourceVariant(variant)) { call =>
      require(call.pos == variant.getStart, s"${call.pos} ${variant.getStart}")
      require(offset == call.offset)
      val hapCutGenotype = call.toVariant(sampleName).genotypes.valuesIterator.next()
      val phased         = !gatkPhasingFormat || !call.firstInBlock
      replaceGenotypes(source=variant, genotype=hapCutGenotype, isPhased=phased)
    }
  }
  else {
    // No HapCut records remain, so the variant cannot have been phased.
    formatSourceVariant(variant)
  }
}

/** Closes the HapCut reader. */
def close(): Unit = this.hapCutReader.close()
/** Formats a variant that HapCut did not phase: any existing phasing (the phased flag and the phase set
  * attribute) is stripped from its genotypes, and the "not phased" filter is added when `gatkPhasingFormat`
  * is true.  The input is returned untouched when neither change applies.
  */
private def formatSourceVariant(sourceVariant: Variant): Variant = {
  val phaseSetTag    = HapCut1VcfHeaderLines.PhaseSetFormatTag
  val carriesPhasing = sourceVariant.genotypes.values.exists(gt => gt.phased || gt.attrs.contains(phaseSetTag))

  if (!carriesPhasing && !gatkPhasingFormat) {
    sourceVariant
  }
  else {
    val genotypes = {
      if (!carriesPhasing) sourceVariant.genotypes
      else sourceVariant.genotypes.map { case (sample, gt) =>
        val attrsWithoutPhaseSet = gt.attrs.filterNot { case (tag, _) => tag == phaseSetTag }
        sample -> gt.copy(phased = false, attrs = attrsWithoutPhaseSet)
      }
    }

    val filters = {
      if (gatkPhasingFormat) sourceVariant.filters ++ Set(HapCut1VcfHeaderLines.NotPhasedFilterName)
      else sourceVariant.filters
    }

    sourceVariant.copy(genotypes = genotypes, filters = filters)
  }
}
/** Replaces the original genotype with HapCut's genotype and adds any HapCut-specific genotype information.
  *
  * @param source   the variant from the input VCF; its first genotype is the one that is replaced
  * @param genotype the genotype built from the HapCut call, whose calls are in haplotype order
  * @param isPhased whether the replacement genotype should be marked as phased
  * @return a copy of `source` whose only genotype has HapCut's calls, phase and attributes
  * @throws IllegalStateException if a called allele is not one of the source variant's alleles
  */
private def replaceGenotypes(source: Variant, genotype: Genotype, isPhased: Boolean): Variant = {
  // Materialise the source alleles: an Iterator would be consumed by the first `find`, so later lookups (and the
  // error message) would no longer see every allele.
  val sourceAlleles = source.alleles.toSeq

  // Translate each *called* allele (not the full allele set) into the matching allele object of the source variant.
  val calls = genotype.calls.map { call =>
    sourceAlleles.find(_.toString == call.toString).getOrElse {
      throw new IllegalStateException(s"Could not find allele '$call' in source alleles: " + sourceAlleles.map(_.toString).mkString(", "))
    }
  }.toIndexedSeq

  // Keep the source genotype's own fields, overriding the calls and phase, and layering HapCut's attributes
  // (phase set and tool-specific values) on top of the existing attributes.
  val sourceGenotype = source.genotypes(source.genotypes.keys.head)
  val newGt = sourceGenotype.copy(
    calls  = calls,
    phased = isPhased,
    attrs  = sourceGenotype.attrs ++ genotype.attrs)

  source.copy(genotypes = Map(newGt.sample -> newGt))
}
/** Adds the HapCut1-specific genotype information to the given genotype.
  *
  * @param gt the genotype to extend
  * @return a copy of `gt` whose attributes additionally hold the read counts, likelihood changes, maximum
  *         likelihood change and MEC reduction, each stored under its FORMAT tag
  */
def addTo(gt: Genotype): Genotype = {
  // Every value must be keyed by its FORMAT tag (e.g. "RC"); keying by the description text would store the
  // value under a key that matches no FORMAT header line.
  val hapCutAttrs = Map(
    ReadCountFormatTag           -> this.readCounts,
    LikelihoodChangeFormatTag    -> this.likelihoods,
    MaxLikelihoodChangeFormatTag -> this.delta,
    MecReductionFormatTag        -> this.rMEC
  )
  gt.copy(attrs = gt.attrs ++ hapCutAttrs)
}
/** Builds a single-sample [[Variant]] from this HapCut call.  The genotype is phased, carries the phase set
  * and the HapCut-specific attributes, and lists its calls in haplotype order.
  */
def toVariant(sampleName: String): Variant = {
  // The reference allele followed by the comma-separated alternate alleles.
  val alleles = AlleleSet(Allele(this.ref), this.alts.split(",").toSeq.map(Allele(_)))

  // Indices into `alleles` for each haplotype.  When HapCut reports no haplotype alleles, fall back to the
  // indices in the genotype string, which is assumed to be the first ":"-separated token.
  val callIndices: Seq[Int] = {
    if (hap1Allele >= 0) Seq(hap1Allele, hap2Allele)
    else {
      require(hap2Allele < 0)
      this.genotype.split(":").head.split("[/|]").toSeq.map(_.toInt)
    }
  }
  val calls = callIndices.map(alleles(_)).toIndexedSeq

  val phasedGenotype = Genotype(
    sample  = sampleName,
    alleles = alleles,
    calls   = calls,
    phased  = true,
    attrs   = Map(PhaseSetFormatTag -> phaseSet))
  val genotype = this.info.addTo(phasedGenotype)

  Variant(chrom=this.contig, pos=this.pos, alleles=alleles, genotypes=Map(sampleName -> genotype))
}
* @@ -668,6 +653,7 @@ private[vcf] class HapCutReader(iterator: Iterator[String], private[this] val source: Option[{ def close(): Unit }] = None) extends Iterator[HapCutOffsetAndCall] with Closeable { import HapCutType._ + import scala.collection.mutable.ListBuffer private val lineIterator = iterator.buffered diff --git a/src/test/resources/com/fulcrumgenomics/vcf/testdata/NA12878.GIABPedigreev0.2.17.41100000.41300000.vcf b/src/test/resources/com/fulcrumgenomics/vcf/testdata/NA12878.GIABPedigreev0.2.17.41100000.41300000.vcf index 0b8cc0e76..1727577d0 100644 --- a/src/test/resources/com/fulcrumgenomics/vcf/testdata/NA12878.GIABPedigreev0.2.17.41100000.41300000.vcf +++ b/src/test/resources/com/fulcrumgenomics/vcf/testdata/NA12878.GIABPedigreev0.2.17.41100000.41300000.vcf @@ -122,6 +122,8 @@ ##INFO= ##bcftools_viewVersion=1.3-14-ge0890a1+htslib-1.3-13-g406c7d0 ##bcftools_viewCommand=view -r 17:41100000-41300000 /seq/nist/NA12878_HG001/GIABPedigreev0.2/NIST_RTG_PlatGen_merged_highconfidence_v0.2_Allannotate.vcf.gz +##bcftools_annotateVersion=1.12+htslib-1.12 +##bcftools_annotateCommand=annotate --force --remove INFO/PLNCIIonWG NA12878.GIABPedigreev0.2.17.41100000.41300000.vcf; Date=Tue Apr 26 09:52:36 2022 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 17 41101001 . C A 24383.5 PASS DP=872;DPR=1.012;PGC=2,2,5,2;PGDR=0.838,1.097,1.137,0.906;PHQ=0;source=NISTPASS,RTGPHQ;PHC;XRX GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:AD:GL:AVR 1/1:43:0.865:0.164:0:1458.5:207:0.2:7.45:1.26:0:0:0,42:-145.85,-24.3,0:0.5037 17 41101246 . C T 26153 PASS DP=880;DPR=1.021;PGC=2,2,5,2;PGDR=0.979,0.978,1,1.055;PHQ=0;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR 1/1:57:1.146:1.046:0:1947.1:273:0.15:1.4:0.95:0:0:A,1,0.251,T,56,0.795:0,56:-194.71,-28.15,0:0.645 @@ -163,7 +165,7 @@ 17 41122786 . 
GAGAC G 35898.1 PASS DP=794;DPR=0.921;PGC=2,2,5,2;PGDR=0.805,0.96,1.008,0.821;PHQ=0;source=NISTPASS,RTGPHQ;PHC;XRX GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:AD:GL:AVR 1/1:33:0.664:0.22:0:1660.6:205:1.05:0.07:3.04:0:0:2,31:-166.06,-20.87,0:0.497 17 41123177 . A C 24769.4 PASS DP=892;DPR=1.035;PGC=2,2,5,2;PGDR=1.2,0.949,1.042,1.025;PHQ=0;source=NISTPASS,RTGPHQ;PHC;XRX GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:AD:GL:AVR 1/1:60:1.207:0.123:0:2014.9:284:0:0.58:2.11:0.03:0:0,60:-201.49,-30.04,0:0.5084 17 41124186 . C T 10866.9 PASS DP=347;DPR=0.403;PGC=2,2,5,2;PGDR=0.355,0.355,0.373,0.41;PHQ=0;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR 1/1:25:0.503:0.262:0:896.8:129:0.35:1.45:14.68:0:0.07:A,1,0.126,T,24,0.136:0,24:-89.68,-12.87,0:0.3497 -17 41124315 . CCTCTCT C 11152 PASS DPSum=346;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=181,13,0;PLCG=1036,117,0;PLHSWEx=88,3,0;PLHSWG=1206,94,0;PLILL250=978,68,0;PLILLCLIA=1131,82,0;PLILLWG=346,38,0;PLIllPCRFree=767,58,0;PLNCIIonWG=140,6,0;PLPlatGen=4417,340,0;PLXIll=821,65,0;PLXPSolWGLS=41,3,0;PLminsum=887;PLminsumOverDP=2.56;RPA=5,2;RU=CT;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=8;allalts=.;datasetcalls=12;geno=3;genoMapGood=8;platformbias=none;platformnames=ill,cg;platforms=2;source=NISTPASS,RTGPHI,PlatGen;varType=INDEL GT:DP:GQ:PL 1/1:346:887:11152,887,0 +17 41124315 . CCTCTCT C 11152 PASS DPSum=346;HRun=0;HapNoVar=0;NoPLTot=0;PL454WG=181,13,0;PLCG=1036,117,0;PLHSWEx=88,3,0;PLHSWG=1206,94,0;PLILL250=978,68,0;PLILLCLIA=1131,82,0;PLILLWG=346,38,0;PLIllPCRFree=767,58,0;PLPlatGen=4417,340,0;PLXIll=821,65,0;PLXPSolWGLS=41,3,0;PLminsum=887;PLminsumOverDP=2.56;RPA=5,2;RU=CT;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=8;allalts=.;datasetcalls=12;geno=3;genoMapGood=8;platformbias=none;platformnames=ill,cg;platforms=2;source=NISTPASS,RTGPHI,PlatGen;varType=INDEL GT:DP:GQ:PL 1/1:346:887:11152,887,0 17 41125054 . 
G A 1807.3 PASS DP=702;DPR=0.814;PGC=2,2,5,2;PGDR=0.653,0.857,0.913,0.663;PHQ=44;source=NISTUncertain,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 0/1:39:0.784:8.559:0.205:449.8:335:0.5:7.72:9.41:0:0:A,21,3.076,G,18,5.484:18,21:-44.98,0,-33.52:0.1139:8009102 17 41126305 . A G 30757.6 PASS DP=945;DPR=1.096;PGC=2,2,5,2;PGDR=1.027,0.947,1.06,1.258;PHQ=0;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR 1/1:49:0.985:0.099:0:1861.1:257:0:3.59:3.59:0:0.02:G,49,0.099:0,49:-186.11,-26.15,0:0.6822 17 41127448 . A G 20419.4 PASS DP=819;DPR=0.95;PGC=2,2,5,2;PGDR=1.028,0.958,0.944,0.957;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 1/0:45:0.905:0.068:0:607.7:608:2.36:0.33:2.36:0.22:0.06:A,26,0.041,G,19,0.027:26,19:-60.77,0,-83.1:0.5624:8009102 @@ -207,7 +209,7 @@ 17 41146980 . AG A 29631.6 PASS DP=967;DPR=1.122;PGC=2,2,5,2;PGDR=1.322,1.16,1.148,0.995;PHQ=44;source=NISTPASS,RTGPHQ;PHC;XRX GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:AD:GL:AVR:PS 1/0:40:0.804:0.119:0:554.9:555:9.41:3.01:11.92:0:0:26,13:-55.49,0,-116.81:0.4836:8009102 17 41147132 . CTG C 40803.7 PASS DP=929;DPR=1.078;PGC=2,2,5,2;PGDR=1.007,1.132,1.215,1.073;PHQ=0;source=NISTPASS,RTGPHQ;PHC;XRX GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:AD:GL:AVR 1/1:40:0.804:0.135:0:1944.8:226:0.22:2.73:0.21:0.01:0:0,39:-194.48,-22.99,0:0.5016 17 41148229 . G A 5178.3 PASS DP=936;DPR=1.086;PGC=2,2,5,2;PGDR=1.131,1.241,1.091,1.025;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 0/1:45:0.905:0.122:0:788.3:618:1.21:2.17:0.05:0.19:0:A,25,0.058,G,20,0.064:20,25:-78.83,0,-61.84:0.5508:8009102 -17 41148816 . 
G GAAA 13638 PASS DPSum=376;HRun=1;HapNoVar=0;NoPLTot=1;PLCG=filteredSSE95Align99.5,52,8,0,52,52,52;PLHSWG=1887,148,0,1887,1887,1887;PLILL250=2172,152,0,2172,2172,2172;PLILLCLIA=937,45,0,937,937,937;PLILLWG=filteredSSE99Align99.5,66,10,0,66,66,66;PLIllPCRFree=1656,114,0,1656,1656,1656;PLIonEx=0,3,30,30,30,30;PLNCIIonWG=filteredSSE99.5Align99.5,0,3,30,30,30,30;PLPlatGen=6374,424,0,6374,6374,6374;PLXIll=642,38,0,642,642,642;PLminsum=894;PLminsumOverDP=2.38;RPA=13,16;RU=A;TrancheABQDmin2=90;TrancheAlignmin2=0;TrancheMapmin2=95;TrancheSSEmin2=95;YesPLtot=6;allalts=.,.;datasetcalls=10;geno=3;genoMapGood=4;platformbias=none;platformnames=ill;platforms=1;source=NISTPASS,RTGPHI,PlatGen;varType=INDEL GT:DP:GQ:PL 1/1:376:894:13668,924,30 +17 41148816 . G GAAA 13638 PASS DPSum=376;HRun=1;HapNoVar=0;NoPLTot=1;PLCG=filteredSSE95Align99.5,52,8,0,52,52,52;PLHSWG=1887,148,0,1887,1887,1887;PLILL250=2172,152,0,2172,2172,2172;PLILLCLIA=937,45,0,937,937,937;PLILLWG=filteredSSE99Align99.5,66,10,0,66,66,66;PLIllPCRFree=1656,114,0,1656,1656,1656;PLIonEx=0,3,30,30,30,30;PLPlatGen=6374,424,0,6374,6374,6374;PLXIll=642,38,0,642,642,642;PLminsum=894;PLminsumOverDP=2.38;RPA=13,16;RU=A;TrancheABQDmin2=90;TrancheAlignmin2=0;TrancheMapmin2=95;TrancheSSEmin2=95;YesPLtot=6;allalts=.,.;datasetcalls=10;geno=3;genoMapGood=4;platformbias=none;platformnames=ill;platforms=1;source=NISTPASS,RTGPHI,PlatGen;varType=INDEL GT:DP:GQ:PL 1/1:376:894:13668,924,30 17 41148961 . C CAAAA 45296 PASS DP=777;DPR=0.901;PGC=2,2,5,2;PGDR=0.828,0.934,0.94,0.889;PHQ=0;source=NISTPASS,RTGPHQ;PHC;XRX GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:AD:GL:AVR 1/1:39:0.784:0.066:0:2525.8:126:32.07:0.08:0:0.26:0:7,27:-252.58,-21.33,0:0.4537 17 41149070 . 
G A 4755.9 PASS DP=942;DPR=1.093;PGC=2,2,5,2;PGDR=1.26,0.934,1.121,1.092;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 0/1:51:1.026:0.109:0:728.5:728:0.38:9.73:12.31:0.41:0:A,24,0.057,G,27,0.052:27,24:-72.85,0,-83.62:0.5499:8009102 17 41149709 . C T 33903 PASS DP=1023;DPR=1.187;PGC=2,2,5,2;PGDR=1.318,1.203,1.148,1.258;PHQ=0;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR 1/1:58:1.166:0.086:0:2201.2:284:0:5.39:5.21:0:0:T,58,0.086:0,58:-220.12,-28.86,0:0.6853 @@ -227,7 +229,7 @@ 17 41154261 . G A 5066.9 PASS DP=894;DPR=1.037;PGC=2,2,5,2;PGDR=0.905,1.04,1.078,1.219;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 0/1:42:0.845:0.257:0:516.7:517:4.29:4.89:8.53:0.21:0:A,16,0.024,C,1,0.126,G,25,0.107:25,16:-51.67,0,-77.47:0.5483:8009102 17 41154600 . C T 22495 PASS DP=916;DPR=1.063;PGC=2,2,5,2;PGDR=1.244,1.214,1.094,0.985;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 1/0:42:0.845:0.475:0:599.6:600:0.83:21.24:0.21:0.09:0.02:C,23,0.386,T,19,0.089:23,19:-59.96,0,-68.28:0.567:8009102 17 41154817 . A C 22748.7 PASS DP=924;DPR=1.072;PGC=2,2,5,2;PGDR=1.078,1.174,1.069,1.131;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 1/0:47:0.945:0.075:0:757.7:725:0.05:7.65:2.9:0:0.02:A,23,0.037,C,24,0.038:23,24:-75.77,0,-72.47:0.571:8009102 -17 41156153 . 
A T 4696 PASS DPSum=544;HRun=17;HapNoVar=0;NoPLTot=0;PL454WG=114,0,99;PLCG=417,0,848;PLHSWG=693,0,644;PLILL250=222,0,535;PLILLCLIA=432,0,987;PLILLWEx=25,0,30;PLILLWG=125,0,524;PLIllPCRFree=461,0,363;PLNCIIonWG=150,0,24;PLPlatGen=1772,0,2593;PLXIll=285,0,220;PLminsum=4696;PLminsumOverDP=8.63;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=.;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,cg,ion;platforms=4;source=NISTPASS,RTGPHI,PlatGen;varType=SNP GT:DP:GQ:PL:ED 1/0:544:4696:4696,0,6867:5 +17 41156153 . A T 4696 PASS DPSum=544;HRun=17;HapNoVar=0;NoPLTot=0;PL454WG=114,0,99;PLCG=417,0,848;PLHSWG=693,0,644;PLILL250=222,0,535;PLILLCLIA=432,0,987;PLILLWEx=25,0,30;PLILLWG=125,0,524;PLIllPCRFree=461,0,363;PLPlatGen=1772,0,2593;PLXIll=285,0,220;PLminsum=4696;PLminsumOverDP=8.63;TrancheABQDmin2=0;TrancheAlignmin2=0;TrancheMapmin2=0;TrancheSSEmin2=0;YesPLtot=11;allalts=.;datasetcalls=11;geno=2;genoMapGood=11;platformbias=none;platformnames=ill,454,cg,ion;platforms=4;source=NISTPASS,RTGPHI,PlatGen;varType=SNP GT:DP:GQ:PL:ED 1/0:544:4696:4696,0,6867:5 17 41156510 . C T 18111.1 PASS DP=805;DPR=0.934;PGC=2,2,5,2;PGDR=0.9,1.089,1.021,0.839;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 1/0:50:1.006:0.25:0:768.1:722:0.17:3.26:1.56:0.28:0.02:C,24,0.171,T,26,0.079:24,26:-76.81,0,-72.17:0.564:8009102 17 41157105 . T G 22402.4 PASS DP=997;DPR=1.157;PGC=2,2,5,2;PGDR=1.298,1.131,1.15,1.073;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 1/0:68:1.368:1.343:0:1018.5:785:2.63:1.87:2.48:0.15:0.01:C,1,0.251,G,38,0.699,T,29,0.392:29,38:-101.85,0,-78.49:0.575:8009102 17 41157387 . 
C T 3540.9 PASS DP=738;DPR=0.856;PGC=2,2,5,2;PGDR=0.861,0.823,0.821,0.859;PHQ=44;source=NISTPASS,RTGPHQ;PHC GT:DP:DPR:RE:AR:RQ:GQ:ABP:SBP:RPB:PPB:PUR:RS:AD:GL:AVR:PS 0/1:43:0.865:0.084:0:445.6:446:11.36:0.67:8.53:0.56:0.07:C,29,0.063,T,14,0.021:29,14:-44.56,0,-92.65:0.5387:8009102 diff --git a/src/test/scala/com/fulcrumgenomics/vcf/HapCutToVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/HapCutToVcfTest.scala index 80775395a..9bcee42d9 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/HapCutToVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/HapCutToVcfTest.scala @@ -123,18 +123,18 @@ class HapCutToVcfTest extends UnitSpec with ParallelTestExecution { call.phaseSet shouldBe 41106449 call.hap1Allele shouldBe 1 call.hap2Allele shouldBe 0 - val ctx = call.toVariantContext("Sample") - ctx.getGenotype(0).isPhased shouldBe true - ctx.getGenotype(0).getAlleles.map(_.getBaseString).toList should contain theSameElementsInOrderAs Seq("CT", "C") + val ctx = call.toVariant("Sample") + ctx.genotypes.valuesIterator.next().phased shouldBe true + ctx.genotypes.valuesIterator.next().calls.map(_.value).toList should contain theSameElementsInOrderAs Seq("CT", "C") } { val call = calls(4) call.phaseSet shouldBe 41106449 call.hap1Allele shouldBe 0 call.hap2Allele shouldBe 1 - val ctx = call.toVariantContext("Sample") - ctx.getGenotype(0).isPhased shouldBe true - ctx.getGenotype(0).getAlleles.map(_.getBaseString).toList should contain theSameElementsInOrderAs Seq("T", "G") + val ctx = call.toVariant("Sample") + ctx.genotypes.valuesIterator.next().phased shouldBe true + ctx.genotypes.valuesIterator.next().calls.map(_.value).toList should contain theSameElementsInOrderAs Seq("T", "G") } }