From 8b9af50cf55a32fb1adadcf5adaa047ff5ac2ad9 Mon Sep 17 00:00:00 2001 From: Clint Valentine Date: Sun, 28 Feb 2021 11:36:40 -0500 Subject: [PATCH 1/3] Add source classes for BED and generic Interval types --- .gitignore | 2 + .../fasta/SequenceDictionary.scala | 19 +- .../com/fulcrumgenomics/util/BedSource.scala | 116 ++++++++ .../util/IntervalListSource.scala | 40 +-- .../fulcrumgenomics/util/IntervalSource.scala | 102 +++++++ .../scala/com/fulcrumgenomics/util/Io.scala | 12 +- .../fasta/SequenceDictionaryTest.scala | 37 ++- .../fulcrumgenomics/util/BedSourceTest.scala | 266 ++++++++++++++++++ .../util/IntervalListSourceTest.scala | 18 +- .../util/IntervalSourceTest.scala | 265 +++++++++++++++++ 10 files changed, 838 insertions(+), 39 deletions(-) create mode 100644 src/main/scala/com/fulcrumgenomics/util/BedSource.scala create mode 100644 src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala create mode 100644 src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala create mode 100644 src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala diff --git a/.gitignore b/.gitignore index c269e1609..9f79a9718 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ JeanLuc.iml target project/project .DS_Store +out/ +*.iml \ No newline at end of file diff --git a/src/main/scala/com/fulcrumgenomics/fasta/SequenceDictionary.scala b/src/main/scala/com/fulcrumgenomics/fasta/SequenceDictionary.scala index eb7a75a7b..c83e53c53 100644 --- a/src/main/scala/com/fulcrumgenomics/fasta/SequenceDictionary.scala +++ b/src/main/scala/com/fulcrumgenomics/fasta/SequenceDictionary.scala @@ -26,13 +26,12 @@ package com.fulcrumgenomics.fasta import java.io.StringWriter - import com.fulcrumgenomics.FgBioDef import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.fasta.SequenceMetadata.Keys import com.fulcrumgenomics.util.Io import enumeratum.EnumEntry -import htsjdk.samtools.util.BufferedLineReader +import htsjdk.samtools.util.{BufferedLineReader, Locatable} import htsjdk.samtools.{SAMSequenceDictionary, SAMSequenceDictionaryCodec, SAMSequenceRecord, SAMTextHeaderCodec} import htsjdk.variant.utils.SAMSequenceDictionaryExtractor @@ -247,6 +246,22 @@ case class SequenceDictionary(infos: IndexedSeq[SequenceMetadata]) extends Itera this.length == that.length && this.zip(that).forall { case (thisInfo, thatInfo) => thisInfo.sameAs(thatInfo) } } + /** Validate the locatable against the sequence dictionary. + * + * @throws NoSuchElementException when the locatable's contig cannot be found in the sequence dictionary. + * @throws IllegalArgumentException when the locatable's start is less than 1. + * @throws IllegalArgumentException when the locatable's end is beyond the reference contig length. + * @throws IllegalArgumentException when the locatable's start is greater than the end. + */ + def validate(locatable: Locatable): Unit = { + val info = infos + .find(_.name == locatable.getContig) + .getOrElse(throw new NoSuchElementException(s"Contig does not exist within dictionary for locatable: $locatable.")) + require(1 <= locatable.getStart, s"Start is less than 1 for locatable: $locatable.") + require(locatable.getEnd <= info.length, s"End is beyond the reference contig length for locatable: $locatable.") + require(locatable.getStart <= locatable.getEnd, f"Start is greater than end for locatable: $locatable.") + } + /** Writes the sequence dictionary to the given path */ def write(path: FilePath): Unit = { val writer = Io.toWriter(path) diff --git a/src/main/scala/com/fulcrumgenomics/util/BedSource.scala b/src/main/scala/com/fulcrumgenomics/util/BedSource.scala new file mode 100644 index 000000000..83d4bebe7 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/util/BedSource.scala @@ -0,0 +1,116 @@ +/* + * The MIT License + * + * Copyright (c) 2021 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.commons.collection.BetterBufferedIterator +import com.fulcrumgenomics.fasta.SequenceDictionary +import htsjdk.tribble.bed.BEDCodec.StartOffset +import htsjdk.tribble.bed.{BEDCodec, BEDFeature} + +import java.io.{Closeable, File, InputStream} +import scala.io.Source +import scala.util.{Failure, Success, Try} + +/** A class for sourcing BED features from a stream of ASCII string data. */ +class BedSource private( + private val lines: Iterator[String], + private val sd: Option[SequenceDictionary] = None, + private val source: Option[{ def close(): Unit}] = None +) extends Iterator[BEDFeature] with Closeable { + + /** The underlying codec used to parse the lines of BED data. */ + private val codec = new BEDCodec(StartOffset.ONE) + + /** The current line count. */ + private var lineNumber = 1L + + /** The underlying buffered iterator of BED data. */ + private val iter: BetterBufferedIterator[String] = lines match { + case iter: BetterBufferedIterator[String] => iter + case iter => iter.bufferBetter + } + + /** The header of this BED file. In most cases, the BED header is empty. */ + val header: Seq[String] = { + val lines = iter.takeWhile(line => BedSource.HeaderPrefixes.exists(line.startsWith)).toIndexedSeq + lineNumber += lines.length + lines + } + + /** The [[SequenceDictionary]] associated with the source. */ + val dict: Option[SequenceDictionary] = sd + + /** True if calling `next()` will yield another BED feature, false otherwise. */ + override def hasNext: Boolean = iter.hasNext + + /** Returns the next BED feature if available, or throws an exception if the feature is invalid or none is available. */ + override def next(): BEDFeature = yieldAndThen(parse(iter.next()))(lineNumber += 1) + + /** Parse a line of text and build a BED feature. */ + private def parse(line: String): BEDFeature = { + val parsed = codec.decode(line) + val feature = Option(parsed).getOrElse(throw new IllegalStateException(s"No BED feature could be built from line number: $lineNumber")) + Try(dict.foreach(_.validate(feature))) match { + case Success(()) => feature + case Failure(e: NoSuchElementException) => throw new NoSuchElementException(e.getMessage + s" Failed on line number: $lineNumber") + case Failure(e: IllegalArgumentException) => throw new IllegalArgumentException(e.getMessage + s" Failed on line number: $lineNumber") + case Failure(e: Throwable) => throw new IllegalStateException(e.getMessage + s" Failed on line number: $lineNumber") + } + } + + /** Closes the optional underlying handle. */ + override def close(): Unit = this.source.foreach(_.close()) +} + +/** Companion object for [[BedSource]]. */ +object BedSource { + + /** Common BED header line prefixes. */ + val HeaderPrefixes: Seq[String] = Seq("#", "browser", "track") + + /** Creates a new BED source from a sequence of lines. */ + def apply(lines: Iterable[String], dict: Option[SequenceDictionary]): BedSource = new BedSource(lines.iterator, dict) + + /** Creates a new BED source from an iterator of lines. */ + def apply(lines: Iterator[String], dict: Option[SequenceDictionary]): BedSource = new BedSource(lines, dict) + + /** Creates a new BED source from an input stream. */ + def apply(stream: InputStream, dict: Option[SequenceDictionary]): BedSource = { + new BedSource(Source.fromInputStream(stream).getLines(), dict) + } + + /** Creates a new BED source from a source. */ + def apply(source: Source, dict: Option[SequenceDictionary]): BedSource = { + new BedSource(source.getLines(), dict, source = Some(source)) + } + + /** Creates a new BED source from a File. */ + def apply(file: File, dict: Option[SequenceDictionary]): BedSource = apply(path=file.toPath, dict) + + /** Creates a new BED source from a Path. */ + def apply(path: PathToIntervals, dict: Option[SequenceDictionary]): BedSource = apply(Io.readLines(path), dict) +} diff --git a/src/main/scala/com/fulcrumgenomics/util/IntervalListSource.scala b/src/main/scala/com/fulcrumgenomics/util/IntervalListSource.scala index 326d965a1..ea328cde2 100644 --- a/src/main/scala/com/fulcrumgenomics/util/IntervalListSource.scala +++ b/src/main/scala/com/fulcrumgenomics/util/IntervalListSource.scala @@ -27,18 +27,24 @@ package com.fulcrumgenomics.util import java.io.{Closeable, File, InputStream} - import com.fulcrumgenomics.FgBioDef.{PathToIntervals, yieldAndThen} import com.fulcrumgenomics.commons.CommonsDef.BetterBufferedIteratorScalaWrapper +import com.fulcrumgenomics.commons.collection.BetterBufferedIterator import com.fulcrumgenomics.commons.util.StringUtil import com.fulcrumgenomics.fasta.SequenceDictionary +import com.fulcrumgenomics.util.IntervalListSource.HeaderPrefix import htsjdk.samtools.util.{BufferedLineReader, Interval, IntervalList} import htsjdk.samtools.{SAMFileHeader, SAMTextHeaderCodec} import scala.io.Source +import scala.util.{Failure, Success, Try} +/** Companion object for [[IntervalListSource]]. */ object IntervalListSource { + /** The Interval List header line prefix. */ + val HeaderPrefix: String = "@" + /** Creates a new interval list source from a sequence of lines. */ def apply(lines: Iterable[String]): IntervalListSource = new IntervalListSource(lines.iterator) @@ -66,7 +72,10 @@ class IntervalListSource private(lines: Iterator[String], private[this] val source: Option[{ def close(): Unit }] = None) extends Iterator[Interval] with Closeable { - private val iter = lines.bufferBetter + private val iter: BetterBufferedIterator[String] = lines match { + case iter: BetterBufferedIterator[String] => iter + case iter => iter.bufferBetter + } private var lineNumber = 1L @@ -79,7 +88,7 @@ class IntervalListSource private(lines: Iterator[String], // Read the header val header: SAMFileHeader = { val codec = new SAMTextHeaderCodec - val headerLines = iter.takeWhile(_.startsWith("@")).toIndexedSeq + val headerLines = iter.takeWhile(_.startsWith(HeaderPrefix)).toIndexedSeq require(headerLines.nonEmpty, "No header found") lineNumber += headerLines.length val lineReader = BufferedLineReader.fromString(headerLines.mkString("\n")) @@ -100,21 +109,9 @@ class IntervalListSource private(lines: Iterator[String], override def close(): Unit = this.source.foreach(_.close()) private def parse(line: String): Interval = { - val fieldCount = StringUtil.split(line, '\t', parseArray) + val fieldCount = StringUtil.split(line, '\t', parseArray) require(fieldCount == 5, s"Expected 5 fields on line $lineNumber") - val Array(refName: String, startString: String, endString: String, strand: String, name: String) = parseArray - - val start = startString.toInt - val end = endString.toInt - - Option(dict(refName)) match { - case None => - throw new IllegalArgumentException(f"Reference contig '$refName' not found in the sequence dictionary on line number $lineNumber.") - case Some(seq) => - require(1 <= start, s"Start is less than 1 on line number $lineNumber") - require(end <= seq.length, s"End is beyond the reference contig length on line number $lineNumber") - require(start <= end, f"Start is greater than end on line number $lineNumber") - } + val Array(refName: String, start: String, end: String, strand: String, name: String) = parseArray val negative = strand match { case "-" => true @@ -122,7 +119,14 @@ class IntervalListSource private(lines: Iterator[String], case _ => throw new IllegalArgumentException(s"Unrecognized strand '$strand' on line number $lineNumber") } - new Interval(refName, start, end, negative, name) + val interval = new Interval(refName, start.toInt, end.toInt, negative, name) + + Try(dict.validate(interval)) match { + case Success(()) => interval + case Failure(e: NoSuchElementException) => throw new NoSuchElementException(e.getMessage + s" Failed on line number: $lineNumber") + case Failure(e: IllegalArgumentException) => throw new IllegalArgumentException(e.getMessage + s" Failed on line number: $lineNumber") + case Failure(e: Throwable) => throw new IllegalStateException(e.getMessage + s" Failed on line number: $lineNumber") + } } /** Reads in the intervals into an [[htsjdk.samtools.util.IntervalList]] */ diff --git a/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala b/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala new file mode 100644 index 000000000..2089be203 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala @@ -0,0 +1,102 @@ +/* + * The MIT License + * + * Copyright (c) 2021 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.commons.CommonsDef._ +import com.fulcrumgenomics.commons.collection.BetterBufferedIterator +import com.fulcrumgenomics.fasta.SequenceDictionary +import com.fulcrumgenomics.util.IntervalListSource.{HeaderPrefix => IntervalListHeaderPrefix} +import htsjdk.samtools.SAMFileHeader +import htsjdk.samtools.util.Interval + +import java.io.{Closeable, File, InputStream} +import scala.io.Source + +/** A class for sourcing intervals from a stream of data that could either be in BED or Interval List format. */ +class IntervalSource private( + private val lines: Iterator[String], + private val sd: Option[SequenceDictionary], + private val source: Option[{ def close(): Unit }] = None +) extends Iterator[Interval] with Closeable { + + /** The underlying buffered iterator of interval data. */ + private val iter: BetterBufferedIterator[String] = lines match { + case iter: BetterBufferedIterator[String] => iter + case iter => iter.bufferBetter + } + + private val (underlying: Iterator[Interval], _dict, _header) = if ( + iter.headOption.exists(_.startsWith(IntervalListHeaderPrefix)) + ) { + val wrapped = IntervalListSource(iter) + require(sd.forall(wrapped.dict.sameAs), "Provided sequence dictionary does not match the input's dict header!") + (wrapped, Some(wrapped.dict), Some(wrapped.header)) + } else { + val wrapped = BedSource(iter, sd) + (wrapped.map(feature => new Interval(feature)), sd, None) + } + + /** The [[SAMFileHeader]] associated with the source, if it exists. */ + val header: Option[SAMFileHeader] = _header + + /** The [[SequenceDictionary]] associated with the source, if it exists. */ + val dict: Option[SequenceDictionary] = _dict + + /** True if calling `next()` will yield another interval, false otherwise. */ + override def hasNext: Boolean = underlying.hasNext + + /** Returns the next interval if available, or throws an exception if none is available. */ + override def next(): Interval = underlying.next() + + /** Closes the underlying reader. */ + override def close(): Unit = this.source.foreach(_.close()) +} + +/** Companion object for [[IntervalSource]]. */ +object IntervalSource { + + /** Creates a new interval source from a sequence of lines. */ + def apply(lines: Iterable[String], dict: Option[SequenceDictionary]): IntervalSource = new IntervalSource(lines.iterator, dict) + + /** Creates a new interval source from an iterator of lines. */ + def apply(lines: Iterator[String], dict: Option[SequenceDictionary]): IntervalSource = new IntervalSource(lines, dict) + + /** Creates a new interval source from an input stream. */ + def apply(stream: InputStream, dict: Option[SequenceDictionary]): IntervalSource = { + new IntervalSource(Source.fromInputStream(stream).getLines(), dict) + } + + /** Creates a new interval source from a source. */ + def apply(source: Source, dict: Option[SequenceDictionary]): IntervalSource = { + new IntervalSource(source.getLines(), dict, source = Some(source)) + } + + /** Creates a new interval source from a File. */ + def apply(file: File, dict: Option[SequenceDictionary]): IntervalSource = apply(path=file.toPath, dict) + + /** Creates a new interval source from a Path. */ + def apply(path: PathToIntervals, dict: Option[SequenceDictionary]): IntervalSource = apply(Io.readLines(path), dict) +} diff --git a/src/main/scala/com/fulcrumgenomics/util/Io.scala b/src/main/scala/com/fulcrumgenomics/util/Io.scala index ce54df16d..2857d849f 100644 --- a/src/main/scala/com/fulcrumgenomics/util/Io.scala +++ b/src/main/scala/com/fulcrumgenomics/util/Io.scala @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -package com.fulcrumgenomics.util -import java.io.{InputStream, OutputStream} -import java.nio.file.{Files, Path, Paths} -import java.util.zip.{GZIPInputStream, GZIPOutputStream} +package com.fulcrumgenomics.util -import com.fulcrumgenomics.commons.CommonsDef.DirPath +import com.fulcrumgenomics.commons.CommonsDef._ import com.fulcrumgenomics.commons.io.{IoUtil, PathUtil} import htsjdk.samtools.util.BlockCompressedOutputStream +import java.io.OutputStream +import java.nio.file.{Files, Path, Paths} + /** * Provides common IO utility methods. Can be instantiated to create a custom factory, or * the companion object can be used as a singleton version. @@ -51,7 +51,7 @@ class Io(var compressionLevel: Int = 5, override def makeTempDir(name: String): DirPath = Files.createTempDirectory(tmpDir, name) /** Overridden to ensure that tmp files are created within the correct tmpDir. */ - override def makeTempFile(prefix: String, suffix: String, dir: Option[DirPath] = Some(tmpDir)): DirPath = super.makeTempFile(prefix, suffix, dir) + override def makeTempFile(prefix: String, suffix: String, dir: Option[DirPath] = Some(tmpDir)): FilePath = super.makeTempFile(prefix, suffix, dir) } /** Singleton object that can be used when the default buffer size and compression are desired. */ diff --git a/src/test/scala/com/fulcrumgenomics/fasta/SequenceDictionaryTest.scala b/src/test/scala/com/fulcrumgenomics/fasta/SequenceDictionaryTest.scala index 7d5b94d89..5b3ca9b88 100644 --- a/src/test/scala/com/fulcrumgenomics/fasta/SequenceDictionaryTest.scala +++ b/src/test/scala/com/fulcrumgenomics/fasta/SequenceDictionaryTest.scala @@ -25,13 +25,14 @@ package com.fulcrumgenomics.fasta +import com.fulcrumgenomics.commons.CommonsDef._ import com.fulcrumgenomics.fasta.Converters.{FromSAMSequenceDictionary, FromSAMSequenceRecord, ToSAMSequenceDictionary, ToSAMSequenceRecord} -import com.fulcrumgenomics.testing.UnitSpec -import org.scalatest.OptionValues -import com.fulcrumgenomics.fasta.SequenceMetadata.{AlternateLocus, Keys} +import com.fulcrumgenomics.fasta.SequenceMetadata.AlternateLocus import com.fulcrumgenomics.fasta.Topology.{Circular, Linear} +import com.fulcrumgenomics.testing.UnitSpec +import htsjdk.samtools.util.Interval import htsjdk.samtools.{SAMSequenceDictionary, SAMSequenceRecord} -import com.fulcrumgenomics.FgBioDef._ +import org.scalatest.OptionValues class SequenceDictionaryTest extends UnitSpec with OptionValues { @@ -172,6 +173,34 @@ class SequenceDictionaryTest extends UnitSpec with OptionValues { SequenceMetadata(name="chr1", length=0, md5=Some("1")) sameAs SequenceMetadata(name="chr1", length=0, md5=Some("1")) shouldBe true } + "SequenceDictionary.validate" should "raise a NoSuchElementException if a locatable has a contig not in the optional sequence dictionary" in { + val dict = SequenceDictionary(SequenceMetadata("chr1", length = 1000)) + val interval = new Interval("chr2", 200, 200) + val caught = intercept[NoSuchElementException] { dict.validate(interval) } + caught.getMessage should include ("Contig does not exist within dictionary for locatable") + } + + it should "raise a IllegalArgumentException if a locatable has a start value less than 1 (1-based)" in { + val dict = SequenceDictionary(SequenceMetadata("chr1", length = 1000)) + val interval = new Interval("chr1", 0, 200) + val caught = intercept[IllegalArgumentException] { dict.validate(interval) } + caught.getMessage should include ("Start is less than 1 for locatable") + } + + it should "raise a IllegalArgumentException if a locatable has an end value greater than the contig length" in { + val dict = SequenceDictionary(SequenceMetadata("chr1", length = 1000)) + val interval = new Interval("chr1", 200, 1000 + 1) + val caught = intercept[IllegalArgumentException] { dict.validate(interval) } + caught.getMessage should include ("End is beyond the reference contig length for locatable") + } + + it should "raise a IllegalArgumentException if a locatable record has a start value greater than an end value" in { + val dict = SequenceDictionary(SequenceMetadata("chr1", length = 1000)) + val interval = new Interval("chr1", 200 + 1, 200) + val caught = intercept[IllegalArgumentException] { dict.validate(interval) } + caught.getMessage should include ("Start is greater than end for locatable") + } + "SequenceDictionary" should "fail to build two sequence metadatas share a name (including aliases)" in { val one = SequenceMetadata(name="chr1", length=0) val two = SequenceMetadata(name="chr2", length=0, aliases=Seq("chr1")) diff --git a/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala new file mode 100644 index 000000000..9ba4ba211 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala @@ -0,0 +1,266 @@ +/* + * The MIT License + * + * Copyright (c) 2021 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.UnitSpec +import htsjdk.tribble.bed.{BEDFeature, FullBEDFeature, SimpleBEDFeature} +import org.scalactic.{Equality, Explicitly} +import org.scalatest.OptionValues._ + +import java.io.ByteArrayInputStream +import scala.io.Source + +/** Unit tests for [[BedSource]]. */ +class BedSourceTest extends UnitSpec with Explicitly { + + /** A sequence dictionary for unit testing. */ + private val Dict: SequenceDictionary = SequenceDictionary(SequenceMetadata("chr1", length = 10000)) + + /** Convenience method for building a Full BED Feature from a contig, start, end, and name. */ + private def bed(contig: String, start: Int, end: Int, name: String): FullBEDFeature = { + val feature = new FullBEDFeature(contig, start, end) + feature.setName(name) + feature + } + + /** Equality helper for BED features that only compares contig, start, end, and name. */ + private val equalityByLocatableAndName: Equality[BEDFeature] = (a: BEDFeature, b: Any) => b match { + case expected: BEDFeature => a.contigsMatch(expected) && + a.getStart == expected.getStart && + a.getEnd == expected.getEnd && + a.getName == expected.getName + case _ => false + } + + /** Contents of a BED file without a header for testing. */ + private val BedWithoutHeader: String = + """chr19 49302000 49302300 -1.0 + |chr19 49302300 49302600 -0.75 + |chr19 49302600 49302900 -0.50 + |chr19 49302900 49303200 -0.25 + |chr19 49303200 49303500 0.0 + |chr19 49303500 49303800 0.25 + |chr19 49303800 49304100 0.50 + |chr19 49304100 49304400 0.75 + |chr20 10 100 1.00 + """.stripMargin.trim + + /** Deserialized version of [[BedWithoutHeader]] for testing. */ + private val BedWithoutHeaderExpected: Seq[FullBEDFeature] = { + Seq( + bed("chr19", 49302001, 49302300, "-1.0"), + bed("chr19", 49302301, 49302600, "-0.75"), + bed("chr19", 49302601, 49302900, "-0.50"), + bed("chr19", 49302901, 49303200, "-0.25"), + bed("chr19", 49303201, 49303500, "0.0"), + bed("chr19", 49303501, 49303800, "0.25"), + bed("chr19", 49303801, 49304100, "0.50"), + bed("chr19", 49304101, 49304400, "0.75"), + bed("chr20", 11, 100, "1.00"), + ) + } + + /** Contents of a BED file (actually a bedGraph) with a complex header for testing. */ + private val BedWithHeader: String = + """browser position chr19:49302001-49304701 + |browser hide all + |browser pack refGene encodeRegions + |browser full altGraph + |# 300 base wide bar graph, autoScale is on by default == graphing + |# limits will dynamically change to always show full range of data + |# in viewing window, priority = 20 positions this as the second graph + |# Note, zero-relative, half-open coordinate system in use for bedGraph format + |track type=bedGraph name="BedGraph Format" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 + |chr19 49302000 49302300 -1.0 + |chr19 49302300 49302600 -0.75 + |chr19 49302600 49302900 -0.50 + |chr19 49302900 49303200 -0.25 + |chr19 49303200 49303500 0.0 + |chr19 49303500 49303800 0.25 + |chr19 49303800 49304100 0.50 + |chr19 49304100 49304400 0.75 + |chr19 49304400 49304700 1.00 + """.stripMargin.trim + + /** Deserialized version of [[BedWithHeader]] for testing. */ + private val BedWithHeaderExpected: Seq[FullBEDFeature] = { + Seq( + bed("chr19", 49302001, 49302300, "-1.0"), + bed("chr19", 49302301, 49302600, "-0.75"), + bed("chr19", 49302601, 49302900, "-0.50"), + bed("chr19", 49302901, 49303200, "-0.25"), + bed("chr19", 49303201, 49303500, "0.0"), + bed("chr19", 49303501, 49303800, "0.25"), + bed("chr19", 49303801, 49304100, "0.50"), + bed("chr19", 49304101, 49304400, "0.75"), + bed("chr19", 49304401, 49304700, "1.00"), + ) + } + + "BedSource" should "read a single simple BED data record" in { + val actual = BedSource("chr1 100\n".linesIterator, dict = None) + val expected = Seq(new SimpleBEDFeature(101, 101, "chr1")) + actual.dict shouldBe None + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + } + + it should "read no BED features from an empty iterator" in { + val actual = BedSource(Iterator.empty, dict = None) + actual.dict shouldBe None + actual.header shouldBe empty + actual.toList shouldBe empty + } + + it should "return BED features from a list of line records" in { + val actual = BedSource(BedWithoutHeader.linesIterator, dict = None) + actual.dict shouldBe None + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs BedWithoutHeaderExpected) (decided by equalityByLocatableAndName) + } + + it should "return BED features from a list of line records and keep a copy of the header" in { + val actual = BedSource(BedWithHeader.linesIterator, dict = None) + actual.dict shouldBe None + actual.header shouldBe Seq( + "browser position chr19:49302001-49304701", + "browser hide all", + "browser pack refGene encodeRegions", + "browser full altGraph", + "#\t300 base wide bar graph, autoScale is on by default == graphing", + "#\tlimits will dynamically change to always show full range of data", + "#\tin viewing window, priority = 20 positions this as the second graph", + "#\tNote, zero-relative, half-open coordinate system in use for bedGraph format", + "track type=bedGraph name=\"BedGraph Format\" description=\"BedGraph format\" visibility=full color=200,100,0 altColor=0,100,200 priority=20", + ) + (actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected) (decided by equalityByLocatableAndName) + } + + it should "raise an exception when the underlying BED codec cannot read the data line" in { + an[IllegalStateException] shouldBe thrownBy { BedSource("\n".linesIterator, dict = None).toList } + an[IllegalStateException] shouldBe thrownBy { BedSource(" \n".linesIterator, dict = None).toList } + an[IllegalStateException] shouldBe thrownBy { BedSource("chr1\n".linesIterator, dict = None).toList } + } + + it should "have a sequence dictionary if one is passed to the constructor" in { + val source = BedSource(Iterator.empty, dict = Some(Dict)) + source.dict.value shouldBe Dict + source.header shouldBe empty + source.toList shouldBe empty + } + + it should "raise a NoSuchElementException if a BED record has a contig not in the optional sequence dictionary" in { + val source = BedSource("chr2 100\n".linesIterator, dict = Some(Dict)) + source.dict.value shouldBe Dict + source.header shouldBe empty + val caught = intercept[NoSuchElementException] { source.toList } + caught.getMessage should include ("Contig does not exist within dictionary for locatable") + caught.getMessage should include ("Failed on line number: 1") + } + + it should "raise a IllegalArgumentException if a BED record has a start value less than 1 (1-based)" in { + val source = BedSource("chr1 -1 50\n".linesIterator, dict = Some(Dict)) + source.dict.value shouldBe Dict + source.header shouldBe empty + val caught = intercept[IllegalArgumentException] { source.toList } + caught.getMessage should include ("Start is less than 1 for locatable") + caught.getMessage should include ("Failed on line number: 1") + } + + it should "raise a IllegalArgumentException if a BED record has an end value greater than the contig length" in { + val source = BedSource("chr1 0 10001\n".linesIterator, dict = Some(Dict)) + source.dict.value shouldBe Dict + source.header shouldBe empty + val caught = intercept[IllegalArgumentException] { source.toList } + caught.getMessage should include ("End is beyond the reference contig length for locatable") + caught.getMessage should include ("Failed on line number: 1") + } + + it should "raise a IllegalArgumentException if a BED record has a start value greater than an end value" in { + val source = BedSource("chr1 100 50\n".linesIterator, dict = Some(Dict)) + source.dict.value shouldBe Dict + source.header shouldBe empty + val caught = intercept[IllegalArgumentException] { source.toList } + caught.getMessage should include ("Start is greater than end for locatable") + caught.getMessage should include ("Failed on line number: 1") + } + + it should "know which line of input triggered a validation exception" in { + val source = BedSource("chr1 100\nchr2 100".linesIterator, dict = Some(Dict)) + source.dict.value shouldBe Dict + source.header shouldBe empty + val caught = intercept[NoSuchElementException] { source.toList } + caught.getMessage should include ("Contig does not exist within dictionary for locatable") + caught.getMessage should include ("Failed on line number: 2") + } + + "BedSource.apply" should "allow sourcing BED features from an iterable of string data" in { + val actual = BedSource(Seq("chr1 100"), dict = Some(Dict)) + val expected = Seq(new SimpleBEDFeature(101, 101, "chr1")) + actual.dict.value shouldBe Dict + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + } + + it should "allow sourcing BED features from an input stream of string data" in { + val stream = new ByteArrayInputStream("chr1 100\n".getBytes(java.nio.charset.StandardCharsets.UTF_8.name)) + val actual = BedSource(stream, dict = Some(Dict)) + val expected = Seq(new SimpleBEDFeature(101, 101, "chr1")) + actual.dict.value shouldBe Dict + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + } + + it should "allow sourcing BED features from a source of string data" in { + val source = Source.fromString("chr1 100\n") + val actual = BedSource(source, dict = Some(Dict)) + val expected = Seq(new SimpleBEDFeature(101, 101, "chr1")) + actual.dict.value shouldBe Dict + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + } + + it should "allow sourcing BED features from a file of string data" in { + val path = Io.makeTempFile(getClass.getSimpleName, ".bed") + Io.writeLines(path, Seq("chr1 100")) + val actual = BedSource(path.toFile, dict = Some(Dict)) + val expected = Seq(new SimpleBEDFeature(101, 101, "chr1")) + actual.dict.value shouldBe Dict + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + } + + it should "allow sourcing BED features from a path of string data" in { + val path = Io.makeTempFile(getClass.getSimpleName, ".bed") + Io.writeLines(path, Seq("chr1 100")) + val actual = BedSource(path, dict = Some(Dict)) + val expected = Seq(new SimpleBEDFeature(101, 101, "chr1")) + actual.dict.value shouldBe Dict + actual.header shouldBe empty + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + } +} diff --git a/src/test/scala/com/fulcrumgenomics/util/IntervalListSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/IntervalListSourceTest.scala index 20e7a7343..7471260d2 100644 --- a/src/test/scala/com/fulcrumgenomics/util/IntervalListSourceTest.scala +++ b/src/test/scala/com/fulcrumgenomics/util/IntervalListSourceTest.scala @@ -59,42 +59,42 @@ class IntervalListSourceTest extends UnitSpec { } it should "fail if no header is present" in { - val exception = intercept[Exception] { IntervalListSource(Seq("chr1\t1\t1\t+\tname")) } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("chr1\t1\t1\t+\tname")) } exception.getMessage should include("No header found") } it should "fail if no sequence dictionary is present" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "chr1\t1\t1\t+\tname")) } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("@HD\tVN:1.0", "chr1\t1\t1\t+\tname")) } exception.getMessage should include("No reference sequences found") } it should "fail if a line does not have exactly five fields" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1")).toSeq } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1")).toSeq } exception.getMessage should include("Expected 5 fields on line 3") } it should "fail if an interval's contig is not found the in the sequence dictionary" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr2\t1\t1\t+\tname")).toSeq } - exception.getMessage should include("key not found: chr2") + val exception = intercept[NoSuchElementException] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr2\t1\t1\t+\tname")).toSeq } + exception.getMessage should include("Contig does not exist within dictionary") } it should "fail if an interval's start is less than one" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t0\t1\t+\tname")).toSeq } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t0\t1\t+\tname")).toSeq } exception.getMessage should include("Start is less than 1") } it should "fail if an interval's end is beyond the contig's length in the sequence dictionary" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t1\t10001\t+\tname")).toSeq } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t1\t10001\t+\tname")).toSeq } exception.getMessage should include("End is beyond") } it should "fail if an interval's start is greater than its end" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t2\t1\t+\tname")).toSeq } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t2\t1\t+\tname")).toSeq } exception.getMessage should include("Start is greater than end") } it should "fail if the strand cannot be recognized" in { - val exception = intercept[Exception] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t1\t1\tN\tname")).toSeq } + val exception = intercept[IllegalArgumentException] { IntervalListSource(Seq("@HD\tVN:1.0", "@SQ\tSN:chr1\tLN:10000", "chr1\t1\t1\tN\tname")).toSeq } exception.getMessage should include("Unrecognized strand") } } diff --git a/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala new file mode 100644 index 000000000..4229956c0 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala @@ -0,0 +1,265 @@ +/* + * The MIT License + * + * Copyright (c) 2021 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.util + +import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.UnitSpec +import htsjdk.samtools.SAMFileHeader +import htsjdk.samtools.util.Interval +import org.scalatest.OptionValues._ + +import java.io.ByteArrayInputStream +import scala.io.Source + +/** Unit tests for [[IntervalSource]]. */ +class IntervalSourceTest extends UnitSpec { + + /** A sequence dictionary for unit testing. */ + private val Dict: SequenceDictionary = SequenceDictionary( + SequenceMetadata(name = "chr1", length = 50000000), + SequenceMetadata(name = "chr2", length = 50000) + ) + + /** Contents of a BED file without a header for testing. */ + private val BedWithoutHeader: String = + """chr19 49302000 49302300 -1.0 + |chr19 49302300 49302600 -0.75 + |chr19 49302600 49302900 -0.50 + |chr19 49302900 49303200 -0.25 + |chr19 49303200 49303500 0.0 + |chr19 49303500 49303800 0.25 + |chr19 49303800 49304100 0.50 + |chr19 49304100 49304400 0.75 + |chr20 10 100 1.00 + """.stripMargin.trim + + /** Deserialized version of [[BedWithoutHeader]] for testing. */ + private val BedWithoutHeaderExpected: Seq[Interval] = { + Seq( + new Interval("chr19", 49302001, 49302300, false, "-1.0"), + new Interval("chr19", 49302301, 49302600, false, "-0.75"), + new Interval("chr19", 49302601, 49302900, false, "-0.50"), + new Interval("chr19", 49302901, 49303200, false, "-0.25"), + new Interval("chr19", 49303201, 49303500, false, "0.0"), + new Interval("chr19", 49303501, 49303800, false, "0.25"), + new Interval("chr19", 49303801, 49304100, false, "0.50"), + new Interval("chr19", 49304101, 49304400, false, "0.75"), + new Interval("chr20", 11, 100, false, "1.00"), + ) + } + + /** Contents of a BED file (actually a bedGraph) with a complex header for testing. */ + private val BedWithHeader: String = + """browser position chr19:49302001-49304701 + |browser hide all + |browser pack refGene encodeRegions + |browser full altGraph + |# 300 base wide bar graph, autoScale is on by default == graphing + |# limits will dynamically change to always show full range of data + |# in viewing window, priority = 20 positions this as the second graph + |# Note, zero-relative, half-open coordinate system in use for bedGraph format + |track type=bedGraph name="BedGraph Format" description="BedGraph format" visibility=full color=200,100,0 altColor=0,100,200 priority=20 + |chr19 49302000 49302300 -1.0 + |chr19 49302300 49302600 -0.75 + |chr19 49302600 49302900 -0.50 + |chr19 49302900 49303200 -0.25 + |chr19 49303200 49303500 0.0 + |chr19 49303500 49303800 0.25 + |chr19 49303800 49304100 0.50 + |chr19 49304100 49304400 0.75 + |chr19 49304400 49304700 1.00 + """.stripMargin.trim + + /** Deserialized version of [[BedWithHeader]] for testing. */ + private val BedWithHeaderExpected: Seq[Interval] = { + Seq( + new Interval("chr19", 49302001, 49302300, false, "-1.0"), + new Interval("chr19", 49302301, 49302600, false, "-0.75"), + new Interval("chr19", 49302601, 49302900, false, "-0.50"), + new Interval("chr19", 49302901, 49303200, false, "-0.25"), + new Interval("chr19", 49303201, 49303500, false, "0.0"), + new Interval("chr19", 49303501, 49303800, false, "0.25"), + new Interval("chr19", 49303801, 49304100, false, "0.50"), + new Interval("chr19", 49304101, 49304400, false, "0.75"), + new Interval("chr19", 49304401, 49304700, false, "1.00"), + ) + } + + /** Contents of an Interval List header for testing. */ + private val IntervalListData: String = + "@HD\tVN:1.0\n" + "@SQ\tSN:chr1\tLN:50000000\n" + "@SQ\tSN:chr2\tLN:50000\n" + + "chr1\t49302000\t49302300\t+\tname1\n" + + "chr1\t49302300\t49302600\t+\tname2\n" + + "chr1\t49302600\t49302900\t+\tname3\n" + + "chr1\t49302900\t49303200\t+\tname4\n" + + "chr1\t49303200\t49303500\t+\tname5\n" + + "chr1\t49303500\t49303800\t+\tname6\n" + + "chr1\t49303800\t49304100\t+\tname7\n" + + "chr1\t49304100\t49304400\t+\tname8\n" + + "chr2\t10\t100\t-\tname9\n" + + /** Deserialized version of [[IntervalListData]] for testing. */ + private val IntervalListExpected: Seq[Interval] = Seq( + new Interval("chr1", 49302000, 49302300, false, "name1"), + new Interval("chr1", 49302300, 49302600, false, "name2"), + new Interval("chr1", 49302600, 49302900, false, "name3"), + new Interval("chr1", 49302900, 49303200, false, "name4"), + new Interval("chr1", 49303200, 49303500, false, "name5"), + new Interval("chr1", 49303500, 49303800, false, "name6"), + new Interval("chr1", 49303800, 49304100, false, "name7"), + new Interval("chr1", 49304100, 49304400, false, "name8"), + new Interval("chr2", 10, 100, true, "name9"), + ) + + "IntervalSource" should "read a single simple BED data record into an interval" in { + val actual = IntervalSource("chr1 100\n".linesIterator, dict = None) + val expected = Seq(new Interval("chr1", 101, 101)) + actual.dict shouldBe None + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs expected + } + + it should "read no intervals at all from an empty iterator" in { + val actual = IntervalSource(Iterator.empty, dict = None) + actual.dict shouldBe None + actual.header shouldBe None + actual.toList shouldBe empty + } + + it should "return BED intervals from a list of line records" in { + val actual = IntervalSource(BedWithoutHeader.linesIterator, dict = None) + actual.dict shouldBe None + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs BedWithoutHeaderExpected + } + + it should "return BED intervals, skipping a header, from a list of line records" in { + val actual = IntervalSource(BedWithHeader.linesIterator, dict = None) + actual.dict shouldBe None + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected + } + + it should "read a single simple Interval List data record into an interval" in { + val data = "@HD\tVN:1.0\n" + "@SQ\tSN:chr1\tLN:50000000\n" + "chr1\t100\t100\t+\t.\n" + val dict = SequenceDictionary(SequenceMetadata("chr1", length = 50000000)) + val header = new SAMFileHeader() + header.setAttribute("VN", "1.0") + header.setSequenceDictionary(dict.asSam) + + val source = IntervalSource(data.linesIterator, dict = None) + val expected = Seq(new Interval("chr1", 100, 100, false, null)) + + source.dict.value shouldBe dict + source.header.value shouldBe header + source.toList should contain theSameElementsInOrderAs expected + } + + it should "read multiple intervals from an interval list" in { + val source = IntervalSource(IntervalListData.linesIterator, dict = None) + val header = new SAMFileHeader() + header.setAttribute("VN", "1.0") + header.setSequenceDictionary(Dict.asSam) + + source.dict.value shouldBe Dict + source.header.value shouldBe header + source.toList should contain theSameElementsInOrderAs IntervalListExpected + } + + it should "assert the provided sequence dictionary matches the found sequence dictionary for interval list input" in { + val invalid = SequenceDictionary(SequenceMetadata("chrX", length =2)) + val caught = intercept[IllegalArgumentException] { IntervalSource(IntervalListData.linesIterator, dict = Some(invalid)) } + caught.getMessage should include ("Provided sequence dictionary does not match the input's dict header!") + } + + it should "pass through the sequence dictionary when supplied for BED input" in { + val dict = SequenceDictionary( + SequenceMetadata(name = "chr19", length = 50000000), + SequenceMetadata(name = "chr20", length = 50000) + ) + val actual = IntervalSource(BedWithHeader.linesIterator, dict = Some(dict)) + actual.dict.value shouldBe dict + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected + } + + it should "assert the records match the provided sequence dictionary for BED input" in { + val dict = SequenceDictionary( + SequenceMetadata(name = "chr19", length = 50000000), + SequenceMetadata(name = "chr20", length = 50000) + ) + val caught = intercept[NoSuchElementException] { IntervalSource("chr1 100".linesIterator, dict = Some(dict)).toList } + caught.getMessage should include ("Contig does not exist within dictionary for locatable") + caught.getMessage should include ("Failed on line number: 1") + } + + "IntervalSource.apply" should "allow sourcing intervals from an iterable of string data" in { + val actual = IntervalSource(Seq("chr1 100"), dict = Some(Dict)) + val expected = Seq(new Interval("chr1", 101, 101)) + actual.dict.value shouldBe Dict + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs expected + } + + it should "allow sourcing interval from an input stream of string data" in { + val stream = new ByteArrayInputStream("chr1 100\n".getBytes(java.nio.charset.StandardCharsets.UTF_8.name)) + val actual = IntervalSource(stream, dict = Some(Dict)) + val expected = Seq(new Interval("chr1", 101, 101)) + actual.dict.value shouldBe Dict + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs expected + } + + it should "allow sourcing intervals from a source of string data" in { + val source = Source.fromString("chr1 100\n") + val actual = IntervalSource(source, dict = Some(Dict)) + val expected = Seq(new Interval("chr1", 101, 101)) + actual.dict.value shouldBe Dict + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs expected + } + + it should "allow sourcing intervals from a file of string data" in { + val path = Io.makeTempFile(getClass.getSimpleName, ".bed") + Io.writeLines(path, Seq("chr1 100")) + val actual = IntervalSource(path.toFile, dict = Some(Dict)) + val expected = Seq(new Interval("chr1", 101, 101)) + actual.dict.value shouldBe Dict + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs expected + } + + it should "allow sourcing intervals from a path of string data" in { + val path = Io.makeTempFile(getClass.getSimpleName, ".bed") + Io.writeLines(path, Seq("chr1 100")) + val actual = IntervalSource(path, dict = Some(Dict)) + val expected = Seq(new Interval("chr1", 101, 101)) + actual.dict.value shouldBe Dict + actual.header shouldBe None + actual.toList should contain theSameElementsInOrderAs expected + } +} From e5b530853e3774b099c4f936d7d67a543561aae5 Mon Sep 17 00:00:00 2001 From: Clint Valentine Date: Sun, 28 Feb 2021 12:12:07 -0500 Subject: [PATCH 2/3] Close the underlying sources in unit tests --- src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala | 1 + src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala | 1 + 2 files changed, 2 insertions(+) diff --git a/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala index 9ba4ba211..6199423b2 100644 --- a/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala +++ b/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala @@ -242,6 +242,7 @@ class BedSourceTest extends UnitSpec with Explicitly { actual.dict.value shouldBe Dict actual.header shouldBe empty (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableAndName) + actual.close() } it should "allow sourcing BED features from a file of string data" in { diff --git a/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala index 4229956c0..01bd241fc 100644 --- a/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala +++ b/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala @@ -241,6 +241,7 @@ class IntervalSourceTest extends UnitSpec { actual.dict.value shouldBe Dict actual.header shouldBe None actual.toList should contain theSameElementsInOrderAs expected + actual.close() } it should "allow sourcing intervals from a file of string data" in { From 7c635f79d27197f30fd1c72d71955d8eb89ca41b Mon Sep 17 00:00:00 2001 From: Clint Valentine Date: Sun, 28 Feb 2021 14:09:45 -0500 Subject: [PATCH 3/3] Support filling all fields of Interval from BEDFeature --- .../com/fulcrumgenomics/util/BedSource.scala | 2 +- .../fulcrumgenomics/util/IntervalSource.scala | 17 ++++- .../fulcrumgenomics/util/BedSourceTest.scala | 28 ++++---- .../util/IntervalSourceTest.scala | 72 +++++++++++++++---- 4 files changed, 87 insertions(+), 32 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/util/BedSource.scala b/src/main/scala/com/fulcrumgenomics/util/BedSource.scala index 83d4bebe7..30e473152 100644 --- a/src/main/scala/com/fulcrumgenomics/util/BedSource.scala +++ b/src/main/scala/com/fulcrumgenomics/util/BedSource.scala @@ -109,7 +109,7 @@ object BedSource { } /** Creates a new BED source from a File. */ - def apply(file: File, dict: Option[SequenceDictionary]): BedSource = apply(path=file.toPath, dict) + def apply(file: File, dict: Option[SequenceDictionary]): BedSource = apply(path = file.toPath, dict) /** Creates a new BED source from a Path. */ def apply(path: PathToIntervals, dict: Option[SequenceDictionary]): BedSource = apply(Io.readLines(path), dict) diff --git a/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala b/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala index 2089be203..363526c68 100644 --- a/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala +++ b/src/main/scala/com/fulcrumgenomics/util/IntervalSource.scala @@ -31,6 +31,7 @@ import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.util.IntervalListSource.{HeaderPrefix => IntervalListHeaderPrefix} import htsjdk.samtools.SAMFileHeader import htsjdk.samtools.util.Interval +import htsjdk.tribble.annotation.Strand import java.io.{Closeable, File, InputStream} import scala.io.Source @@ -55,8 +56,18 @@ class IntervalSource private( require(sd.forall(wrapped.dict.sameAs), "Provided sequence dictionary does not match the input's dict header!") (wrapped, Some(wrapped.dict), Some(wrapped.header)) } else { - val wrapped = BedSource(iter, sd) - (wrapped.map(feature => new Interval(feature)), sd, None) + val wrapped = BedSource(iter, sd).map { bed => + new Interval( + bed.getContig, + bed.getStart, + bed.getEnd, + // BEDFeature.getStrand() can be null so wrap in an option and search for the negative enum. + Option(bed.getStrand).contains(Strand.NEGATIVE), + // The default name for BEDFeature is the empty string (""), but defaults to null for Interval. + if (bed.getName == "") null else bed.getName + ) + } + (wrapped, sd, None) } /** The [[SAMFileHeader]] associated with the source, if it exists. */ @@ -95,7 +106,7 @@ object IntervalSource { } /** Creates a new interval source from a File. */ - def apply(file: File, dict: Option[SequenceDictionary]): IntervalSource = apply(path=file.toPath, dict) + def apply(file: File, dict: Option[SequenceDictionary]): IntervalSource = apply(path = file.toPath, dict) /** Creates a new interval source from a Path. */ def apply(path: PathToIntervals, dict: Option[SequenceDictionary]): IntervalSource = apply(Io.readLines(path), dict) diff --git a/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala index 6199423b2..c74adea9d 100644 --- a/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala +++ b/src/test/scala/com/fulcrumgenomics/util/BedSourceTest.scala @@ -37,6 +37,15 @@ import scala.io.Source /** Unit tests for [[BedSource]]. */ class BedSourceTest extends UnitSpec with Explicitly { + /** Equality helper for BED features that only compares contig, start, end, and name. */ + private val equalityByLocatableAndName: Equality[BEDFeature] = (a: BEDFeature, b: Any) => b match { + case expected: BEDFeature => a.contigsMatch(expected) && + a.getStart == expected.getStart && + a.getEnd == expected.getEnd && + Option(a.getName) == Option(expected.getName) + case _ => false + } + /** A sequence dictionary for unit testing. */ private val Dict: SequenceDictionary = SequenceDictionary(SequenceMetadata("chr1", length = 10000)) @@ -47,15 +56,6 @@ class BedSourceTest extends UnitSpec with Explicitly { feature } - /** Equality helper for BED features that only compares contig, start, end, and name. */ - private val equalityByLocatableAndName: Equality[BEDFeature] = (a: BEDFeature, b: Any) => b match { - case expected: BEDFeature => a.contigsMatch(expected) && - a.getStart == expected.getStart && - a.getEnd == expected.getEnd && - a.getName == expected.getName - case _ => false - } - /** Contents of a BED file without a header for testing. */ private val BedWithoutHeader: String = """chr19 49302000 49302300 -1.0 @@ -130,7 +130,7 @@ class BedSourceTest extends UnitSpec with Explicitly { } it should "read no BED features from an empty iterator" in { - val actual = BedSource(Iterator.empty, dict = None) + val actual = BedSource(Iterator.empty, dict = None) actual.dict shouldBe None actual.header shouldBe empty actual.toList shouldBe empty @@ -209,13 +209,13 @@ class BedSourceTest extends UnitSpec with Explicitly { caught.getMessage should include ("Failed on line number: 1") } - it should "know which line of input triggered a validation exception" in { - val source = BedSource("chr1 100\nchr2 100".linesIterator, dict = Some(Dict)) + it should "know which line of input triggered a validation exception including header lines" in { + val source = BedSource("# comment\nchr1 100\nchr2 100\n".linesIterator, dict = Some(Dict)) source.dict.value shouldBe Dict - source.header shouldBe empty + source.header shouldBe Seq("# comment") val caught = intercept[NoSuchElementException] { source.toList } caught.getMessage should include ("Contig does not exist within dictionary for locatable") - caught.getMessage should include ("Failed on line number: 2") + caught.getMessage should include ("Failed on line number: 3") } "BedSource.apply" should "allow sourcing BED features from an iterable of string data" in { diff --git a/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala b/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala index 01bd241fc..a4a8e1307 100644 --- a/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala +++ b/src/test/scala/com/fulcrumgenomics/util/IntervalSourceTest.scala @@ -30,13 +30,25 @@ import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} import com.fulcrumgenomics.testing.UnitSpec import htsjdk.samtools.SAMFileHeader import htsjdk.samtools.util.Interval +import htsjdk.tribble.annotation.Strand +import org.scalactic.{Equality, Explicitly} import org.scalatest.OptionValues._ import java.io.ByteArrayInputStream import scala.io.Source /** Unit tests for [[IntervalSource]]. */ -class IntervalSourceTest extends UnitSpec { +class IntervalSourceTest extends UnitSpec with Explicitly { + + /** Equality helper for intervals that fully compares them by contig, start, end, name, and strand. */ + private val equalityByLocatableNameAndStrand: Equality[Interval] = (a: Interval, b: Any) => b match { + case expected: Interval => a.contigsMatch(expected) && + a.getStart == expected.getStart && + a.getEnd == expected.getEnd && + a.getStrand == expected.getStrand && + Option(a.getName) == Option(expected.getName) + case _ => false + } /** A sequence dictionary for unit testing. */ private val Dict: SequenceDictionary = SequenceDictionary( @@ -140,11 +152,11 @@ class IntervalSourceTest extends UnitSpec { val expected = Seq(new Interval("chr1", 101, 101)) actual.dict shouldBe None actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs expected + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } it should "read no intervals at all from an empty iterator" in { - val actual = IntervalSource(Iterator.empty, dict = None) + val actual = IntervalSource(Iterator.empty, dict = None) actual.dict shouldBe None actual.header shouldBe None actual.toList shouldBe empty @@ -154,14 +166,14 @@ class IntervalSourceTest extends UnitSpec { val actual = IntervalSource(BedWithoutHeader.linesIterator, dict = None) actual.dict shouldBe None actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs BedWithoutHeaderExpected + (actual.toList should contain theSameElementsInOrderAs BedWithoutHeaderExpected) (decided by equalityByLocatableNameAndStrand) } it should "return BED intervals, skipping a header, from a list of line records" in { val actual = IntervalSource(BedWithHeader.linesIterator, dict = None) actual.dict shouldBe None actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected + (actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected) (decided by equalityByLocatableNameAndStrand) } it should "read a single simple Interval List data record into an interval" in { @@ -172,11 +184,11 @@ class IntervalSourceTest extends UnitSpec { header.setSequenceDictionary(dict.asSam) val source = IntervalSource(data.linesIterator, dict = None) - val expected = Seq(new Interval("chr1", 100, 100, false, null)) + val expected = Seq(new Interval("chr1", 100, 100, false, ".")) source.dict.value shouldBe dict source.header.value shouldBe header - source.toList should contain theSameElementsInOrderAs expected + (source.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } it should "read multiple intervals from an interval list" in { @@ -187,7 +199,39 @@ class IntervalSourceTest extends UnitSpec { source.dict.value shouldBe Dict source.header.value shouldBe header - source.toList should contain theSameElementsInOrderAs IntervalListExpected + (source.toList should contain theSameElementsInOrderAs IntervalListExpected) (decided by equalityByLocatableNameAndStrand) + } + + it should "read an interval that has a name intentionally set from a BED source" in { + val actual = IntervalSource("chr1 100 101 interval-name\n".linesIterator, dict = None) + val expected = Seq(new Interval("chr1", 101, 101, false, "interval-name")) + actual.dict shouldBe None + actual.header shouldBe None + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) + } + + it should "read an interval that has the strand set to positive from a BED source" in { + val actual = IntervalSource("chr1 100 101 interval-name 500 +\n".linesIterator, dict = None) + val expected = Seq(new Interval("chr1", 101, 101, false, "interval-name")) + actual.dict shouldBe None + actual.header shouldBe None + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) + } + + it should "read an interval that has the strand set to the unknown value from a BED source" in { + val actual = IntervalSource("chr1 100 101 interval-name 500 .\n".linesIterator, dict = None) + val expected = Seq(new Interval("chr1", 101, 101, false, "interval-name")) + actual.dict shouldBe None + actual.header shouldBe None + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) + } + + it should "read an interval that has the strand set to negative from a BED source" in { + val actual = IntervalSource("chr1 100 101 interval-name 500 -\n".linesIterator, dict = None) + val expected = Seq(new Interval("chr1", 101, 101, true, "interval-name")) + actual.dict shouldBe None + actual.header shouldBe None + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } it should "assert the provided sequence dictionary matches the found sequence dictionary for interval list input" in { @@ -204,7 +248,7 @@ class IntervalSourceTest extends UnitSpec { val actual = IntervalSource(BedWithHeader.linesIterator, dict = Some(dict)) actual.dict.value shouldBe dict actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected + (actual.toList should contain theSameElementsInOrderAs BedWithHeaderExpected) (decided by equalityByLocatableNameAndStrand) } it should "assert the records match the provided sequence dictionary for BED input" in { @@ -222,7 +266,7 @@ class IntervalSourceTest extends UnitSpec { val expected = Seq(new Interval("chr1", 101, 101)) actual.dict.value shouldBe Dict actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs expected + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } it should "allow sourcing interval from an input stream of string data" in { @@ -231,7 +275,7 @@ class IntervalSourceTest extends UnitSpec { val expected = Seq(new Interval("chr1", 101, 101)) actual.dict.value shouldBe Dict actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs expected + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } it should "allow sourcing intervals from a source of string data" in { @@ -240,7 +284,7 @@ class IntervalSourceTest extends UnitSpec { val expected = Seq(new Interval("chr1", 101, 101)) actual.dict.value shouldBe Dict actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs expected + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) actual.close() } @@ -251,7 +295,7 @@ class IntervalSourceTest extends UnitSpec { val expected = Seq(new Interval("chr1", 101, 101)) actual.dict.value shouldBe Dict actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs expected + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } it should "allow sourcing intervals from a path of string data" in { @@ -261,6 +305,6 @@ class IntervalSourceTest extends UnitSpec { val expected = Seq(new Interval("chr1", 101, 101)) actual.dict.value shouldBe Dict actual.header shouldBe None - actual.toList should contain theSameElementsInOrderAs expected + (actual.toList should contain theSameElementsInOrderAs expected) (decided by equalityByLocatableNameAndStrand) } }