From a31621119c50e3e1470168f88fd7c22f7361d06c Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Mon, 20 May 2024 09:53:03 -0700 Subject: [PATCH] feat: FastqToBam can extract UMI(s) from the comment in the read name --- .../fulcrumgenomics/fastq/FastqToBam.scala | 19 +++++++-- .../scala/com/fulcrumgenomics/umi/Umis.scala | 40 +++++++++++++++++- .../fastq/FastqToBamTest.scala | 31 ++++++++++++++ .../com/fulcrumgenomics/umi/UmisTest.scala | 41 +++++++++++++++++-- 4 files changed, 122 insertions(+), 9 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala b/src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala index 01e3f6998..982d7db9f 100644 --- a/src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala +++ b/src/main/scala/com/fulcrumgenomics/fastq/FastqToBam.scala @@ -71,12 +71,15 @@ import htsjdk.samtools.{ReservedTagConstants, SAMFileHeader, SAMReadGroupRecord} |For more information on read structures see the |[Read Structure Wiki Page](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures) | - |UMIs may be extracted from the read sequences, the read names, or both. If `--extract-umis-from-read-names` is + |UMIs may be extracted from the read sequences, the read names (or comment), or both. If `--extract-umis-from-read-names` is |specified, any UMIs present in the read names are extracted; read names are expected to be `:`-separated with |any UMIs present in the 8th field. If this option is specified, the `--umi-qual-tag` option may not be used as |qualities are not available for UMIs in the read name. If UMI segments are present in the read structures those |will also be extracted. If UMIs are present in both, the final UMIs are constructed by first taking the UMIs - |from the read names, then adding a hyphen, then the UMIs extracted from the reads. + |from the read names, then adding a hyphen, then the UMIs extracted from the reads. If `--extract-umis-from-read-comment` is + |specified, any UMIs present in the read name comments are extracted; the read name comment is the text _after_ + |the first white space in the read name (like a FASTA). If the comment is `:`-separated, then the UMI will be + |extracted from the last field, otherwise the full comment will be used. | |The same number of input files and read structures must be provided, with one exception: if supplying exactly |1 or 2 fastq files, both of which are solely template reads, no read structures need be provided. @@ -93,7 +96,10 @@ class FastqToBam @arg(flag='u', doc="Tag in which to store molecular barcodes/UMIs.") val umiTag: String = ConsensusTags.UmiBases, @arg(flag='q', doc="Tag in which to store molecular barcode/UMI qualities.") val umiQualTag: Option[String] = None, @arg(flag='Q', doc="Store the sample barcode qualities in the QT Tag.") val storeSampleBarcodeQualities: Boolean = false, - @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.") val extractUmisFromReadNames: Boolean = false, + @arg(flag='n', doc="Extract UMI(s) from read names and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadComment")) + val extractUmisFromReadNames: Boolean = false, + @arg(flag='c', doc="Extract UMI(s) from read name comment and prepend to UMIs from reads.", mutex=Array("extractUmisFromReadNames")) + val extractUmisFromReadComment: Boolean = false, @arg( doc="Read group ID to use in the file header.") val readGroupId: String = "A", @arg( doc="The name of the sequenced sample.") val sample: String, @arg( doc="The name/ID of the sequenced library.") val library: String, @@ -117,6 +123,7 @@ class FastqToBam validate(input.length == actualReadStructures.length, "input and read-structure must be supplied the same number of times.") validate(1 to 2 contains actualReadStructures.flatMap(_.templateSegments).size, "read structures must contain 1-2 template reads total.") validate(!extractUmisFromReadNames || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read names.") + validate(!extractUmisFromReadComment || umiQualTag.isEmpty, "Cannot extract UMI qualities when also extracting UMI from read description.") override def execute(): Unit = { val encoding = qualityEncoding @@ -166,7 +173,11 @@ class FastqToBam val templates = subs.iterator.filter(_.kind == Template).toList // If requested, pull out the UMI(s) from the read name - val umiFromReadName = if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) else None + val umiFromReadName = { + if (extractUmisFromReadNames) Umis.extractUmisFromReadName(fqs.head.name, strict=true) + else if (extractUmisFromReadComment) fqs.head.comment.flatMap(comment => Umis.extractUmisFromReadComment(comment, strict=true)) + else None + } templates.zipWithIndex.map { case (read, index) => // If the template read had no bases, we'll substitute in a single N @ Q2 below to keep htsjdk happy diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 2fbd4cfe9..93a7b750e 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -55,7 +55,7 @@ object Umis { /** * Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as: - * @::::::: ::: + * `@::::::: :::`` * * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI @@ -63,7 +63,7 @@ object Umis { * translated to hyphens before returning. * * If `strict` is true the name _must_ contain either 7 or 8 colon-separated segments, - with the UMI being the last in the case of 8 and `None` in the case of 7. + * with the UMI being the last in the case of 8 and `None` in the case of 7. * * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. */ @@ -88,6 +88,42 @@ object Umis { else umi } + /** + * Extracts the UMI from an Illumina fastq style read name. Illumina documents their FASTQ read names as: + * `@::::::: :::`` + * + * The index in the description may be replaced with a UMI field. + * + * See https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm + * The UMI field is optional, so read names may or may not contain it. Illumina also specifies that the UMI + * field may contain multiple UMIs, in which case they will delimit them with `+` characters. Pluses will be + * translated to hyphens before returning. + * + * If `strict` is true the comment _must_ contain either 4 colon-separated segments, + * with the UMI being the last in the case of 4. + * + * If `strict` is false the last segment is returned so long as it appears to be a valid UMI. + */ + def extractUmisFromReadComment(comment: String, delimiter: Char = ':', strict: Boolean): Option[String] = { + // If strict, check that the read name actually has eight parts, which is expected + val rawUmi = if (strict) { + val colons = comment.count(_ == delimiter) + if (colons == 3) Some(comment.substring(comment.lastIndexOf(delimiter) + 1, comment.length)) + else throw new IllegalArgumentException(s"Trying to extract UMI from read comment with ${colons + 1} parts (4 expected): ${comment}") + } else { + val idx = comment.lastIndexOf(delimiter) + if (idx == -1) Some(comment) + else Some(comment.substring(idx + 1, comment.length)) + } + + val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase) + val valid = umi.forall(u => u.forall(isValidUmiCharacter)) + + if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${comment}") + else if (!valid) None + else umi + } + @inline private def isValidUmiCharacter(ch: Char): Boolean = { ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-' } diff --git a/src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala b/src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala index 7941e681c..1c39096a3 100644 --- a/src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala +++ b/src/test/scala/com/fulcrumgenomics/fastq/FastqToBamTest.scala @@ -383,6 +383,37 @@ class FastqToBamTest extends UnitSpec { recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA" } + + it should "extract UMIs from the comment in read names when requested" in { + val r1 = fq( + FastqRecord("q1:2:3:4:5:6:7:8", "AAAAAAAAAA", "==========", comment=Some("0:1:2:ACGT")), + FastqRecord("q2:2:3:4:5:6:7:8", "TAAAAAAAAA", "==========", comment=Some("0:1:2:TTGA")), + FastqRecord("q3:2:3:4:5:6:7", "TAAAAAAAAA", "=========="), + ) + val bam = makeTempFile("fastqToBamTest.", ".bam") + new FastqToBam(input=Seq(r1), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute() + val recs = readBamRecs(bam) + recs should have size 3 + recs(0).apply[String]("RX") shouldBe "ACGT" + recs(1).apply[String]("RX") shouldBe "TTGA" + recs(2).get[String]("RX") shouldBe None + } + + it should "extract UMIs from the comment in read names and read sequences" in { + val r1 = fq( + FastqRecord("q1:2:3:4:5:6:7:8", "GGNCCGAAAAAAA", "=============", comment=Some("0:1:2:ACGT+CGTA")), + FastqRecord("q2:2:3:4:5:6:7:8", "TANAACAAAAAAA", "=============", comment=Some("0:1:2:TTGA+TAAT")), + ) + val rs = ReadStructure("2M1S2M+T") + val bam = makeTempFile("fastqToBamTest.", ".bam") + new FastqToBam(input=Seq(r1), readStructures=Seq(rs), output=bam, sample="s", library="l", extractUmisFromReadComment=true).execute() + val recs = readBamRecs(bam) + recs should have size 2 + recs(0).apply[String]("RX") shouldBe "ACGT-CGTA-GG-CC" + recs(1).apply[String]("RX") shouldBe "TTGA-TAAT-TA-AA" + } + + it should "fail when read names don't match up" in { val r1 = fq(FastqRecord("q1", "AAAAAAAAAA", "==========")) val r2 = fq(FastqRecord("x1", "CCCCCCCCCC", "##########")) diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 96298b246..70adb5249 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -43,7 +43,7 @@ class UmisTest extends UnitSpec with OptionValues { Umis.extractUmisFromReadName("1:2:3:4:5:6:7:ACGTACGT", strict=true).value shouldBe "ACGTACGT" } - it should "return None if the read name has only 7 parts in strict mode" in { + it should "return None if the read has only 7 parts in strict mode" in { Umis.extractUmisFromReadName("1:2:3:4:5:6:7", strict=true) shouldBe None } @@ -78,6 +78,41 @@ class UmisTest extends UnitSpec with OptionValues { Umis.extractUmisFromReadName("1:2:3:4:5:6:7:8", strict=false) shouldBe None } + "Umis.extractUmisFromReadComment" should "extract a UMI from a well-formed read name" in { + Umis.extractUmisFromReadComment("0:1:2:ACGTACGT", strict=true).value shouldBe "ACGTACGT" + } + + it should "throw an exception in strict mode if the read has too many or too few segments" in { + an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:ACGTACGT", strict=true) } + an[Exception] shouldBe thrownBy { Umis.extractUmisFromReadComment("0:1:2:3:ACGTACGT", strict=true) } + } + + it should "translate pluses to hyphens when multiple UMIs are present" in { + Umis.extractUmisFromReadComment("0:1:2:ACGTACGT+TTGCGGCT", strict=true).value shouldBe "ACGTACGT-TTGCGGCT" + } + + it should "extract a UMI from the last segment in non-strict mode, if it looks like a UMI" in { + Umis.extractUmisFromReadComment("ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:4:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:4:5:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:4:5:6:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:ACGT", strict=false).value shouldBe "ACGT" + Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8:9:ACGT", strict=false).value shouldBe "ACGT" + } + + it should "return None in non-strict mode if the last segment doesn't look like a UMI" in { + Umis.extractUmisFromReadComment("1:2", strict=false) shouldBe None + Umis.extractUmisFromReadComment("1:2:3", strict=false) shouldBe None + Umis.extractUmisFromReadComment("1:2:3:4", strict=false) shouldBe None + Umis.extractUmisFromReadComment("1:2:3:4:5", strict=false) shouldBe None + Umis.extractUmisFromReadComment("1:2:3:4:5:6", strict=false) shouldBe None + Umis.extractUmisFromReadComment("1:2:3:4:5:6:7", strict=false) shouldBe None + Umis.extractUmisFromReadComment("1:2:3:4:5:6:7:8", strict=false) shouldBe None + } "Umis.copyUmiFromReadName" should "copy the UMI from the read name" in { copyUmiFromReadName(rec=rec("UMI:A")).nameAndUmi shouldBe ("UMI:A", "A") @@ -91,7 +126,7 @@ class UmisTest extends UnitSpec with OptionValues { copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT"), removeUmi=true).nameAndUmi shouldBe ("UMI:C", "ACC-GGT") } - it should "split on a different name delimiter if specified" in { + it should "split on a different delimiter if specified" in { copyUmiFromReadName(rec=rec("UMI-A"), delimiter='-').nameAndUmi shouldBe ("UMI-A", "A") copyUmiFromReadName(rec=rec("UMI-C-A"), delimiter='-').nameAndUmi shouldBe ("UMI-C-A", "A") copyUmiFromReadName(rec=rec("UMI-C-ACC+GGT"), delimiter='-').nameAndUmi shouldBe ("UMI-C-ACC+GGT", "ACC-GGT") @@ -101,7 +136,7 @@ class UmisTest extends UnitSpec with OptionValues { copyUmiFromReadName(rec=rec("UMI:C:ACC+GGT")).nameAndUmi shouldBe ("UMI:C:ACC+GGT", "ACC-GGT") } - it should "fail if the read name has only one field" in { + it should "fail if the read has only one field" in { an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME")) an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("1-2")) }