From 9744cbcdd949f45949b0412c135383ef38ac5a0e Mon Sep 17 00:00:00 2001
From: jrobinso <933148+jrobinso@users.noreply.github.com>
Date: Sat, 7 Sep 2024 11:44:03 -0700
Subject: [PATCH] Refactor bb reader (bigwig / bigbed) to account for files
 with unusual layouts.  Specifically chromTreeOffset > fullDataOffset, but
 also others.

---
 js/bigwig/bwReader.js  | 103 ++++++++++++++++++++++++++++++-----------
 js/bigwig/chromTree.js |   3 +-
 js/binary.js           |  14 ++++++
 test/testBigwig.js     |  16 ++++++-
 4 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/js/bigwig/bwReader.js b/js/bigwig/bwReader.js
index 63e4b0f1f..87c6af6c0 100644
--- a/js/bigwig/bwReader.js
+++ b/js/bigwig/bwReader.js
@@ -281,6 +281,16 @@ class BWReader {
         }
     }
 
+    /**
+     * The BB header consists of
+     *  (1) the common header
+     *  (2) the zoom headers
+     *  (3) autosql
+     *  (4) total summary block (version 2 and later)
+     *
+     *  In addition, we read the chromosome B+ tree
+     * @returns {Promise<*>}
+     */
     async loadHeader() {
 
         if (this.header) {
@@ -298,7 +308,7 @@ class BWReader {
             // Assume low-to-high unless proven otherwise
             this.littleEndian = true
 
-            let binaryParser = new BinaryParser(new DataView(data), this.littleEndian)
+            const binaryParser = new BinaryParser(new DataView(data), this.littleEndian)
             let magic = binaryParser.getUInt()
             if (magic === BIGWIG_MAGIC_LTH) {
                 this.type = "bigwig"
@@ -335,30 +345,33 @@ class BWReader {
                 extensionOffset: binaryParser.getLong()
             }
 
+            // Read the next chunk containing zoom headers, autosql, and total summary if present.  TotalSummary size = 40 bytes
             const startOffset = BBFILE_HEADER_SIZE
+            const size = header.totalSummaryOffset > 0 ?
+                header.totalSummaryOffset - startOffset + 40 :
+                Math.min(header.fullDataOffset, header.chromTreeOffset) - startOffset
             let range = {
                 start: startOffset,
-                size: (header.fullDataOffset - startOffset + 4)
+                size: size
             }
             data = await this.loader.loadArrayBuffer(this.path, buildOptions(this.config, {range: range}))
-
-            const nZooms = header.nZoomLevels
-            binaryParser = new BinaryParser(new DataView(data), this.littleEndian)
+            const extHeaderParser = new BinaryParser(new DataView(data), this.littleEndian)
 
             // Load zoom headers, store in order of decreasing reduction level (increasing resolution)
+            const nZooms = header.nZoomLevels
             this.zoomLevelHeaders = []
             this.firstZoomDataOffset = Number.MAX_SAFE_INTEGER
             for (let i = 1; i <= nZooms; i++) {
                 const zoomNumber = nZooms - i
-                const zlh = new ZoomLevelHeader(zoomNumber, binaryParser)
+                const zlh = new ZoomLevelHeader(zoomNumber, extHeaderParser)
                 this.firstZoomDataOffset = Math.min(zlh.dataOffset, this.firstZoomDataOffset)
                 this.zoomLevelHeaders[zoomNumber] = zlh
             }
 
             // Autosql
             if (header.autoSqlOffset > 0) {
-                binaryParser.position = header.autoSqlOffset - startOffset
-                const autoSqlString = binaryParser.getString()
+                extHeaderParser.position = header.autoSqlOffset - startOffset
+                const autoSqlString = extHeaderParser.getString()
                 if (autoSqlString) {
                     this.autoSql = parseAutoSQL(autoSqlString)
                 }
@@ -366,35 +379,73 @@ class BWReader {
 
             // Total summary
             if (header.totalSummaryOffset > 0) {
-                binaryParser.position = header.totalSummaryOffset - startOffset
-                this.totalSummary = new BWTotalSummary(binaryParser)
-            }
-
-            // Chrom data index
-            if (header.chromTreeOffset > 0) {
-                binaryParser.position = header.chromTreeOffset - startOffset
-                this.chromTree = await ChromTree.parseTree(binaryParser, startOffset, this.genome)
-                this.chrNames = new Set(this.chromTree.idToName)
-            } else {
-                // TODO -- this is an error, not expected
-                throw "BigWig chromosome tree offset <= 0"
+                extHeaderParser.position = header.totalSummaryOffset - startOffset
+                this.totalSummary = new BWTotalSummary(extHeaderParser)
             }
 
-            //Finally total data count
-            binaryParser.position = header.fullDataOffset - startOffset
-            header.dataCount = binaryParser.getInt()
+            // Chrom data index.  The start is known, size is not, but we can estimate it
+            const bufferSize = Math.min(200000, Math.max(10000, header.fullDataOffset - header.chromTreeOffset))
+            this.chromTree = await this.#readChromTree(header.chromTreeOffset, bufferSize)
+            this.chrNames = new Set(this.chromTree.idToName)
 
-            this.featureDensity = header.dataCount / this.chromTree.sumLengths
+            // Estimate feature density from dataCount (bigbed only)
+            if("bigbed" === this.type) {
+                const dataCount = await this.#readDataCount(header.fullDataOffset)
+                this.featureDensity = dataCount / this.chromTree.sumLengths
+            }
 
             this.header = header
 
-
             //extension
             if (header.extensionOffset > 0) {
                 await this.loadExtendedHeader(header.extensionOffset)
             }
-          return this.header
+            return this.header
+        }
+    }
+
+    /**
+     * Fetch 4 bytes at "offset" and decode them with BinaryParser.getInt(), honoring this.littleEndian.
+     * @param offset  file position to read from (loadHeader passes header.fullDataOffset)
+     * @returns {Promise<*>}  the decoded integer
+     */
+    async #readDataCount(offset) {
+        const range = {start: offset, size: 4}
+        const data = await this.loader.loadArrayBuffer(this.path, buildOptions(this.config, {range: range}))
+        return new BinaryParser(new DataView(data), this.littleEndian).getInt()
+    }
+
+    /**
+     * Load and parse the chromosome B+ tree.  The start offset is known but the size of the tree is not, so
+     * "bufferSize" bytes are requested first; each time loading or parsing throws, the size is doubled and retried.
+     * @param chromTreeOffset  file offset of the start of the chromosome tree
+     * @param bufferSize  number of bytes to request on the first attempt
+     * @returns {Promise<ChromTree>}  the parsed tree; rejects with the last error once the size reaches maxSize
+     */
+    async #readChromTree(chromTreeOffset, bufferSize) {
+        const maxSize = 1000000
+        const load = async (nBytes) => {
+            const data = await this.loader.loadArrayBuffer(this.path, buildOptions(this.config, {
+                range: {
+                    start: chromTreeOffset,
+                    size: nBytes
+                }
+            }))
+            const binaryParser = new BinaryParser(new DataView(data), this.littleEndian)
+            return ChromTree.parseTree(binaryParser, chromTreeOffset, this.genome)
+        }
+
+        let size = bufferSize
+        let error
+        do {    // do...while: always attempt at least once, so "error" is defined if we fall through to the throw
+            try {
+                return await load(size)
+            } catch (e) {
+                error = e
+                size *= 2
+            }
+        } while (size < maxSize)
+        throw error
+    }
 
     async loadExtendedHeader(offset) {
diff --git a/js/bigwig/chromTree.js b/js/bigwig/chromTree.js
index 2fa631b78..cbb938303 100644
--- a/js/bigwig/chromTree.js
+++ b/js/bigwig/chromTree.js
@@ -32,7 +32,6 @@ export default class ChromTree {
             const idToName = []
             let sumLengths = 0
             const readTreeNode = (offset) => {
-
                 if (offset >= 0) binaryParser.position = offset
                 const type = binaryParser.getByte()
                 const reserved = binaryParser.getByte()
@@ -69,7 +68,7 @@ export default class ChromTree {
             }
 
             // Recursively walk tree to populate dictionary
-            readTreeNode(binaryParser, -1)
+            readTreeNode( -1)
 
             return new ChromTree(header, nameToId, idToName, sumLengths)
         }
diff --git a/js/binary.js b/js/binary.js
index 9184b87a5..b540c87e0 100644
--- a/js/binary.js
+++ b/js/binary.js
@@ -34,6 +34,20 @@ class BinaryParser {
         this.length = dataView.byteLength
     }
 
+    /**
+     * Print "n" bytes, read with getByte(), to the console; the parser position is restored afterwards.  Used for debugging.
+     * @param n  number of bytes to print (default 100)
+     */
+    dumpBytes (n = 100) {
+        const pos = this.position
+        const bytes = []
+        for(let i=0; i < n; i++) {
+            bytes.push(this.getByte())
+        }
+        console.log(bytes.join(" "))
+        this.setPosition(pos)
+    }
+
     setPosition(position) {
         this.position = position
     }
diff --git a/test/testBigwig.js b/test/testBigwig.js
index 73276db4b..6fe12d715 100644
--- a/test/testBigwig.js
+++ b/test/testBigwig.js
@@ -11,7 +11,6 @@ suite("testBigWig", function () {
 
         this.timeout(10000)
 
-        //chr21:19,146,376-19,193,466
         const url = "https://s3.amazonaws.com/igv.org.test/data/uncompressed.bw",
             chr = "chr21",
             start = 0,
@@ -21,14 +20,27 @@ suite("testBigWig", function () {
         const bwReader = new BWReader({url: url})
         const features = await bwReader.readFeatures(chr, start, chr, end, bpPerPixel)
         assert.equal(features.length, 8)   // Verified in iPad app
+    })
+
+
+    /**
+     * Test a BW file with an unusual layout (chromTree after full data).
+     */
+    test("chromTree", async function () {
 
+        this.timeout(10000)
+
+        const url = "https://data.broadinstitute.org/igvdata/test/data/bb/chromTreeTest.bigwig"
+        const bwReader = new BWReader({url: url})
+        const header = await bwReader.loadHeader()
+        assert.ok(header)
+        assert.equal(bwReader.chrNames.size, 6)
     })
 
     test("bigwig", async function () {
 
         this.timeout(10000)
 
-        //chr21:19,146,376-19,193,466
         const url = "test/data/bb/fixedStep.bw"
         const chr = "chr1"
         const bwReader = new BWReader({url: url})