From 9744cbcdd949f45949b0412c135383ef38ac5a0e Mon Sep 17 00:00:00 2001 From: jrobinso <933148+jrobinso@users.noreply.github.com> Date: Sat, 7 Sep 2024 11:44:03 -0700 Subject: [PATCH] Refactor bb reader (bigwig / bigbed) to account for files with unusual layouts. Specifically chromTreeOffset > fullDataOffset, but also others. --- js/bigwig/bwReader.js | 103 ++++++++++++++++++++++++++++++----------- js/bigwig/chromTree.js | 3 +- js/binary.js | 14 ++++++ test/testBigwig.js | 16 ++++++- 4 files changed, 106 insertions(+), 30 deletions(-) diff --git a/js/bigwig/bwReader.js b/js/bigwig/bwReader.js index 63e4b0f1f..87c6af6c0 100644 --- a/js/bigwig/bwReader.js +++ b/js/bigwig/bwReader.js @@ -281,6 +281,16 @@ class BWReader { } } + /** + * The BB header consists of + * (1) the common header + * (2) the zoom headers + * (3) autosql + * (4) total summary block (version 2 and later) + * + * In addition, we read the chromosome B+ tree + * @returns {Promise<*>} + */ async loadHeader() { if (this.header) { @@ -298,7 +308,7 @@ class BWReader { // Assume low-to-high unless proven otherwise this.littleEndian = true - let binaryParser = new BinaryParser(new DataView(data), this.littleEndian) + const binaryParser = new BinaryParser(new DataView(data), this.littleEndian) let magic = binaryParser.getUInt() if (magic === BIGWIG_MAGIC_LTH) { this.type = "bigwig" @@ -335,30 +345,33 @@ class BWReader { extensionOffset: binaryParser.getLong() } + // Read the next chunk containing zoom headers, autosql, and total summary if present. TotalSummary size = 40 bytes const startOffset = BBFILE_HEADER_SIZE + const size = header.totalSummaryOffset > 0 ? 
+ header.totalSummaryOffset - startOffset + 40 : + Math.min(header.fullDataOffset, header.chromTreeOffset) - startOffset let range = { start: startOffset, - size: (header.fullDataOffset - startOffset + 4) + size: size } data = await this.loader.loadArrayBuffer(this.path, buildOptions(this.config, {range: range})) - - const nZooms = header.nZoomLevels - binaryParser = new BinaryParser(new DataView(data), this.littleEndian) + const extHeaderParser = new BinaryParser(new DataView(data), this.littleEndian) // Load zoom headers, store in order of decreasing reduction level (increasing resolution) + const nZooms = header.nZoomLevels this.zoomLevelHeaders = [] this.firstZoomDataOffset = Number.MAX_SAFE_INTEGER for (let i = 1; i <= nZooms; i++) { const zoomNumber = nZooms - i - const zlh = new ZoomLevelHeader(zoomNumber, binaryParser) + const zlh = new ZoomLevelHeader(zoomNumber, extHeaderParser) this.firstZoomDataOffset = Math.min(zlh.dataOffset, this.firstZoomDataOffset) this.zoomLevelHeaders[zoomNumber] = zlh } // Autosql if (header.autoSqlOffset > 0) { - binaryParser.position = header.autoSqlOffset - startOffset - const autoSqlString = binaryParser.getString() + extHeaderParser.position = header.autoSqlOffset - startOffset + const autoSqlString = extHeaderParser.getString() if (autoSqlString) { this.autoSql = parseAutoSQL(autoSqlString) } @@ -366,35 +379,73 @@ class BWReader { // Total summary if (header.totalSummaryOffset > 0) { - binaryParser.position = header.totalSummaryOffset - startOffset - this.totalSummary = new BWTotalSummary(binaryParser) - } - - // Chrom data index - if (header.chromTreeOffset > 0) { - binaryParser.position = header.chromTreeOffset - startOffset - this.chromTree = await ChromTree.parseTree(binaryParser, startOffset, this.genome) - this.chrNames = new Set(this.chromTree.idToName) - } else { - // TODO -- this is an error, not expected - throw "BigWig chromosome tree offset <= 0" + extHeaderParser.position = header.totalSummaryOffset - 
startOffset + this.totalSummary = new BWTotalSummary(extHeaderParser) } - //Finally total data count - binaryParser.position = header.fullDataOffset - startOffset - header.dataCount = binaryParser.getInt() + // Chrom data index. The start is known, size is not, but we can estimate it + const bufferSize = Math.min(200000, Math.max(10000, header.fullDataOffset - header.chromTreeOffset)) + this.chromTree = await this.#readChromTree(header.chromTreeOffset, bufferSize) + this.chrNames = new Set(this.chromTree.idToName) - this.featureDensity = header.dataCount / this.chromTree.sumLengths + // Estimate feature density from dataCount (bigbed only) + if("bigbed" === this.type) { + const dataCount = await this.#readDataCount(header.fullDataOffset) + this.featureDensity = dataCount / this.chromTree.sumLengths + } this.header = header - //extension if (header.extensionOffset > 0) { await this.loadExtendedHeader(header.extensionOffset) } - return this.header + return this.header + } + } + + async #readDataCount(offset) { + const data = await this.loader.loadArrayBuffer(this.path, buildOptions(this.config, { + range: { + start: offset, + size: 4 + } + })) + const binaryParser = new BinaryParser(new DataView(data), this.littleEndian) + return binaryParser.getInt() + } + + /** + * Used when the chromTreeOffset is > fullDataOffset, that is when the chrom tree is not in the initial chunk + * read for parsing the header. 
We know the start position, but not the total size of the chrom tree + * + * @returns {Promise} + */ + async #readChromTree(chromTreeOffset, bufferSize) { + + let size = bufferSize + const load = async () => { + const data = await this.loader.loadArrayBuffer(this.path, buildOptions(this.config, { + range: { + start: chromTreeOffset, + size: size + } + })) + const binaryParser = new BinaryParser(new DataView(data), this.littleEndian) + return ChromTree.parseTree(binaryParser, chromTreeOffset, this.genome) + } + + let error + while (size < 1000000) { + try { + const chromTree = await load() + return chromTree + } catch (e) { + error = e + size *= 2 + } } + throw (error) } async loadExtendedHeader(offset) { diff --git a/js/bigwig/chromTree.js b/js/bigwig/chromTree.js index 2fa631b78..cbb938303 100644 --- a/js/bigwig/chromTree.js +++ b/js/bigwig/chromTree.js @@ -32,7 +32,6 @@ export default class ChromTree { const idToName = [] let sumLengths = 0 const readTreeNode = (offset) => { - if (offset >= 0) binaryParser.position = offset const type = binaryParser.getByte() const reserved = binaryParser.getByte() @@ -69,7 +68,7 @@ export default class ChromTree { } // Recursively walk tree to populate dictionary - readTreeNode(binaryParser, -1) + readTreeNode( -1) return new ChromTree(header, nameToId, idToName, sumLengths) } diff --git a/js/binary.js b/js/binary.js index 9184b87a5..b540c87e0 100644 --- a/js/binary.js +++ b/js/binary.js @@ -34,6 +34,20 @@ class BinaryParser { this.length = dataView.byteLength } + /** + * Print the first "n" bytes to the console. Used for debugging. 
+ * @param n + */ + dumpBytes (n = 100) { + const pos = this.position + const bytes = [] + for(let i=0; i<= n; i++) { + bytes.push(this.getByte()) + } + console.log(bytes.join(" ")) + this.setPosition(pos) + } + setPosition(position) { this.position = position } diff --git a/test/testBigwig.js b/test/testBigwig.js index 73276db4b..6fe12d715 100644 --- a/test/testBigwig.js +++ b/test/testBigwig.js @@ -11,7 +11,6 @@ suite("testBigWig", function () { this.timeout(10000) - //chr21:19,146,376-19,193,466 const url = "https://s3.amazonaws.com/igv.org.test/data/uncompressed.bw", chr = "chr21", start = 0, @@ -21,14 +20,27 @@ suite("testBigWig", function () { const bwReader = new BWReader({url: url}) const features = await bwReader.readFeatures(chr, start, chr, end, bpPerPixel) assert.equal(features.length, 8) // Verified in iPad app + }) + + + /** + * Test a BW file with an unusual layout (chromTree after full data). + */ + test("chromTree", async function () { + this.timeout(10000) + + const url = "https://data.broadinstitute.org/igvdata/test/data/bb/chromTreeTest.bigwig" + const bwReader = new BWReader({url: url}) + const header = await bwReader.loadHeader() + assert.ok(header) + assert.equal(bwReader.chrNames.size, 6) }) test("bigwig", async function () { this.timeout(10000) - //chr21:19,146,376-19,193,466 const url = "test/data/bb/fixedStep.bw" const chr = "chr1" const bwReader = new BWReader({url: url})