diff --git a/package.json b/package.json index 27508fe19258a..b6f335c7b7e35 100644 --- a/package.json +++ b/package.json @@ -10,7 +10,6 @@ "autoprefixer": "^10.4.14", "babel-loader": "^9.1.2", "caniuse-lite": "^1.0.30001489", - "canvas": "^2.11.2", "core-js": "^3.30.2", "cross-env": "^7.0.3", "es-module-shims": "1.4.7", @@ -41,7 +40,6 @@ "postcss": "^8.4.23", "postcss-dir-pseudo-class": "^7.0.2", "prettier": "^2.8.8", - "puppeteer": "^20.5.0", "rimraf": "^3.0.2", "streamqueue": "^1.1.2", "stylelint": "^15.6.2", @@ -59,7 +57,6 @@ "yargs": "^17.7.2" }, "scripts": { - "postinstall": "cross-env PUPPETEER_PRODUCT=firefox node node_modules/puppeteer/install.js" }, "repository": { "type": "git", diff --git a/src/core/annotation.js b/src/core/annotation.js index 7937479d452aa..5beb266704594 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -495,6 +495,7 @@ class Annotation { hasOwnCanvas: false, noRotate: !!(this.flags & AnnotationFlag.NOROTATE), noHTML: isLocked && isContentLocked, + isZotero: (dict.get("NM") || '').startsWith('Zotero-') || dict.get("Zotero:Key"), }; if (params.collectFields) { diff --git a/src/core/document.js b/src/core/document.js index 93f31b7950b97..51475cc0f258c 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -58,6 +58,8 @@ import { StructTreePage } from "./struct_tree.js"; import { writeObject } from "./writer.js"; import { XFAFactory } from "./xfa/factory.js"; import { XRef } from "./xref.js"; +import { getParagraphs } from './text/structure.js'; +import { OutlineAnalyzer, PageAnalyzer } from './text/analyzer.js'; const DEFAULT_USER_UNIT = 1.0; const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792]; @@ -460,6 +462,22 @@ class Page { intentDisplay = !!(intent & RenderingIntentFlag.DISPLAY), intentPrint = !!(intent & RenderingIntentFlag.PRINT); + const allowedSubtypes = [ + 'Link', + 'Widget', + 'Line', + 'Circle', + 'PolyLine', + 'Polygon', + 'Caret', + 'Squiggly', + 'StrikeOut', + 'Stamp' + ]; + + annotations = 
annotations.filter(x => allowedSubtypes.includes(x.data.subtype) + || ['Square', 'Ink', 'FreeText'].includes(x.data.subtype) && !x.data.isZotero); + // Collect the operator list promises for the annotations. Each promise // is resolved with the complete operator list for a single annotation. const opListPromises = []; @@ -554,6 +572,47 @@ class Page { }); } + async getStructuredText({ handler, task, data }) { + let items = []; + let sink = {}; + sink.enqueue = function (a, b) { + items.push(...a.items); + }; + + try { + await this.extractTextContent({ + handler, + task, + sink, + includeMarkedContent: data.includeMarkedContent, + combineTextItems: data.combineTextItems, + }); + } catch (e) { + console.log(e); + throw e; + } + + let fingerprints = new Set(); + let chars = []; + for (let item of items) { + if (!item.chars) { + continue; + } + for (let char of item.chars) { + // Some PDF files have their text layer characters repeated many times, therefore remove them + let fingerprint = char.c + char.rect.join(''); + if (!fingerprints.has(fingerprint)) { + fingerprints.add(fingerprint); + char.index = chars.length; + chars.push(char); + + } + } + } + let paragraphs = getParagraphs(chars); + return { paragraphs }; + } + async getStructTree() { const structTreeRoot = await this.pdfManager.ensureCatalog( "structTreeRoot" @@ -780,6 +839,7 @@ class PDFDocument { this.xref = new XRef(stream, pdfManager); this._pagePromises = new Map(); this._version = null; + this._structuredTexts = []; const idCounters = { font: 0, @@ -1517,6 +1577,58 @@ class PDFDocument { } } + async getPageData({ handler, task, data }) { + let { pageIndex } = data; + let structuredTextProvider = async (pageIndex) => { + if (this._structuredTexts[pageIndex]) { + return this._structuredTexts[pageIndex]; + } + let page = await this.getPage(pageIndex); + let structuredText; + try { + structuredText = await page.getStructuredText({ handler, task, data }); + this._structuredTexts[pageIndex] = structuredText; 
+ } catch (e) { + console.log(e); + } + return structuredText; + }; + + let structuredText = await structuredTextProvider(pageIndex); + let page = await this.getPage(pageIndex); + + let pageAnalyzer = new PageAnalyzer(pageIndex, this, structuredTextProvider); + let overlays = await pageAnalyzer.getOverlays(); + let pageLabel = await pageAnalyzer.getPageLabel(); + let pageData = { + structuredText, + overlays, + viewBox: page.view, + pageLabel + }; + return pageData; + } + + async getOutline2({ handler, task, data = {} }) { + let { extract } = data; + let structuredTextProvider = async (pageIndex) => { + if (this._structuredTexts[pageIndex]) { + return this._structuredTexts[pageIndex]; + } + let page = await this.getPage(pageIndex); + let structuredText; + try { + structuredText = await page.getStructuredText({ handler, task, data }); + this._structuredTexts[pageIndex] = structuredText; + } catch (e) { + console.log(e); + } + return structuredText; + }; + let outlineAnalyzer = new OutlineAnalyzer(this, structuredTextProvider); + return outlineAnalyzer.getOutline(extract); + } + async checkLastPage(recoveryMode = false) { const { catalog, pdfManager } = this; diff --git a/src/core/evaluator.js b/src/core/evaluator.js index a03fa7010286a..88115370e67ab 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2284,6 +2284,7 @@ class PartialEvaluator { transform: null, fontName: null, hasEOL: false, + chars: [], }; // Use a circular buffer (length === 2) to save the last chars in the @@ -2518,6 +2519,7 @@ class PartialEvaluator { transform: textChunk.transform, fontName: textChunk.fontName, hasEOL: textChunk.hasEOL, + chars: textChunk.chars, }; } @@ -2843,6 +2845,9 @@ class PartialEvaluator { scaledDim = 0; } + let prevWidth = textChunk.width; + let m = Util.transform(textState.ctm, textState.textMatrix); + if (!font.vertical) { scaledDim *= textState.textHScale; textState.translateTextMatrix(scaledDim, 0); @@ -2869,6 +2874,120 @@ class PartialEvaluator { } 
/**
 * Snaps an angle to the nearest axis-aligned rotation.
 *
 * The distance is measured around the circle, so an angle just below 360
 * (e.g. 350 or 359) snaps to 0 rather than to 270. On an exact tie the first
 * candidate in the order 0, 90, 180, 270 wins.
 *
 * @param {number} degrees - Angle in degrees; any value is normalized to
 *   [0, 360) first.
 * @returns {number} One of 0, 90, 180 or 270.
 */
function closestStandardAngle(degrees) {
  const standardAngles = [0, 90, 180, 270];
  const normalized = ((degrees % 360) + 360) % 360;
  let closestAngle = standardAngles[0];
  let minDifference = Infinity;

  for (const angle of standardAngles) {
    const direct = Math.abs(normalized - angle);
    // Going the other way around the circle may be shorter.
    const difference = Math.min(direct, 360 - direct);
    if (difference < minDifference) {
      minDifference = difference;
      closestAngle = angle;
    }
  }

  return closestAngle;
}

/**
 * Derives the rotation of a transform matrix, snapped to 0/90/180/270.
 *
 * The angle is taken from `atan2(matrix[1], matrix[0])`, rounded to whole
 * degrees and passed through `closestStandardAngle`.
 *
 * @param {number[]} matrix - Transform whose first two entries are read.
 * @returns {number} One of 0, 90, 180 or 270.
 */
function matrixToDegrees(matrix) {
  let radians = Math.atan2(matrix[1], matrix[0]);
  if (radians < 0) {
    radians += 2 * Math.PI;
  }
  const degrees = Math.round(radians * (180 / Math.PI));
  return closestStandardAngle(degrees);
}
/**
 * Tells whether `c` compares between '0' and '9' (string comparison).
 */
const isNum = c => c >= '0' && c <= '9';

/**
 * Tells whether `right` directly continues `left` on the same line: the
 * horizontal gap between them is smaller than the width of `right` and their
 * `rect[1]` values differ by less than 2.
 */
function isAdjacentOnLine(left, right) {
  return Math.abs(right.rect[0] - left.rect[2]) < right.rect[2] - right.rect[0]
    && Math.abs(left.rect[1] - right.rect[1]) < 2;
}

/**
 * Reads the whole number that the digit at `idx` belongs to.
 *
 * Walks left while the previous char is an adjacent digit, then collects
 * digits to the right while they stay adjacent.
 *
 * @param {Array<{c: string, rect: number[]}>} chars
 * @param {number} idx - Index of a char inside `chars`.
 * @returns {number} The parsed base-10 integer (NaN if the text is not numeric).
 */
function getSurroundedNumber(chars, idx) {
  while (
    idx > 0 && isNum(chars[idx - 1].c)
    && isAdjacentOnLine(chars[idx - 1], chars[idx])
  ) {
    idx--;
  }

  let str = chars[idx].c;

  while (
    idx < chars.length - 1 && isNum(chars[idx + 1].c)
    && isAdjacentOnLine(chars[idx], chars[idx + 1])
  ) {
    idx++;
    str += chars[idx].c;
  }

  // Explicit radix: the text is always meant as a decimal number.
  return Number.parseInt(str, 10);
}

/**
 * Finds the first digit whose center lies within 10 units horizontally and
 * 5 units vertically of (x, y) and returns the number it belongs to.
 *
 * @returns {number|null} The number, or null when no digit is close enough.
 */
function getSurroundedNumberAtPos(chars, x, y) {
  for (let i = 0; i < chars.length; i++) {
    const ch = chars[i];
    const { x: x2, y: y2 } = getRectCenter(ch.rect);
    if (isNum(ch.c) && Math.abs(x - x2) < 10 && Math.abs(y - y2) < 5) {
      return getSurroundedNumber(chars, i);
    }
  }
  return null;
}

/**
 * Returns the center point of a `[x1, y1, x2, y2]` rect.
 */
function getRectCenter(rect) {
  return {
    x: rect[0] + (rect[2] - rect[0]) / 2,
    y: rect[1] + (rect[3] - rect[1]) / 2
  };
}

/**
 * Keeps only digit chars lying entirely in the lowest fifth
 * (`rect[3] < pageHeight / 5`) or the highest fifth
 * (`rect[1] > pageHeight * 4 / 5`) of the page height.
 */
function filterNums(chars, pageHeight) {
  return chars.filter(
    x => isNum(x.c)
      && (x.rect[3] < pageHeight / 5 || x.rect[1] > pageHeight * 4 / 5)
  );
}

/**
 * Flattens paragraphs -> lines -> words -> chars into a single array, in
 * that nesting order.
 *
 * @param {{paragraphs: Array}} [structuredText] - May be missing when the
 *   text of a page could not be produced; an empty array is returned then
 *   instead of throwing.
 * @returns {Array} The char objects (not copied).
 */
export function flattenChars(structuredText) {
  const flatCharsArray = [];
  if (!structuredText || !structuredText.paragraphs) {
    return flatCharsArray;
  }
  for (const paragraph of structuredText.paragraphs) {
    for (const line of paragraph.lines) {
      for (const word of line.words) {
        for (const charObj of word.chars) {
          flatCharsArray.push(charObj);
        }
      }
    }
  }
  return flatCharsArray;
}

/**
 * Builds the rect covering a line from `charFrom` to `charTo`.
 *
 * For a vertical line the x-extent comes from the line and the y-extent from
 * the two chars; otherwise the x-extent comes from the chars and the
 * y-extent from the line.
 */
function getLineSelectionRect(line, charFrom, charTo) {
  if (line.vertical) {
    return [
      line.rect[0],
      Math.min(charFrom.rect[1], charTo.rect[1]),
      line.rect[2],
      Math.max(charFrom.rect[3], charTo.rect[3])
    ];
  }
  return [
    Math.min(charFrom.rect[0], charTo.rect[0]),
    line.rect[1],
    Math.max(charFrom.rect[2], charTo.rect[2]),
    line.rect[3]
  ];
}
/**
 * Finds URLs and DOIs in the text of a page.
 *
 * Chars are flattened in paragraph -> line -> word order and cut into
 * sequences at word spaces, font changes and line wraps that do not look
 * like a URL continuation. Each sequence is matched against a URL pattern
 * and a DOI pattern.
 *
 * `from` and `to` are indexes into the flattened chars and are both
 * inclusive, like the sequence bounds used inside this function. They are
 * derived through an offset map, because a char's `c` may be longer than one
 * code unit, so offsets in the matched string are not char indexes.
 *
 * @param {{paragraphs: Array}} [structuredText] - When missing, no links are
 *   returned.
 * @returns {Array<{from: number, to: number, url: string, text?: string}>}
 */
function extractLinks(structuredText) {
  const links = [];
  if (!structuredText || !structuredText.paragraphs) {
    return links;
  }

  const urlRegExp = /(https?:\/\/|www\.|10\.)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/;
  const doiRegExp = /10(?:\.[0-9]{4,})?\/[^\s]*[^\s\.,]/;
  const urlBreakChars = ['&', '.', '#', '?', '/'];

  // Flatten the chars and remember which flat indexes follow a word space.
  const chars = [];
  const spaceBefore = new Set();
  for (const paragraph of structuredText.paragraphs) {
    for (const line of paragraph.lines) {
      for (const word of line.words) {
        for (const char of word.chars) {
          chars.push(char);
        }
        if (word.spaceAfter) {
          spaceBefore.add(word.to + 1);
        }
      }
    }
  }

  const sequences = [];
  let sequence = { from: 0, to: 0 };

  for (let i = 0; i < chars.length; i++) {
    const char = chars[i];
    const charBefore = chars[i - 1];

    if (spaceBefore.has(i)
      || charBefore && (
        char.fontSize !== charBefore.fontSize
        || char.fontName !== charBefore.fontName
        // x went backwards: a wrap. It only continues the sequence when the
        // vertical gap is small and one side of the wrap is a URL break char.
        || charBefore.rect[0] > char.rect[0] && (
          charBefore.rect[1] - char.rect[3] > (char.rect[3] - char.rect[1]) / 2
          || !(urlBreakChars.includes(charBefore.c) || urlBreakChars.includes(char.c))
        )
      )
    ) {
      sequences.push(sequence);
      sequence = { from: i, to: i };
    }
    else {
      sequence.to = i;
    }
  }

  if (sequence.from !== sequence.to) {
    sequences.push(sequence);
  }

  for (const { from: seqFrom, to: seqTo } of sequences) {
    let text = '';
    // offsetToIndex[k] is the flat char index that produced text[k].
    const offsetToIndex = [];
    for (let j = seqFrom; j <= seqTo; j++) {
      const { c } = chars[j];
      text += c;
      for (let k = 0; k < c.length; k++) {
        offsetToIndex.push(j);
      }
    }

    const urlMatch = text.match(urlRegExp);
    if (urlMatch) {
      let url = urlMatch[0];
      if (url.includes('@')) {
        continue;
      }
      url = url.replace(/[.)]*$/, '');
      links.push({
        from: offsetToIndex[urlMatch.index],
        to: offsetToIndex[urlMatch.index + url.length - 1],
        url,
      });
    }

    const doiMatch = text.match(doiRegExp);
    if (doiMatch) {
      links.push({
        from: offsetToIndex[doiMatch.index],
        to: offsetToIndex[doiMatch.index + doiMatch[0].length - 1],
        text: doiMatch[0],
        url: 'https://doi.org/' + encodeURIComponent(doiMatch[0]),
      });
    }
  }
  return links;
}

/**
 * Builds a sortable key "PPPPP|OOOOOO|00000" from a page index (5 digits)
 * and a char offset (6 digits), zero-padded; longer values are truncated to
 * their leading digits.
 */
function getSortIndex(pageIndex, offset) {
  return [
    pageIndex.toString().slice(0, 5).padStart(5, '0'),
    offset.toString().slice(0, 6).padStart(6, '0'),
    '00000'
  ].join('|');
}

/**
 * Distance between two `[x1, y1, x2, y2]` rects: 0 when they overlap or
 * touch, the axis gap when they are separated along one axis, and the
 * corner-to-corner distance when separated along both.
 */
function rectsDist([ax1, ay1, ax2, ay2], [bx1, by1, bx2, by2]) {
  const left = bx2 < ax1;
  const right = ax2 < bx1;
  const bottom = by2 < ay1;
  const top = ay2 < by1;

  if (top && left) {
    return Math.hypot(ax1 - bx2, ay2 - by1);
  }
  if (left && bottom) {
    return Math.hypot(ax1 - bx2, ay1 - by2);
  }
  if (bottom && right) {
    return Math.hypot(ax2 - bx1, ay1 - by2);
  }
  if (right && top) {
    return Math.hypot(ax2 - bx1, ay2 - by1);
  }
  if (left) {
    return ax1 - bx2;
  }
  if (right) {
    return bx1 - ax2;
  }
  if (bottom) {
    return ay1 - by2;
  }
  if (top) {
    return by1 - ay2;
  }

  return 0;
}

/**
 * Returns the index of the char whose rect is closest to `rect`; the first
 * one wins on ties and 0 is returned for an empty list.
 */
function getClosestOffset(chars, rect) {
  let dist = Infinity;
  let idx = 0;
  for (let i = 0; i < chars.length; i++) {
    const distance = rectsDist(chars[i].rect, rect);
    if (distance < dist) {
      dist = distance;
      idx = i;
    }
  }
  return idx;
}
one of the pages has too many digits + if ([charsNum1, charsNum2, charsNum3, charsNum4].find(x => x.length > 500)) { + return null; + } + for (let c1 = 0; c1 < charsNum1.length; c1++) { + let ch1 = charsNum1[c1]; + for (let c3 = 0; c3 < charsNum3.length; c3++) { + let ch3 = charsNum3[c3]; + let { x: x1, y: y1 } = getRectCenter(ch1.rect); + let { x: x2, y: y2 } = getRectCenter(ch3.rect); + if (Math.abs(x1 - x2) < 10 && Math.abs(y1 - y2) < 5) { + let num1 = getSurroundedNumber(charsNum1, c1); + let num3 = getSurroundedNumber(charsNum3, c3); + if (num1 && num1 + 2 === num3) { + let pos1 = { x: x1, y: y1, num: num1, idx: pageIndex }; + + + let extractedNum2 = getSurroundedNumberAtPos(chars2, x1, y1); + if (num1 + 1 === extractedNum2) { + return [pos1]; + } + + for (let c2 = 0; c2 < charsNum2.length; c2++) { + let ch2 = charsNum2[c2]; + for (let c4 = 0; c4 < charsNum4.length; c4++) { + let ch4 = charsNum4[c4]; + let { x: x1, y: y1 } = getRectCenter(ch2.rect); + let { x: x2, y: y2 } = getRectCenter(ch4.rect); + if (Math.abs(x1 - x2) < 10 && Math.abs(y1 - y2) < 5) { + let num2 = getSurroundedNumber(charsNum2, c2); + let num4 = getSurroundedNumber(charsNum4, c4); + if (num1 + 1 === num2 && num2 + 2 === num4) { + let pos2 = { x: x1, y: y1, num: num2, idx: pageIndex + 2 }; + return [pos1, pos2]; + } + } + } + } + } + } + } + } + + return null; + } + + _getPageLabel(pageIndex, charsPrev, charsCur, charsNext, points) { + let numPrev, numCur, numNext; + + // TODO: Instead of trying to extract from two positions, try to + // guess the right position by determining whether the page is even or odd + + // TODO: Take into account font parameters when comparing extracted numbers + let getNum = (charsNext, points) => points.length > 0 && getSurroundedNumberAtPos(charsNext, points[0].x, points[0].y) + || points.length > 1 && getSurroundedNumberAtPos(charsNext, points[1].x, points[1].y); + + if (charsPrev) { + numPrev = getNum(charsPrev, points); + } + + numCur = getNum(charsCur, 
points); + + if (charsNext) { + numNext = getNum(charsNext, points); + } + + if (numCur && (numCur - 1 === numPrev || numCur + 1 === numNext)) { + return numCur.toString(); + } + + if (pageIndex < points[0].idx) { + return (points[0].num - (points[0].idx - pageIndex)).toString(); + } + + return null; + } + + async _extractPageLabelPoints(pageIndex) { + let numPages = await this._getPagesNum(); + let start = pageIndex - 2; + if (start < 0) { + start = 0; + } + for (let i = start; i < start + 5 && i + 3 < numPages; i++) { + let chs1 = flattenChars(await this._structuredTextProvider(i)); + let chs2 = flattenChars(await this._structuredTextProvider(i + 1)); + let chs3 = flattenChars(await this._structuredTextProvider(i + 2)); + let chs4 = flattenChars(await this._structuredTextProvider(i + 3)); + let page = await this._pdfDocument.getPage(i); + let { view } = page; + let pageHeight = view[3] - view[1]; + let res = this._getPageLabelPoints(i, chs1, chs2, chs3, chs4, pageHeight); + if (res) { + return res; + } + } + return null; + } + + async _extractPageLabel(pageIndex, points) { + let chsPrev, chsCur, chsNext; + if (pageIndex > 0) { + chsPrev = flattenChars(await this._structuredTextProvider(pageIndex - 1)); + } + chsCur = flattenChars(await this._structuredTextProvider(pageIndex)); + let numPages = await this._getPagesNum(); + if (pageIndex < numPages - 1) { + chsNext = flattenChars(await this._structuredTextProvider(pageIndex + 1)); + } + return this._getPageLabel(pageIndex, chsPrev, chsCur, chsNext, points); + } + + async getPageLabel() { + let existingPageLabels = await this._pdfDocument.pdfManager.ensureCatalog("pageLabels"); + let pageLabel; + let points = await this._extractPageLabelPoints(this._pageIndex); + if (points) { + pageLabel = await this._extractPageLabel(this._pageIndex, points); + } + if ( + (!pageLabel || pageLabel === '0') + && existingPageLabels + && existingPageLabels[this._pageIndex] + ) { + pageLabel = existingPageLabels[this._pageIndex]; + } + 
return pageLabel; + } + + // Overlays + + async getOverlays() { + let overlays = []; + + let pageIndex = this._pageIndex; + + let structuredText = await this._structuredTextProvider(pageIndex); + let links = extractLinks(structuredText); + for (let link of links) { + let rects = getRangeRects(structuredText, link.from, link.to); + let overlay = { + type: 'external-link', + source: 'parsed', + url: link.url, + sortIndex: getSortIndex(pageIndex, link.from), + position: { + pageIndex, + rects, + }, + }; + overlays.push(overlay); + } + + let chars = flattenChars(structuredText); + let page = await this._pdfDocument.getPage(pageIndex); + let annotations = await page._parsedAnnotations; + for (let annotation of annotations) { + annotation = annotation.data; + if (!annotation.url && !annotation.dest || !annotation.rect) { + continue; + } + let offset = getClosestOffset(chars, annotation.rect); + let overlay = { + source: 'annotation', + sortIndex: getSortIndex(pageIndex, offset), + position: { + pageIndex, + rects: [annotation.rect], + } + }; + if (annotation.url) { + overlay.type = 'external-link'; + overlay.url = annotation.url; + } + else if (annotation.dest) { + overlay.type = 'internal-link'; + overlay.dest = annotation.dest; + } + else { + continue; + } + overlays.push(overlay); + } + + return overlays; + } +} + +export class OutlineAnalyzer { + constructor(pdfDocument, structuredTextProvider) { + this._pdfDocument = pdfDocument; + this._structuredTextProvider = structuredTextProvider; + } + + async getOutline(extract) { + let outline = []; + let items = await this._pdfDocument.pdfManager.ensureCatalog("documentOutline"); + function transformItems(items) { + let newItems = []; + for (let item of items) { + let newItem = { + title: item.title, + items: transformItems(item.items), + expanded: false, + }; + if (item.dest) { + newItem.location = { + dest: item.dest, + }; + } else if (item.unsafeUrl) { + newItem.url = item.unsafeUrl; + } + newItems.push(newItem); + } + 
return newItems; + } + if (items) { + outline = transformItems(items); + if (outline.length === 1) { + for (let item of outline) { + item.expanded = true; + } + } + } + return outline; + } +} diff --git a/src/core/text/structure.js b/src/core/text/structure.js new file mode 100644 index 0000000000000..a9bca3a49361e --- /dev/null +++ b/src/core/text/structure.js @@ -0,0 +1,864 @@ + +// *** bidi.js starts here *** +// This is taken from PDF.js source https://github.com/mozilla/pdf.js/blob/9416b14e8b06a39a1a57f2baf22aebab2370edeb/src/core/bidi.js + +/* Copyright 2012 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Character types for symbols from 0000 to 00FF. 
+// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt +// prettier-ignore +let baseTypes = [ + "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "S", "B", "S", + "WS", "B", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", + "BN", "BN", "BN", "BN", "B", "B", "B", "S", "WS", "ON", "ON", "ET", + "ET", "ET", "ON", "ON", "ON", "ON", "ON", "ES", "CS", "ES", "CS", "CS", + "EN", "EN", "EN", "EN", "EN", "EN", "EN", "EN", "EN", "EN", "CS", "ON", + "ON", "ON", "ON", "ON", "ON", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "L", "L", "L", "ON", "ON", "ON", "ON", "ON", "ON", "L", "L", "L", + "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "L", "L", "L", "L", "L", "L", "L", "L", "ON", "ON", "ON", "ON", + "BN", "BN", "BN", "BN", "BN", "BN", "B", "BN", "BN", "BN", "BN", "BN", + "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", + "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN", "CS", "ON", "ET", + "ET", "ET", "ET", "ON", "ON", "ON", "ON", "L", "ON", "ON", "BN", "ON", + "ON", "ET", "ET", "EN", "EN", "ON", "L", "ON", "ON", "ON", "EN", "L", + "ON", "ON", "ON", "ON", "ON", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "ON", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", "L", + "L", "L", "L", "L", "L", "ON", "L", "L", "L", "L", "L", "L", "L", "L" +]; + +// Character types for symbols from 0600 to 06FF. +// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt +// Note that 061D does not exist in the Unicode standard (see +// http://unicode.org/charts/PDF/U0600.pdf), so we replace it with an +// empty string and issue a warning if we encounter this character. The +// empty string is required to properly index the items after it. 
+// prettier-ignore +let arabicTypes = [ + "AN", "AN", "AN", "AN", "AN", "AN", "ON", "ON", "AL", "ET", "ET", "AL", + "CS", "AL", "ON", "ON", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", + "NSM", "NSM", "NSM", "NSM", "AL", "AL", "", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", + "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", + "NSM", "NSM", "NSM", "NSM", "AN", "AN", "AN", "AN", "AN", "AN", "AN", + "AN", "AN", "AN", "ET", "AN", "AN", "AL", "AL", "AL", "NSM", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", + "AL", "AL", "AL", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "AN", + "ON", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "AL", "AL", "NSM", "NSM", + "ON", "NSM", "NSM", "NSM", "NSM", "AL", "AL", "EN", "EN", "EN", "EN", + "EN", "EN", "EN", "EN", "EN", "EN", "AL", "AL", "AL", "AL", "AL", "AL" +]; + +function isOdd(i) { + return (i & 1) !== 0; +} + +function isEven(i) { + return (i & 1) === 0; +} + +function findUnequal(arr, start, value) { + let j, jj; + for (j = start, jj = arr.length; j < jj; ++j) { + if (arr[j] !== value) { + return j; + } + } + return j; +} + +function setValues(arr, start, end, value) { + for 
(let j = start; j < end; ++j) { + arr[j] = value; + } +} + +function reverseValues(arr, start, end) { + for (let i = start, j = end - 1; i < j; ++i, --j) { + let temp = arr[i]; + arr[i] = arr[j]; + arr[j] = temp; + } +} + +function createBidiText(chars, isLTR, vertical = false) { + let dir = "ltr"; + if (vertical) { + dir = "ttb"; + } + else if (!isLTR) { + dir = "rtl"; + } + return { chars, dir }; +} + +// These are used in bidi(), which is called frequently. We re-use them on +// each call to avoid unnecessary allocations. +let types = []; + +function bidi(chars, startLevel = -1, vertical = false) { + let isLTR = true; + let strLength = chars.length; + if (strLength === 0 || vertical) { + return createBidiText(chars, isLTR, vertical); + } + + // Get types and fill arrays + types.length = strLength; + let numBidi = 0; + + let i, ii; + for (i = 0; i < strLength; ++i) { + + let charCode = chars[i].c.charCodeAt(0); + let charType = "L"; + if (charCode <= 0x00ff) { + charType = baseTypes[charCode]; + } + else if (0x0590 <= charCode && charCode <= 0x05f4) { + charType = "R"; + } + else if (0x0600 <= charCode && charCode <= 0x06ff) { + charType = arabicTypes[charCode & 0xff]; + if (!charType) { + console.log("Bidi: invalid Unicode character " + charCode.toString(16)); + } + } + else if (0x0700 <= charCode && charCode <= 0x08ac) { + charType = "AL"; + } + if (charType === "R" || charType === "AL" || charType === "AN") { + numBidi++; + } + types[i] = charType; + } + + // Detect the bidi method + // - If there are no rtl characters then no bidi needed + // - If less than 30% chars are rtl then string is primarily ltr, + // unless the string is very short. 
+ // - If more than 30% chars are rtl then string is primarily rtl + if (numBidi === 0) { + isLTR = true; + return createBidiText(chars, isLTR); + } + + if (startLevel === -1) { + if (numBidi / strLength < 0.3 && strLength > 4) { + isLTR = true; + startLevel = 0; + } + else { + isLTR = false; + startLevel = 1; + } + } + + let levels = []; + for (i = 0; i < strLength; ++i) { + levels[i] = startLevel; + } + + /* + X1-X10: skip most of this, since we are NOT doing the embeddings. + */ + let e = isOdd(startLevel) ? "R" : "L"; + let sor = e; + let eor = sor; + + /* + W1. Examine each non-spacing mark (NSM) in the level run, and change the + type of the NSM to the type of the previous character. If the NSM is at the + start of the level run, it will get the type of sor. + */ + let lastType = sor; + for (i = 0; i < strLength; ++i) { + if (types[i] === "NSM") { + types[i] = lastType; + } + else { + lastType = types[i]; + } + } + + /* + W2. Search backwards from each instance of a European number until the + first strong type (R, L, AL, or sor) is found. If an AL is found, change + the type of the European number to Arabic number. + */ + lastType = sor; + let t; + for (i = 0; i < strLength; ++i) { + t = types[i]; + if (t === "EN") { + types[i] = lastType === "AL" ? "AN" : "EN"; + } + else if (t === "R" || t === "L" || t === "AL") { + lastType = t; + } + } + + /* + W3. Change all ALs to R. + */ + for (i = 0; i < strLength; ++i) { + t = types[i]; + if (t === "AL") { + types[i] = "R"; + } + } + + /* + W4. A single European separator between two European numbers changes to a + European number. 
A single common separator between two numbers of the same + type changes to that type: + */ + for (i = 1; i < strLength - 1; ++i) { + if (types[i] === "ES" && types[i - 1] === "EN" && types[i + 1] === "EN") { + types[i] = "EN"; + } + if ( + types[i] === "CS" && + (types[i - 1] === "EN" || types[i - 1] === "AN") && + types[i + 1] === types[i - 1] + ) { + types[i] = types[i - 1]; + } + } + + /* + W5. A sequence of European terminators adjacent to European numbers changes + to all European numbers: + */ + for (i = 0; i < strLength; ++i) { + if (types[i] === "EN") { + // do before + for (let j = i - 1; j >= 0; --j) { + if (types[j] !== "ET") { + break; + } + types[j] = "EN"; + } + // do after + for (let j = i + 1; j < strLength; ++j) { + if (types[j] !== "ET") { + break; + } + types[j] = "EN"; + } + } + } + + /* + W6. Otherwise, separators and terminators change to Other Neutral: + */ + for (i = 0; i < strLength; ++i) { + t = types[i]; + if (t === "WS" || t === "ES" || t === "ET" || t === "CS") { + types[i] = "ON"; + } + } + + /* + W7. Search backwards from each instance of a European number until the + first strong type (R, L, or sor) is found. If an L is found, then change + the type of the European number to L. + */ + lastType = sor; + for (i = 0; i < strLength; ++i) { + t = types[i]; + if (t === "EN") { + types[i] = lastType === "L" ? "L" : "EN"; + } + else if (t === "R" || t === "L") { + lastType = t; + } + } + + /* + N1. A sequence of neutrals takes the direction of the surrounding strong + text if the text on both sides has the same direction. European and Arabic + numbers are treated as though they were R. Start-of-level-run (sor) and + end-of-level-run (eor) are used at level run boundaries. 
+ */ + for (i = 0; i < strLength; ++i) { + if (types[i] === "ON") { + let end = findUnequal(types, i + 1, "ON"); + let before = sor; + if (i > 0) { + before = types[i - 1]; + } + + let after = eor; + if (end + 1 < strLength) { + after = types[end + 1]; + } + if (before !== "L") { + before = "R"; + } + if (after !== "L") { + after = "R"; + } + if (before === after) { + setValues(types, i, end, before); + } + i = end - 1; // reset to end (-1 so next iteration is ok) + } + } + + /* + N2. Any remaining neutrals take the embedding direction. + */ + for (i = 0; i < strLength; ++i) { + if (types[i] === "ON") { + types[i] = e; + } + } + + /* + I1. For all characters with an even (left-to-right) embedding direction, + those of type R go up one level and those of type AN or EN go up two + levels. + I2. For all characters with an odd (right-to-left) embedding direction, + those of type L, EN or AN go up one level. + */ + for (i = 0; i < strLength; ++i) { + t = types[i]; + if (isEven(levels[i])) { + if (t === "R") { + levels[i] += 1; + } + else if (t === "AN" || t === "EN") { + levels[i] += 2; + } + } + else { + // isOdd + if (t === "L" || t === "AN" || t === "EN") { + levels[i] += 1; + } + } + } + + /* + L1. On each line, reset the embedding level of the following characters to + the paragraph embedding level: + + segment separators, + paragraph separators, + any sequence of whitespace characters preceding a segment separator or + paragraph separator, and any sequence of white space characters at the end + of the line. + */ + + // don't bother as text is only single line + + /* + L2. From the highest level found in the text to the lowest odd level on + each line, reverse any contiguous sequence of characters that are at that + level or higher. 
+ */ + + // find highest level & lowest odd level + let highestLevel = -1; + let lowestOddLevel = 99; + let level; + for (i = 0, ii = levels.length; i < ii; ++i) { + level = levels[i]; + if (highestLevel < level) { + highestLevel = level; + } + if (lowestOddLevel > level && isOdd(level)) { + lowestOddLevel = level; + } + } + + // now reverse between those limits + for (level = highestLevel; level >= lowestOddLevel; --level) { + // find segments to reverse + let start = -1; + for (i = 0, ii = levels.length; i < ii; ++i) { + if (levels[i] < level) { + if (start >= 0) { + reverseValues(chars, start, i); + start = -1; + } + } + else if (start < 0) { + start = i; + } + } + if (start >= 0) { + reverseValues(chars, start, levels.length); + } + } + + /* + L3. Combining marks applied to a right-to-left base character will at this + point precede their base character. If the rendering engine expects them to + follow the base characters in the final display process, then the ordering + of the marks and the base character must be reversed. + */ + + // don't bother for now + + /* + L4. A character that possesses the mirrored property as specified by + Section 4.7, Mirrored, must be depicted by a mirrored glyph if the resolved + directionality of that character is R. 
/**
 * Tell whether a character belongs to a right-to-left class.
 *
 * Only the first UTF-16 code unit of `char` is inspected. Latin-1 and Arabic
 * code points are classified through the `baseTypes` / `arabicTypes` lookup
 * tables declared elsewhere in this file.
 *
 * @param {string} char - The character to classify.
 * @returns {boolean} `true` when the resolved type is "R", "AL" or "AN".
 */
function isRTL(char) {
  const charCode = char.charCodeAt(0);
  let charType = "L";
  if (charCode <= 0x00ff) {
    charType = baseTypes[charCode];
  } else if (0x0590 <= charCode && charCode <= 0x05f4) {
    charType = "R";
  } else if (0x0600 <= charCode && charCode <= 0x06ff) {
    charType = arabicTypes[charCode & 0xff];
    if (!charType) {
      console.log("Bidi: invalid Unicode character " + charCode.toString(16));
    }
  } else if (0x0700 <= charCode && charCode <= 0x08ac) {
    charType = "AL";
  }
  return charType === "R" || charType === "AL" || charType === "AN";
}

// *** bidi.js ends here ***

// The function is adapted from Xpdf https://www.xpdfreader.com/opensource.html
// Original copyright: 1996-2019 Glyph & Cog, LLC.
/**
 * Compute the inter-word spacing threshold for one line of chars. Gaps larger
 * than the returned value are treated as spaces between words.
 *
 * @param {Array<Object>} chars - Chars of a single line, in visual order. Each
 *   one is read for `fontSize`, `rect`, `rotation` and the optional
 *   `spaceAfter` flag.
 * @returns {number} The gap threshold, in the same units as `rect`.
 */
function computeWordSpacingThreshold(chars) {
  // Without characters there is nothing to measure. Return what the
  // uniform-spacing branch yields for a lone character instead of dividing
  // by zero and handing NaN to the caller.
  if (!chars.length) {
    return 1;
  }

  // Inter-character spacing that varies by less than this multiple of
  // font size is assumed to be equivalent.
  const uniformSpacing = 0.07;
  // Typical word spacing, as a fraction of font size. This will be
  // added to the minimum inter-character spacing, to account for wide
  // character spacing.
  const wordSpacing = 0.1;

  let fontSizeSum = 0;
  let minGap = 0;
  let maxGap = 0;
  // "min > max" is the sentinel meaning that no gap of that kind has been
  // recorded yet; it is tested again after the loop.
  let minAdjGap = 1;
  let maxAdjGap = 0;
  let minSpGap = 1;
  let maxSpGap = 0;

  for (let i = 0; i < chars.length; ++i) {
    const char = chars[i];
    fontSizeSum += char.fontSize;
    if (i >= chars.length - 1) {
      continue;
    }
    const gap = getSpaceBetweenChars(char, chars[i + 1]);
    if (char.spaceAfter) {
      // Gap that follows a char flagged as having a space after it.
      if (minSpGap > maxSpGap) {
        minSpGap = maxSpGap = gap;
      } else if (gap < minSpGap) {
        minSpGap = gap;
      } else if (gap > maxSpGap) {
        maxSpGap = gap;
      }
    } else if (minAdjGap > maxAdjGap) {
      minAdjGap = maxAdjGap = gap;
    } else if (gap < minAdjGap) {
      minAdjGap = gap;
    } else if (gap > maxAdjGap) {
      maxAdjGap = gap;
    }
    if (i === 0 || gap < minGap) {
      minGap = gap;
    }
    if (gap > maxGap) {
      maxGap = gap;
    }
  }

  const avgFontSize = fontSizeSum / chars.length;
  if (minGap < 0) {
    minGap = 0;
  }

  const variation = maxGap - minGap;
  const haveBothKinds = minAdjGap <= maxAdjGap && minSpGap <= maxSpGap;

  // If spacing is nearly uniform (minGap is close to maxGap), use the
  // SpGap/AdjGap values if available, otherwise assume it's a single
  // word (technically it could be either "ABC" or "A B C", but it's
  // essentially impossible to tell).
  if (variation < uniformSpacing * avgFontSize) {
    if (haveBothKinds && minSpGap - maxAdjGap > 0.01) {
      return 0.5 * (maxAdjGap + minSpGap);
    }
    return maxGap + 1;
  }

  // If there is some variation in spacing, but it's small, assume
  // there are some inter-word spaces.
  if (variation < wordSpacing * avgFontSize) {
    return 0.5 * (minGap + maxGap);
  }

  // If there is a large variation in spacing, use the SpGap/AdjGap
  // values if they look reasonable, otherwise, assume a reasonable
  // threshold for inter-word spacing (we can't use something like
  // 0.5*(minGap+maxGap) here because there can be outliers at the
  // high end).
  if (haveBothKinds && minSpGap - maxAdjGap > uniformSpacing * avgFontSize) {
    const gap = wordSpacing * avgFontSize;
    const gap2 = 0.5 * (minSpGap - minGap);
    return minGap + (gap < gap2 ? gap : gap2);
  }
  return minGap + wordSpacing * avgFontSize;
}

/**
 * Distance between the trailing edge of `char` and the leading edge of
 * `char2`, measured along the writing direction given by `char.rotation`.
 *
 * @param {Object} char - Left-hand char; `rotation` and `rect` are read.
 * @param {Object} char2 - Right-hand char; only `rect` is read.
 * @returns {number} The gap. Always a number: 0 is returned for a zero gap,
 *   for a rotation other than a falsy value, 90, 180 or 270, and for a gap
 *   that is not a number.
 */
function getSpaceBetweenChars(char, char2) {
  const { rotation } = char;
  let gap = 0;
  if (!rotation) {
    gap = char2.rect[0] - char.rect[2];
  } else if (rotation === 90) {
    gap = char2.rect[1] - char.rect[3];
  } else if (rotation === 180) {
    gap = char.rect[0] - char2.rect[2];
  } else if (rotation === 270) {
    gap = char.rect[1] - char2.rect[3];
  }
  // NaN collapses to 0 so callers can always do arithmetic on the result.
  return gap || 0;
}

/**
 * Check whether two rects overlap on the axis perpendicular to the writing
 * direction, i.e. whether they can belong to the same line.
 *
 * @param {Array<number>} rect1
 * @param {Array<number>} rect2
 * @param {number} rotation - 0 or 180 compares the [1]..[3] extents; any
 *   other value compares the [0]..[2] extents.
 * @returns {boolean}
 */
function overlaps(rect1, rect2, rotation) {
  if ([0, 180].includes(rotation)) {
    return (
      rect1[1] <= rect2[1] && rect2[1] <= rect1[3]
      || rect2[1] <= rect1[1] && rect1[1] <= rect2[3]
    );
  }
  return (
    rect1[0] <= rect2[0] && rect2[0] <= rect1[2]
    || rect2[0] <= rect1[0] && rect1[0] <= rect2[2]
  );
}

/**
 * @param {string} c - Text to test.
 * @returns {boolean} `true` when `c` contains one of the listed dash or
 *   hyphen characters.
 */
function isDash(c) {
  const re = /[\x2D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]/;
  return re.test(c);
}

/**
 * Extent of a char perpendicular to its writing direction.
 *
 * @param {Object} char - `rotation` and `rect` are read.
 * @returns {number} The height. Always a number: 0 is returned when the
 *   rotation is not one of 0, 90, 180, 270 or the extent is not a number.
 */
function charHeight(char) {
  let height = 0;
  if ([0, 180].includes(char.rotation)) {
    height = char.rect[3] - char.rect[1];
  } else if ([90, 270].includes(char.rotation)) {
    height = char.rect[2] - char.rect[0];
  }
  return height || 0;
}

/**
 * Smallest rect enclosing `objs[from]` .. `objs[to]` (both inclusive).
 *
 * Done in a single pass without spreading the range into `Math.min` /
 * `Math.max`, so arbitrarily long ranges cannot hit the engine's limit on the
 * number of call arguments.
 *
 * @param {Array<{rect: Array<number>}>} objs
 * @param {number} from - First index, with `Array.prototype.slice` semantics.
 * @param {number} to - Last index, inclusive.
 * @returns {Array<number>} `[min0, min1, max2, max3]`; for an empty range
 *   this is `[Infinity, Infinity, -Infinity, -Infinity]`.
 */
function getBoundingRect(objs, from, to) {
  let min0 = Infinity;
  let min1 = Infinity;
  let max2 = -Infinity;
  let max3 = -Infinity;
  for (const { rect } of objs.slice(from, to + 1)) {
    min0 = Math.min(min0, rect[0]);
    min1 = Math.min(min1, rect[1]);
    max2 = Math.max(max2, rect[2]);
    max3 = Math.max(max3, rect[3]);
  }
  return [min0, min1, max2, max3];
}

/**
 * @param {Array<number>} rect
 * @returns {Array<number>} A copy with every value rounded to 3 decimals.
 */
function roundRect(rect) {
  return rect.map(n => Math.round(n * 1000) / 1000);
}

/**
 * Group a page's chars into paragraphs, lines and words.
 *
 * `chars` is reordered in place: the chars of every detected line are sorted
 * into visual order and passed through `bidi()`.
 *
 * @param {Array<Object>} chars - Chars in content-stream order. The fields
 *   read here are `c`, `rect`, `rotation`, `baseline`, `fontSize` and
 *   `fontName`.
 * @param {*} [reflowRTL] - Accepted for interface compatibility; not used.
 * @returns {Array<Object>} Paragraphs shaped as
 *   `{ rect, lines: [{ rect, hyphenated, words: [{ rect, chars, spaceAfter }] }] }`.
 */
function split(chars, reflowRTL) {
  if (!chars.length) {
    return [];
  }

  const hasRTL = chars.some(char => isRTL(char.c));

  // 1. Line breaks. `lineStart` is the index of the first char of the line
  // currently being scanned. It starts at 0 so that a drop cap opening the
  // very first line is recognised too.
  let lineBreaks = [];
  let lineStart = 0;
  for (let i = 1; i < chars.length; i++) {
    const char = chars[i - 1];
    const char2 = chars[i];
    const baselineChanged = Math.abs(char.baseline - char2.baseline) > 0.01;
    if (
      // Caret jumps to the next line start for non-RTL text and baseline
      // isn't the same (characters can sometimes even jump back in the same
      // line)
      !hasRTL && baselineChanged && (
        !char2.rotation && char.rect[0] - 10 > char2.rect[0]
        || char2.rotation === 90 && char.rect[1] > char2.rect[1]
        || char2.rotation === 180 && char.rect[0] < char2.rect[0]
        || char2.rotation === 270 && char.rect[1] < char2.rect[1]
      )
      || hasRTL && baselineChanged
      // Rotation changes
      || char.rotation !== char2.rotation
      // Chars aren't in the same line
      || !overlaps(char.rect, char2.rect, char2.rotation)
      // Line's first char is more than 2x larger than the following char,
      // to put drop cap into a separate line
      || lineStart === i - 1 && charHeight(char) > charHeight(char2) * 2
    ) {
      lineBreaks.push(i);
      lineStart = i;
    }
  }

  lineBreaks = [0, ...lineBreaks, chars.length];

  // 2. Sort characters in lines by their visual order. That fixes some RTL
  // lines and weird cases when caret jumps back in the same line for LTR text
  for (let i = 0; i < lineBreaks.length - 1; i++) {
    const from = lineBreaks[i];
    const to = lineBreaks[i + 1] - 1;
    const lineChars = chars.slice(from, to + 1);
    lineChars.sort((a, b) => {
      const { rotation } = a;
      const x1 = a.rect[0] + (a.rect[2] - a.rect[0]) / 2;
      const x2 = b.rect[0] + (b.rect[2] - b.rect[0]) / 2;
      const y1 = a.rect[1] + (a.rect[3] - a.rect[1]) / 2;
      const y2 = b.rect[1] + (b.rect[3] - b.rect[1]) / 2;

      return !rotation && x1 - x2
        || rotation === 90 && y1 - y2
        || rotation === 180 && x2 - x1
        || rotation === 270 && y2 - y1
        || 0;
    });
    // bidi() reorders `lineChars` in place; its return value is not needed.
    bidi(lineChars, -1, false);
    chars.splice(from, to - from + 1, ...lineChars);
  }

  // 3. Word breaks, plus extra line breaks at unusually wide spaces.
  const punctuation = '?.,;!¡¿。、·(){}[]/$:';
  const extraLineBreaks = [];
  let wordBreaks = [];
  // Indexes at which a word is followed by a real space.
  const wordSpaces = new Set();
  for (let i = 0; i < lineBreaks.length - 1; i++) {
    const from = lineBreaks[i];
    const to = lineBreaks[i + 1] - 1;
    const wordSp = computeWordSpacingThreshold(chars.slice(from, to + 1));
    const spaces = [];
    for (let j = from + 1; j <= to; j++) {
      const char = chars[j - 1];
      const char2 = chars[j];

      const rtl = isRTL(char.c) && isRTL(char2.c);
      const sp = rtl
        ? (char.rect[0] - char2.rect[2])
        : getSpaceBetweenChars(char, char2);
      if (sp > wordSp || sp < -char.fontSize) {
        wordSpaces.add(j);
        wordBreaks.push(j);
        spaces.push({ index: j, width: sp });
        continue;
      }

      // No visible space, but the styling or a punctuation mark still
      // separates two words.
      if (
        char.fontName !== char2.fontName
        || Math.abs(char.fontSize - char2.fontSize) > 0.01
        || Math.abs(char.baseline - char2.baseline) > 0.01
        || punctuation.includes(char.c) || punctuation.includes(char2.c)
      ) {
        wordBreaks.push(j);
      }
    }
    if (to < chars.length - 1) {
      wordBreaks.push(to + 1);
    }
    let min = Infinity;
    for (const space of spaces) {
      min = Math.min(min, space.width);
    }
    for (const space of spaces) {
      if (space.width > min * 10 && space.width > 10) {
        extraLineBreaks.push(space.index);
      }
    }
  }
  wordBreaks = [0, ...wordBreaks, chars.length];
  lineBreaks = [...lineBreaks, ...extraLineBreaks].sort((a, b) => a - b);

  // 4. Paragraph breaks between consecutive lines.
  let paragraphBreaks = [];
  for (let i = 1; i < lineBreaks.length - 1; i++) {
    const previousRect = getBoundingRect(chars, lineBreaks[i - 1], lineBreaks[i] - 1);
    const currentRect = getBoundingRect(chars, lineBreaks[i], lineBreaks[i + 1] - 1);

    const lineSpacing = previousRect[1] - currentRect[3];

    const previousLineHeight = previousRect[3] - previousRect[1];
    const currentLineHeight = currentRect[3] - currentRect[1];

    if (
      // The lines shouldn't be in the same row
      !(previousRect[1] > currentRect[3])
      || Math.abs(previousLineHeight - currentLineHeight) > 0.1
      || lineSpacing > previousLineHeight
    ) {
      paragraphBreaks.push(lineBreaks[i]);
    }
  }

  paragraphBreaks = [0, ...paragraphBreaks, chars.length];

  // 5. Assemble the paragraph -> line -> word tree from the break lists.
  const paragraphs = [];
  for (let p = 0; p < paragraphBreaks.length - 1; p++) {
    const paragraphStart = paragraphBreaks[p];
    const paragraphEnd = paragraphBreaks[p + 1];
    const lines = [];
    for (let l = 0; l < lineBreaks.length - 1; l++) {
      if (lineBreaks[l] < paragraphStart || lineBreaks[l + 1] > paragraphEnd) {
        continue;
      }
      const lineFrom = lineBreaks[l];
      const lineEnd = lineBreaks[l + 1];
      const words = [];
      for (let w = 0; w < wordBreaks.length - 1; w++) {
        if (wordBreaks[w] < lineFrom || wordBreaks[w + 1] > lineEnd) {
          continue;
        }
        const wordStart = wordBreaks[w];
        const wordEnd = wordBreaks[w + 1];
        words.push({
          rect: getBoundingRect(chars, wordStart, wordEnd - 1),
          chars: chars.slice(wordStart, wordEnd),
          spaceAfter: wordSpaces.has(wordEnd),
        });
      }
      lines.push({
        rect: getBoundingRect(words, 0, words.length - 1),
        hyphenated: isDash(words.at(-1).chars.at(-1).c),
        words,
      });
    }
    paragraphs.push({
      rect: getBoundingRect(lines, 0, lines.length - 1),
      lines,
    });
  }
  return paragraphs;
}

/**
 * Flatten a paragraph into plain text: a space after every word flagged
 * `spaceAfter`, and a newline after every line.
 *
 * @param {Object} paragraph - A paragraph as produced by `split()`.
 * @returns {string}
 */
function getParagraphText(paragraph) {
  const text = [];
  for (const line of paragraph.lines) {
    for (const word of line.words) {
      for (const char of word.chars) {
        text.push(char.c);
      }
      if (word.spaceAfter) {
        text.push(' ');
      }
    }
    text.push('\n');
  }
  return text.join('');
}

/**
 * Reorder paragraphs by clustering those whose rects overlap (as tested by
 * `overlaps(..., 0)`) and then ordering the clusters.
 *
 * @param {Array<{rect: Array<number>}>} paragraphs
 * @returns {Array<Object>} A new array holding the same paragraph objects.
 */
function sortParagraphs(paragraphs) {
  // First-fit clustering: a paragraph joins the first cluster holding a
  // paragraph it overlaps with. Indexes are appended in increasing order,
  // so every cluster stays sorted without re-sorting.
  const clusters = [];
  for (let i = 0; i < paragraphs.length; i++) {
    const rect1 = paragraphs[i].rect;
    let addedToCluster = false;
    for (let j = 0; j < clusters.length && !addedToCluster; j++) {
      const cluster = clusters[j];
      for (const idx of cluster) {
        if (overlaps(rect1, paragraphs[idx].rect, 0)) {
          cluster.push(i);
          addedToCluster = true;
          break;
        }
      }
    }
    if (!addedToCluster) {
      clusters.push([i]);
    }
  }

  // Merge clusters whose index ranges interleave, until nothing changes.
  let merged = true;
  while (merged) {
    merged = false;
    for (let i = 0; i < clusters.length - 1; i++) {
      for (let j = i + 1; j < clusters.length; j++) {
        const [min1, max1] = [clusters[i][0], clusters[i][clusters[i].length - 1]];
        const [min2, max2] = [clusters[j][0], clusters[j][clusters[j].length - 1]];
        if ((min1 >= min2 && min1 <= max2) || (min2 >= min1 && min2 <= max1)) {
          clusters[i] = [...new Set([...clusters[i], ...clusters[j]])].sort((a, b) => a - b);
          clusters.splice(j, 1);
          merged = true;
          break;
        }
      }
    }
  }

  // Order clusters by descending key.
  // NOTE(review): the key is the SMALLEST rect[3] found in the cluster;
  // confirm that the largest one was not intended.
  const clusterKey = cluster =>
    Math.min(...cluster.map(index => paragraphs[index].rect[3]));
  const sortedClusters = clusters.sort((clusterA, clusterB) => {
    return clusterKey(clusterB) - clusterKey(clusterA);
  });

  return sortedClusters.reduce((result, cluster) => {
    return result.concat(cluster.map(index => paragraphs[index]));
  }, []);
}

/**
 * Entry point of this module: build the paragraph structure for a page.
 *
 * @param {Array<Object>} chars - Page chars; reordered in place by `split()`.
 * @returns {Array<Object>} Paragraphs in the order `split()` found them;
 *   `sortParagraphs()` is currently not applied.
 */
export function getParagraphs(chars) {
  return split(chars);
}
this._transport.getOutline(); } + getPageData(data) { + return this._transport.messageHandler.sendWithPromise("GetPageData", data); + } + + getOutline2(data) { + return this._transport.messageHandler.sendWithPromise("GetOutline2", data); + } + /** * @returns {Promise} A promise that is resolved with * an {@link OptionalContentConfig} that contains all the optional content diff --git a/web/annotation_layer_builder.js b/web/annotation_layer_builder.js index a0fb1ce52071e..2cdf3e92b259e 100644 --- a/web/annotation_layer_builder.js +++ b/web/annotation_layer_builder.js @@ -105,7 +105,7 @@ class AnnotationLayerBuilder { return; } - const [annotations, hasJSActions, fieldObjects] = await Promise.all([ + let [annotations, hasJSActions, fieldObjects] = await Promise.all([ this.pdfPage.getAnnotations({ intent }), this._hasJSActionsPromise, this._fieldObjectsPromise, @@ -114,6 +114,21 @@ class AnnotationLayerBuilder { return; } + const allowedSubtypes = [ + 'Link', + 'Widget', + 'Line', + 'Circle', + 'PolyLine', + 'Polygon', + 'Caret', + 'Squiggly', + 'StrikeOut', + 'Stamp' + ]; + annotations = annotations.filter(x => allowedSubtypes.includes(x.subtype) + || ['Square', 'Ink', 'FreeText'].includes(x.subtype) && !x.isZotero); + // Create an annotation layer div and render the annotations // if there is at least one annotation. 
const div = (this.div = document.createElement("div")); diff --git a/web/app.js b/web/app.js index ee6001ef42152..e755ff3c9ca88 100644 --- a/web/app.js +++ b/web/app.js @@ -78,7 +78,7 @@ import { Toolbar } from "web-toolbar"; import { ViewHistory } from "./view_history.js"; const FORCE_PAGES_LOADED_TIMEOUT = 10000; // ms -const WHEEL_ZOOM_DISABLED_TIMEOUT = 1000; // ms +const WHEEL_ZOOM_DISABLED_TIMEOUT = 20; // ms const ViewOnLoad = { UNKNOWN: -1, @@ -982,7 +982,7 @@ const PDFViewerApplication = { const loadingTask = getDocument(params); this.pdfLoadingTask = loadingTask; - loadingTask.onPassword = (updateCallback, reason) => { + loadingTask.onPassword = this.onPassword || ((updateCallback, reason) => { if (this.isViewerEmbedded) { // The load event can't be triggered until the password is entered, so // if the viewer is in an iframe and its visibility depends on the @@ -993,12 +993,13 @@ const PDFViewerApplication = { this.pdfLinkService.externalLinkEnabled = false; this.passwordPrompt.setUpdateCallback(updateCallback, reason); this.passwordPrompt.open(); - }; + }); loadingTask.onProgress = ({ loaded, total }) => { this.progress(loaded / total); }; + await this.initializedPromise; return loadingTask.promise.then( pdfDocument => { this.load(pdfDocument); @@ -1177,9 +1178,9 @@ const PDFViewerApplication = { this.downloadComplete = true; this.loadingBar?.hide(); - firstPagePromise.then(() => { - this.eventBus.dispatch("documentloaded", { source: this }); - }); + // firstPagePromise.then(() => { + // this.eventBus.dispatch("documentloaded", { source: this }); + // }); }); // Since the `setInitialView` call below depends on this being resolved, @@ -1214,6 +1215,10 @@ const PDFViewerApplication = { pdfViewer.setDocument(pdfDocument); const { firstPagePromise, onePageRendered, pagesPromise } = pdfViewer; + firstPagePromise.then(() => { + this.eventBus.dispatch("documentloaded", { source: this }); + }); + this.pdfThumbnailViewer?.setDocument(pdfDocument); const 
storedPromise = (this.store = new ViewHistory( diff --git a/web/chromecom.js b/web/chromecom.js index 60a83846c5d0f..f1bf1d3d944d1 100644 --- a/web/chromecom.js +++ b/web/chromecom.js @@ -258,10 +258,10 @@ if (window === top) { // If the runtime is still available, the unload is most likely a normal // tab closure. Otherwise it is most likely an extension reload. if (!isRuntimeAvailable()) { - localStorage.setItem( - "unload-" + Date.now() + "-" + document.hidden + "-" + location.href, - JSON.stringify(history.state) - ); + // localStorage.setItem( + // "unload-" + Date.now() + "-" + document.hidden + "-" + location.href, + // JSON.stringify(history.state) + // ); } }); } diff --git a/web/genericcom.js b/web/genericcom.js index 6d5120e9be243..9f82038f4b5f9 100644 --- a/web/genericcom.js +++ b/web/genericcom.js @@ -29,11 +29,11 @@ const GenericCom = {}; class GenericPreferences extends BasePreferences { async _writeToStorage(prefObj) { - localStorage.setItem("pdfjs.preferences", JSON.stringify(prefObj)); + // localStorage.setItem("pdfjs.preferences", JSON.stringify(prefObj)); } async _readFromStorage(prefObj) { - return JSON.parse(localStorage.getItem("pdfjs.preferences")); + // return JSON.parse(localStorage.getItem("pdfjs.preferences")); } } diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 201e716127cdb..647e6a59df060 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -29,7 +29,7 @@ const FindState = { }; const FIND_TIMEOUT = 250; // ms -const MATCH_SCROLL_OFFSET_TOP = -50; // px +const MATCH_SCROLL_OFFSET_TOP = -120; // px const MATCH_SCROLL_OFFSET_LEFT = -400; // px const CHARACTERS_TO_NORMALIZE = { diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index 4825d1ef75b30..fdb768c28617f 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -289,6 +289,7 @@ class PDFPageView { } destroy() { + window.onDetachPage && window.onDetachPage(this); this.reset(); this.pdfPage?.cleanup(); } @@ -928,6 +929,7 
@@ class PDFPageView { this.hasRestrictedScaling = false; } } + this.currentCanvasWidth = width * outputScale.sx; const sfx = approximateFraction(outputScale.sx); const sfy = approximateFraction(outputScale.sy); @@ -956,9 +958,11 @@ class PDFPageView { const renderTask = (this.renderTask = this.pdfPage.render(renderContext)); renderTask.onContinue = renderContinueCallback; + let that = this; const resultPromise = renderTask.promise.then( async () => { showCanvas?.(true); + window.onAttachPage && window.onAttachPage(that); await this.#finishRenderTask(renderTask); this.#renderTextLayer(); diff --git a/web/pdf_print_service.js b/web/pdf_print_service.js index 52af95969963b..5594bc1596fca 100644 --- a/web/pdf_print_service.js +++ b/web/pdf_print_service.js @@ -302,6 +302,7 @@ function renderProgress(index, total, l10n) { window.addEventListener( "keydown", function (event) { + return; // Intercept Cmd/Ctrl + P in all browsers. // Also intercept Cmd/Ctrl + Shift + P in Chrome and Opera if (