From 4545ce24cdc1f53073b7350981f7f433d14b25ef Mon Sep 17 00:00:00 2001 From: Timothy Carambat Date: Fri, 14 Feb 2025 17:38:13 -0800 Subject: [PATCH] Drop Node `canvas` for manual `sharp` conversion (#3221) * Drop Node `canvas` for manual `sharp` conversion * bump dev --- .github/workflows/dev-build.yaml | 2 +- collector/package.json | 3 +- collector/utils/OCRLoader/CanvasFactory.js | 52 --------- collector/utils/OCRLoader/index.js | 126 +++++++++++++++------ collector/yarn.lock | 37 +----- 5 files changed, 94 insertions(+), 126 deletions(-) delete mode 100644 collector/utils/OCRLoader/CanvasFactory.js diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml index 433643ae46..7873053377 100644 --- a/.github/workflows/dev-build.yaml +++ b/.github/workflows/dev-build.yaml @@ -6,7 +6,7 @@ concurrency: on: push: - branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only. + branches: ['sharp-pdf-image-converter'] # put your current branch to create a build. Core team only. paths-ignore: - '**.md' - 'cloud-deployments/*' diff --git a/collector/package.json b/collector/package.json index b67951df32..7de9338ab0 100644 --- a/collector/package.json +++ b/collector/package.json @@ -19,7 +19,6 @@ "@xenova/transformers": "^2.11.0", "bcrypt": "^5.1.0", "body-parser": "^1.20.2", - "canvas": "^2.11.2", "cors": "^2.8.5", "dotenv": "^16.0.3", "epub2": "^3.0.2", @@ -52,4 +51,4 @@ "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} \ No newline at end of file +} diff --git a/collector/utils/OCRLoader/CanvasFactory.js b/collector/utils/OCRLoader/CanvasFactory.js deleted file mode 100644 index 067917e51b..0000000000 --- a/collector/utils/OCRLoader/CanvasFactory.js +++ /dev/null @@ -1,52 +0,0 @@ -/** - * This is a factory for creating a canvas and context in Node.js - * it is used to create a canvas and context for the PDFLoader for turning the PDF into an image - * so we can later use the image to extract text from the PDF. - */ -class NodeCanvasFactory { - constructor() { - this.CanvasModule = null; - } - - async init() { - this.CanvasModule = await import("canvas"); - this.Image = this.CanvasModule.Image; - } - - /** - * Creates a canvas and context for the PDFLoader - * @param {number} width - The width of the canvas - * @param {number} height - The height of the canvas - * @param {boolean} transparent - Whether the canvas is transparent - * @returns {{canvas: HTMLCanvasElement, context: CanvasRenderingContext2D}} - The canvas and context - */ - create(width, height, transparent = false) { - const canvas = this.CanvasModule.createCanvas(width, height); - const context = canvas.getContext("2d", { alpha: transparent }); - if (transparent) context.clearRect(0, 0, width, height); - return { - canvas, - context, - }; - } - - /** - * Required for the PDFLoader pdfjs interation - do not remove or use directly. - */ - reset(canvasAndContext, width, height) { - canvasAndContext.canvas.width = width; - canvasAndContext.canvas.height = height; - } - - /** - * Required for the PDFLoader pdfjs interation - do not remove or use directly. - */ - destroy(canvasAndContext) { - canvasAndContext.canvas.width = 0; - canvasAndContext.canvas.height = 0; - canvasAndContext.canvas = null; - canvasAndContext.context = null; - } -} - -module.exports = NodeCanvasFactory; diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js index 88ac31e613..45f76506d3 100644 --- a/collector/utils/OCRLoader/index.js +++ b/collector/utils/OCRLoader/index.js @@ -1,7 +1,6 @@ const fs = require("fs"); const os = require("os"); const path = require("path"); -const NodeCanvasFactory = require("./CanvasFactory"); class OCRLoader { constructor() { @@ -38,15 +37,8 @@ class OCRLoader { this.log(`Starting OCR of ${documentTitle}`); const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js"); let buffer = fs.readFileSync(filePath); - const canvasFactory = new NodeCanvasFactory(); - await canvasFactory.init(); - global.Image = canvasFactory.Image; - const pdfDocument = await pdfjs.getDocument({ - data: new Uint8Array(buffer), - canvasFactory, - }).promise; - buffer = null; + const pdfDocument = await pdfjs.getDocument({ data: buffer }); const documents = []; const meta = await pdfDocument.getMetadata().catch(() => null); @@ -60,30 +52,14 @@ class OCRLoader { }, }; - async function getPageAsBuffer(pageNumber, scale = 1) { - let canvas = null; - let context = null; - try { - const page = await pdfDocument.getPage(pageNumber); - const viewport = page.getViewport(scale); - ({ canvas, context } = canvasFactory.create( - viewport.width, - viewport.height - )); - await page.render({ - canvasFactory, - canvasContext: context, - viewport, - }).promise; - return canvas.toBuffer(); - } catch (e) { - this.log(`Error getting page as buffer: ${e.message}`); - return null; - } finally { - canvas = null; - context = null; - } - } + const pdfSharp = new PDFSharp({ + validOps: [ + pdfjs.OPS.paintJpegXObject, + pdfjs.OPS.paintImageXObject, + pdfjs.OPS.paintInlineImageXObject, + ], + }); + await pdfSharp.init(); const { createWorker, OEM } = require("tesseract.js"); const BATCH_SIZE = batchSize; @@ -143,7 +119,9 @@ class OCRLoader { workerIndex + 1 }]\x1b[0m assigned pg${pageNum}` ); - const imageBuffer = await getPageAsBuffer(pageNum, 5); + const page = await pdfDocument.getPage(pageNum); + const imageBuffer = await pdfSharp.pageToBuffer({ page }); + if (!imageBuffer) continue; const { data } = await worker.recognize(imageBuffer, {}, "text"); this.log( `✅ \x1b[34m[Worker ${ @@ -172,7 +150,7 @@ class OCRLoader { await Promise.race([timeoutPromise, processPages()]); } catch (e) { - this.log(`Error: ${e.message}`); + this.log(`Error: ${e.message}`, e.stack); } finally { global.Image = undefined; await Promise.all(workerPool.map((worker) => worker.terminate())); @@ -248,4 +226,82 @@ class OCRLoader { } } +/** + * Converts a PDF page to a buffer using Sharp. + * @param {Object} options - The options for the Sharp PDF page object. + * @param {Object} options.page - The PDFJS page proxy object. + * @returns {Promise} The buffer of the page. + */ +class PDFSharp { + constructor({ validOps = [] } = {}) { + this.sharp = null; + this.validOps = validOps; + } + + log(text, ...args) { + console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args); + } + + async init() { + this.sharp = (await import("sharp")).default; + } + + /** + * Converts a PDF page to a buffer. + * @param {Object} options - The options for the Sharp PDF page object. + * @param {Object} options.page - The PDFJS page proxy object. + * @returns {Promise} The buffer of the page. + */ + async pageToBuffer({ page }) { + if (!this.sharp) await this.init(); + try { + this.log(`Converting page ${page.pageNumber} to image...`); + const ops = await page.getOperatorList(); + const pageImages = ops.fnArray.length; + + for (let i = 0; i < pageImages; i++) { + try { + if (!this.validOps.includes(ops.fnArray[i])) continue; + + const name = ops.argsArray[i][0]; + const img = await page.objs.get(name); + const { width, height } = img; + const size = img.data.length; + const channels = size / width / height; + const targetDPI = 70; + const targetWidth = Math.floor(width * (targetDPI / 72)); + const targetHeight = Math.floor(height * (targetDPI / 72)); + + const image = this.sharp(img.data, { + raw: { width, height, channels }, + density: targetDPI, + }) + .resize({ + width: targetWidth, + height: targetHeight, + fit: "fill", + }) + .withMetadata({ + density: targetDPI, + resolution: targetDPI, + }) + .png(); + + // For debugging purposes + // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`)); + return await image.toBuffer(); + } catch (error) { + this.log(`Iteration error: ${error.message}`, error.stack); + continue; + } + } + this.log(`No valid images found on page ${page.pageNumber}`); + return null; + } catch (error) { + this.log(`Error: ${error.message}`, error.stack); + return null; + } + } +} + module.exports = OCRLoader; diff --git a/collector/yarn.lock b/collector/yarn.lock index 611b395673..df7cf31265 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -280,7 +280,7 @@ "@langchain/core" "~0.1" js-tiktoken "^1.0.11" -"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11": +"@mapbox/node-pre-gyp@^1.0.11": version "1.0.11" resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa" integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ== @@ -793,15 +793,6 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== -canvas@^2.11.2: - version "2.11.2" - resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860" - integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw== - dependencies: - "@mapbox/node-pre-gyp" "^1.0.0" - nan "^2.17.0" - simple-get "^3.0.3" - chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -1057,13 +1048,6 @@ decamelize@1.2.0: resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290" integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA== -decompress-response@^4.2.0: - version "4.2.1" - resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986" - integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw== - dependencies: - mimic-response "^2.0.0" - decompress-response@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc" @@ -2307,11 +2291,6 @@ mime@^3.0.0: resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7" integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A== -mimic-response@^2.0.0: - version "2.1.0" - resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43" - integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA== - mimic-response@^3.1.0: version "3.1.0" resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" @@ -2425,11 +2404,6 @@ mustache@^4.2.0: resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64" integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ== -nan@^2.17.0: - version "2.22.0" - resolved "https://registry.yarnpkg.com/nan/-/nan-2.22.0.tgz#31bc433fc33213c97bad36404bb68063de604de3" - integrity sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw== - napi-build-utils@^1.0.1: version "1.0.2" resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806" @@ -3255,15 +3229,6 @@ simple-concat@^1.0.0: resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f" integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q== -simple-get@^3.0.3: - version "3.1.1" - resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55" - integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA== - dependencies: - decompress-response "^4.2.0" - once "^1.3.1" - simple-concat "^1.0.0" - simple-get@^4.0.0, simple-get@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"