Skip to content

Commit

Permalink
Drop Node canvas for manual sharp conversion (#3221)
Browse files Browse the repository at this point in the history
* Drop Node `canvas` for manual `sharp` conversion

* bump dev
  • Loading branch information
timothycarambat authored Feb 15, 2025
1 parent aba910d commit 4545ce2
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 126 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev-build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ concurrency:

on:
push:
branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
branches: ['sharp-pdf-image-converter'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
Expand Down
3 changes: 1 addition & 2 deletions collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"@xenova/transformers": "^2.11.0",
"bcrypt": "^5.1.0",
"body-parser": "^1.20.2",
"canvas": "^2.11.2",
"cors": "^2.8.5",
"dotenv": "^16.0.3",
"epub2": "^3.0.2",
Expand Down Expand Up @@ -52,4 +51,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
52 changes: 0 additions & 52 deletions collector/utils/OCRLoader/CanvasFactory.js

This file was deleted.

126 changes: 91 additions & 35 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const fs = require("fs");
const os = require("os");
const path = require("path");
const NodeCanvasFactory = require("./CanvasFactory");

class OCRLoader {
constructor() {
Expand Down Expand Up @@ -38,15 +37,8 @@ class OCRLoader {
this.log(`Starting OCR of ${documentTitle}`);
const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
let buffer = fs.readFileSync(filePath);
const canvasFactory = new NodeCanvasFactory();
await canvasFactory.init();
global.Image = canvasFactory.Image;

const pdfDocument = await pdfjs.getDocument({
data: new Uint8Array(buffer),
canvasFactory,
}).promise;
buffer = null;
const pdfDocument = await pdfjs.getDocument({ data: buffer });

const documents = [];
const meta = await pdfDocument.getMetadata().catch(() => null);
Expand All @@ -60,30 +52,14 @@ class OCRLoader {
},
};

async function getPageAsBuffer(pageNumber, scale = 1) {
let canvas = null;
let context = null;
try {
const page = await pdfDocument.getPage(pageNumber);
const viewport = page.getViewport(scale);
({ canvas, context } = canvasFactory.create(
viewport.width,
viewport.height
));
await page.render({
canvasFactory,
canvasContext: context,
viewport,
}).promise;
return canvas.toBuffer();
} catch (e) {
this.log(`Error getting page as buffer: ${e.message}`);
return null;
} finally {
canvas = null;
context = null;
}
}
const pdfSharp = new PDFSharp({
validOps: [
pdfjs.OPS.paintJpegXObject,
pdfjs.OPS.paintImageXObject,
pdfjs.OPS.paintInlineImageXObject,
],
});
await pdfSharp.init();

const { createWorker, OEM } = require("tesseract.js");
const BATCH_SIZE = batchSize;
Expand Down Expand Up @@ -143,7 +119,9 @@ class OCRLoader {
workerIndex + 1
}]\x1b[0m assigned pg${pageNum}`
);
const imageBuffer = await getPageAsBuffer(pageNum, 5);
const page = await pdfDocument.getPage(pageNum);
const imageBuffer = await pdfSharp.pageToBuffer({ page });
if (!imageBuffer) continue;
const { data } = await worker.recognize(imageBuffer, {}, "text");
this.log(
`✅ \x1b[34m[Worker ${
Expand Down Expand Up @@ -172,7 +150,7 @@ class OCRLoader {

await Promise.race([timeoutPromise, processPages()]);
} catch (e) {
this.log(`Error: ${e.message}`);
this.log(`Error: ${e.message}`, e.stack);
} finally {
global.Image = undefined;
await Promise.all(workerPool.map((worker) => worker.terminate()));
Expand Down Expand Up @@ -248,4 +226,82 @@ class OCRLoader {
}
}

/**
 * Turns an embedded image of a PDF page into a PNG buffer using `sharp`.
 *
 * The class does not rasterize the page. It walks the page's operator list,
 * picks the first operator whose code is listed in `validOps`, fetches the
 * object that operator references from `page.objs`, and re-encodes that
 * object's raw pixel data as PNG.
 */
class PDFSharp {
  /**
   * @param {Object} [options]
   * @param {number[]} [options.validOps] - Operator codes (as found in the
   * operator list's `fnArray`) that are treated as image paint operations.
   */
  constructor({ validOps = [] } = {}) {
    this.sharp = null;
    this.validOps = validOps;
  }

  /**
   * Writes a `[PDFSharp]`-prefixed message to stdout.
   * @param {string} text - The message.
   * @param {...any} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args);
  }

  /**
   * Loads `sharp` via dynamic import and stores its default export on the
   * instance. Rejects if the module cannot be imported.
   * @returns {Promise<void>}
   */
  async init() {
    this.sharp = (await import("sharp")).default;
  }

  /**
   * Converts the first usable image found on a PDF page to a PNG buffer.
   * @param {Object} options - The options for the Sharp PDF page object.
   * @param {Object} options.page - The PDFJS page proxy object.
   * @returns {Promise<Buffer|null>} The PNG buffer, or `null` when the page has
   * no image that could be converted or when reading the page failed.
   */
  async pageToBuffer({ page }) {
    if (!this.sharp) await this.init();
    try {
      this.log(`Converting page ${page.pageNumber} to image...`);
      const ops = await page.getOperatorList();
      const opCount = ops.fnArray.length;

      // Loop-invariant output settings. The image is scaled by targetDPI / 72
      // and tagged with targetDPI as its density.
      // NOTE(review): 72 is presumably the PDF default of 72 units per inch - confirm.
      const targetDPI = 70;
      const scale = targetDPI / 72;

      for (let i = 0; i < opCount; i++) {
        try {
          if (!this.validOps.includes(ops.fnArray[i])) continue;

          const name = ops.argsArray[i][0];
          const img = await page.objs.get(name);
          if (!img || !img.data || !(img.width > 0) || !(img.height > 0)) {
            this.log(
              `Skipping image operator #${i}: missing pixel data or dimensions`
            );
            continue;
          }

          const { width, height } = img;
          const size = img.data.length;
          // Bytes per pixel. Raw input for sharp needs a whole number of
          // channels between 1 and 4, so packed or truncated pixel data
          // (which gives a fractional or out-of-range value) is skipped
          // here instead of failing inside sharp.
          const channels = size / width / height;
          if (!Number.isInteger(channels) || channels < 1 || channels > 4) {
            this.log(
              `Skipping image operator #${i}: unsupported pixel layout (${size} bytes for ${width}x${height})`
            );
            continue;
          }

          const targetWidth = Math.floor(width * scale);
          const targetHeight = Math.floor(height * scale);

          const image = this.sharp(img.data, {
            raw: { width, height, channels },
            density: targetDPI,
          })
            .resize({
              width: targetWidth,
              height: targetHeight,
              fit: "fill",
            })
            .withMetadata({
              density: targetDPI,
              resolution: targetDPI,
            })
            .png();

          // For debugging purposes
          // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
          return await image.toBuffer();
        } catch (error) {
          // A failure on one image must not abort the page; try the next operator.
          this.log(`Iteration error: ${error.message}`, error.stack);
        }
      }
      this.log(`No valid images found on page ${page.pageNumber}`);
      return null;
    } catch (error) {
      this.log(`Error: ${error.message}`, error.stack);
      return null;
    }
  }
}

module.exports = OCRLoader;
37 changes: 1 addition & 36 deletions collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@
"@langchain/core" "~0.1"
js-tiktoken "^1.0.11"

"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
"@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
Expand Down Expand Up @@ -793,15 +793,6 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==

canvas@^2.11.2:
version "2.11.2"
resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.0"
nan "^2.17.0"
simple-get "^3.0.3"

chalk@^2.4.2:
version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
Expand Down Expand Up @@ -1057,13 +1048,6 @@ decamelize@1.2.0:
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==

decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"

decompress-response@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
Expand Down Expand Up @@ -2307,11 +2291,6 @@ mime@^3.0.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==

mimic-response@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==

mimic-response@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
Expand Down Expand Up @@ -2425,11 +2404,6 @@ mustache@^4.2.0:
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==

nan@^2.17.0:
version "2.22.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.22.0.tgz#31bc433fc33213c97bad36404bb68063de604de3"
integrity sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==

napi-build-utils@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
Expand Down Expand Up @@ -3255,15 +3229,6 @@ simple-concat@^1.0.0:
resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==

simple-get@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
dependencies:
decompress-response "^4.2.0"
once "^1.3.1"
simple-concat "^1.0.0"

simple-get@^4.0.0, simple-get@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
Expand Down

0 comments on commit 4545ce2

Please sign in to comment.