Skip to content

Commit

Permalink
feat: use office-text-extractor to extract text
Browse files Browse the repository at this point in the history
  • Loading branch information
DemoMacro committed Jul 14, 2023
1 parent e3a876e commit fcb8886
Show file tree
Hide file tree
Showing 8 changed files with 1,968 additions and 655 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "docen",
"name": "docen-edge",
"version": "0.0.0",
"description": "Programmatically and command-line implementation of document formatting, powered by Demo Macro.",
"private": true,
Expand Down
8 changes: 4 additions & 4 deletions packages/docen/bundle.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@ execSync("pnpm prepack", { stdio: "inherit" });
execSync("pnpm ncc build dist/cli.cjs -o dist/ncc", { stdio: "inherit" });

const version = readFileSync("package.json", "utf-8").match(
/"version": "(.*?)"/
/"version": "(.*?)"/,
)[1];

execSync(`cp dist/ncc/index.cjs dist/ncc/docen-${version}.cjs`, {
execSync(`mv dist/ncc/index.cjs dist/ncc/docen-${version}.cjs`, {
stdio: "inherit",
});

execSync(
`pnpm dlx pkg dist/ncc/docen-${version}.cjs --out-path dist/bundle -C GZip`,
`pnpm dlx pkg dist/ncc/docen-${version}.cjs --out-path dist/bundle -c pkg.config.json -C GZip`,
{
stdio: "inherit",
}
},
);
4 changes: 2 additions & 2 deletions packages/docen/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "docen",
"version": "0.0.0",
"version": "0.0.1",
"description": "Programmatically and command-line implementation of document formatting, powered by Demo Macro.",
"main": "dist/index.cjs",
"types": "dist/index.d.ts",
Expand Down Expand Up @@ -41,6 +41,6 @@
"dependencies": {
"@funish/cli": "0.0.4",
"file-type": "18.5.0",
"mammoth": "1.6.0"
"office-text-extractor": "^3.0.1"
}
}
5 changes: 5 additions & 0 deletions packages/docen/pkg.config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"pkg": {
"scripts": "dist/ncc/*"
}
}
2 changes: 1 addition & 1 deletion packages/docen/src/cli.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { CLI } from "@funish/cli";
import { docen } from ".";
import { CLI } from "@funish/cli";

const cli = new CLI("docen");

Expand Down
16 changes: 11 additions & 5 deletions packages/docen/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
import { fileTypeFromBuffer } from "file-type";
import { readFileSync, writeFileSync } from "fs";
import { extractRawText } from "mammoth";
import { getTextExtractor } from "office-text-extractor";
import { extname } from "path";
import { fileTypeFromBuffer } from "file-type";

export async function docen(source: string, target: string): Promise<void> {
const sourceContent = readFileSync(source);
const sourceType = await fileTypeFromBuffer(sourceContent);

if (sourceType?.ext === "docx" && extname(target) === ".txt") {
extractRawText({ buffer: sourceContent }).then((result) => {
writeFileSync(target, result.value);
// Convert only the source formats this branch is meant to handle. Each
// extension has to be tested explicitly: a bare string operand in an `||`
// chain (e.g. `ext === "docx" || "pptx"`) is always truthy, which would let
// every detected (or undetected) file type fall into the extraction path.
if (
  ["docx", "pptx", "xlsx", "pdf"].includes(sourceType?.ext ?? "") &&
  extname(target) === ".txt"
) {
  const extractor = getTextExtractor();
  // Hand over the bytes already read from `source` rather than a path.
  const text = await extractor.extractText({
    input: sourceContent,
    type: "buffer",
  });
  writeFileSync(target, text);
} else {
  console.error(`Unsupported source file type: ${sourceType?.ext}`);
}
Expand Down
Loading

0 comments on commit fcb8886

Please sign in to comment.