Skip to content

Commit

Permalink
Browse: markdown transform as default
Browse files: browse the repository at this point in the history
  • Loading branch information
enricoros committed May 13, 2024
1 parent 51b6e30 commit 7946cd6
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 7 deletions.
21 changes: 21 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"superjson": "^2.2.1",
"tesseract.js": "^5.1.0",
"tiktoken": "^1.0.14",
"turndown": "^7.1.3",
"uuid": "^9.0.1",
"zod": "^3.23.8",
"zustand": "^4.5.2"
Expand All @@ -77,6 +78,7 @@
"@types/react-dom": "^18.3.0",
"@types/react-katex": "^3.0.4",
"@types/react-timeago": "^4.1.7",
"@types/turndown": "^5.0.4",
"@types/uuid": "^9.0.8",
"eslint": "^8.57.0",
"eslint-config-next": "^14.2.3",
Expand Down
28 changes: 21 additions & 7 deletions src/modules/browse/browse.router.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import { z } from 'zod';
import { TRPCError } from '@trpc/server';

import { BrowserContext, connect, ScreenshotOptions, TimeoutError } from '@cloudflare/puppeteer';

import TurndownService from 'turndown';

import { createTRPCRouter, publicProcedure } from '~/server/api/trpc.server';
import { env } from '~/server/env.mjs';

Expand Down Expand Up @@ -125,8 +128,9 @@ async function workerPuppeteer(access: BrowseAccessSchema, targetUrl: string, tr
if (!isWebPage) {
// noinspection ExceptionCaughtLocallyJS
throw new Error(`Invalid content-type: ${contentType}`);
} else
} else {
result.stopReason = 'end';
}
} catch (error: any) {
const isTimeout: boolean = error instanceof TimeoutError;
result.stopReason = isTimeout ? 'timeout' : 'error';
Expand All @@ -137,12 +141,22 @@ async function workerPuppeteer(access: BrowseAccessSchema, targetUrl: string, tr
// transform the content of the page as text
try {
if (result.stopReason !== 'error') {
result.content = await page.evaluate(() => {
const content = document.body.innerText || document.textContent;
if (!content)
throw new Error('No content');
return content;
});
switch (transform) {
case 'html':
result.content = await page.evaluate(() => document.documentElement.innerHTML);
break;
case 'markdown':
const html = await page.evaluate(() => document.documentElement.innerHTML);
const turndownService = new TurndownService();
result.content = turndownService.turndown(html);
break;
case 'text':
default:
result.content = await page.evaluate(() => document.body.innerText || document.textContent || '');
break;
}
if (!result.content)
result.error = '[Puppeteer] Empty content';
}
} catch (error: any) {
result.error = '[Puppeteer] ' + error?.message || error?.toString() || 'Unknown evaluate error';
Expand Down

0 comments on commit 7946cd6

Please sign in to comment.