-
Notifications
You must be signed in to change notification settings - Fork 2
/
browse-text.mjs
95 lines (82 loc) · 2.78 KB
/
browse-text.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
'use strict';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import chromium from '@sparticuz/chromium';
// Upper bound, in bytes, on the JSON-serialized result that main() returns;
// when a result is larger, main() shortens its `content` field until it fits.
// NOTE(review): the value is presumably dictated by the hosting platform's
// response payload limit — confirm before changing it.
const maxResultLength = 6291556;
/**
 * Measure how many bytes a value occupies once it is serialized to JSON.
 *
 * @param {*} r - Value to serialize with JSON.stringify.
 * @returns {number} Byte length of the JSON text, as counted by
 *   Buffer.byteLength with its default encoding.
 */
function resultLength(r) {
  const serialized = JSON.stringify(r);
  return Buffer.byteLength(serialized);
}
// Register the stealth plugin on the imported puppeteer-extra instance at
// module load, i.e. before main() can launch a browser.
puppeteer.use(StealthPlugin());
/**
 * Open `url` in a browser and return the text the page lays out.
 *
 * The browser is launched through puppeteer with the executable, arguments,
 * viewport and headless setting supplied by `chromium`. Text is taken from a
 * CDP `DOMSnapshot.captureSnapshot` of the first document, not from the HTML
 * source. Timing events are appended to `log.events` (each stamped with the
 * seconds elapsed since this call began) and, on success, the whole log is
 * printed with console.log.
 *
 * @param {string} url - Address to navigate to.
 * @param {object} [log={}] - Caller-supplied log object; its `url` and
 *   `events` properties are overwritten by this call.
 * @returns {Promise<{data: {url: string, title: string, content: string},
 *   truncated: boolean, template: object[]}>} The page data plus a display
 *   template. `truncated` is true when `content` was shortened so that the
 *   serialized result fits within `maxResultLength` bytes.
 * @throws Propagates failures from launching the browser, opening the page,
 *   capturing the snapshot or reading the title. Navigation failures are NOT
 *   thrown; they are recorded as a `gotoError` event and capture continues.
 */
export async function main(url, log={}) {
  const timeBase = Date.now();
  log.url = url;
  log.events = [];
  function logEvent(event) {
    log.events.push({time: (Date.now() - timeBase) / 1000, ...event});
  }

  const browser = await puppeteer.launch({
    args: chromium.args.concat([
      '--disable-file-system',
    ]),
    defaultViewport: chromium.defaultViewport,
    executablePath: await chromium.executablePath(),
    headless: chromium.headless,
  });
  logEvent({event: 'launch'});

  // clearTimeout alone cannot stop a sampler that is suspended in an await:
  // it would re-arm itself afterwards. The flag closes that window.
  let progressTimeout;
  let progressStopped = false;
  function stopProgress() {
    progressStopped = true;
    clearTimeout(progressTimeout);
  }

  let title;
  let content;
  try {
    const page = await browser.newPage();
    const cdp = await page.target().createCDPSession();
    const getContent = async () => {
      const {documents, strings} = await cdp.send('DOMSnapshot.captureSnapshot', {computedStyles: []});
      // layout.text holds indexes into `strings`; -1 marks a node without text.
      return documents[0].layout.text
        .map((index) => (index === -1 ? '' : strings[index]))
        .filter((text) => text)
        .join(' ');
    };
    logEvent({event: 'newPage'});

    // Sample the amount of text every 500 ms while the page is loading.
    const sampleProgress = async () => {
      try {
        const text = await getContent();
        logEvent({event: 'progress', length: text.length});
      } catch {
        // Sampling is best-effort: a snapshot can fail mid-navigation or once
        // the browser is closing, and that must not abort the capture.
      }
      if (!progressStopped) {
        progressTimeout = setTimeout(sampleProgress, 500);
      }
    };
    progressTimeout = setTimeout(sampleProgress, 500);

    page.on('load', () => logEvent({event: 'load'}));
    page.on('domcontentloaded', () => logEvent({event: 'domcontentloaded'}));
    try {
      await page.goto(url, {timeout: 30000, waitUntil: ['load', 'domcontentloaded', 'networkidle0']});
    } catch (err) {
      // Navigation is best-effort (e.g. the 30 s timeout on a page that never
      // goes idle): keep whatever rendered, but leave a trace in the log.
      logEvent({event: 'gotoError', message: err instanceof Error ? err.message : String(err)});
    }
    logEvent({event: 'goto'});

    // Fixed grace period before the final snapshot is taken.
    await new Promise((resolve) => setTimeout(resolve, 3000));
    content = await getContent();
    stopProgress();
    logEvent({event: 'captureSnapshot', length: content.length});
    title = await page.title();
  } finally {
    // Always release the timer and the browser, even when a step above threw.
    stopProgress();
    await browser.close();
    logEvent({event: 'close'});
  }
  console.log(JSON.stringify(log, null, 2));

  const result = {
    data: {url, title, content},
    truncated: false,
    template: [
      {field: 'url', name: 'URL', type: 'inline'},
      {field: 'title', name: 'Title', type: 'inline'},
      {field: 'content', name: 'Content', type: 'block'},
    ],
  };
  if (resultLength(result) > maxResultLength) {
    result.truncated = true;
    result.data.content = '';
    // Binary search for the longest prefix of `content` whose serialized
    // result still fits. `left` is only ever advanced to a length that was
    // measured to fit (or stays 0), `right` to one that was measured too big.
    let left = 0;
    let right = content.length;
    while (left + 1 < right) {
      const mid = Math.floor((left + right) / 2);
      result.data.content = content.slice(0, mid);
      if (resultLength(result) > maxResultLength) {
        right = mid;
      } else {
        left = mid;
      }
    }
    // Do not cut a UTF-16 surrogate pair in half: if the prefix ends on a high
    // surrogate whose low surrogate was left behind, drop the high one too.
    // Removing a unit only shrinks the result, so it still fits.
    if (left > 0 && left < content.length) {
      const last = content.charCodeAt(left - 1);
      const next = content.charCodeAt(left);
      const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
      if (splitsPair) {
        left -= 1;
      }
    }
    result.data.content = content.slice(0, left);
  }
  return result;
}
/**
 * Event-driven entry point: reads the target URL from the JSON request body
 * and delegates to main().
 * NOTE(review): the (event, context) signature and `event.body` look like an
 * AWS Lambda HTTP integration — confirm against the deployment config.
 *
 * @param {object} event - Invocation event; `event.body` must be a JSON string
 *   whose parsed value carries a `url` property.
 * @param {object} context - Invocation context; stored on the log object only.
 * @returns {Promise<object>} Whatever main() resolves to.
 * @throws {SyntaxError} When `event.body` is not valid JSON.
 */
export async function handler(event, context) {
  const {url} = JSON.parse(event.body);
  const log = {event, context};
  return await main(url, log);
}