main.js
/**
* This is an Apify Actor (using the Apify v3 SDK, based on crawlee) for crawling
* web content. It extracts the body, title, and other metadata from pages it crawls.
* In addition, it handles PDF files by downloading the raw data and saving it as
* base64-encoded data to the dataset.
*/
import { Actor } from "apify";
import { PlaywrightCrawler } from "crawlee";
import got from "got";
await Actor.init();
// The minimum number of concurrent browser instances to use in a single crawl.
const MIN_CONCURRENCY = 4;
// The types of files for which we support simple download.
const DOWNLOAD_FILE_EXTENSIONS = [
"pdf",
"doc",
"docx",
"epub",
"ppt",
"pptx",
];
const DOWNLOAD_FILE_REGEX = new RegExp(
  `\\.(${DOWNLOAD_FILE_EXTENSIONS.join("|")})$`
);
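// For example, a hypothetical URL like "https://example.com/files/report.pdf"
// matches this pattern, while "https://example.com/docs/intro.html" does not.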
let {
startUrls,
datasetName,
maxCrawlDepth,
maxCrawlPages,
includeGlobPatterns,
excludeGlobPatterns,
} = await Actor.getInput();
// Our APIs interpret "maxDepth = 1" as "only crawl the direct source", but this
// crawler tracks depth starting at 0. This corrects the off-by-one difference.
maxCrawlDepth = Math.max(maxCrawlDepth - 1, 0);
console.log(`Requested maxCrawlDepth is ${maxCrawlDepth}`);
console.log(`Requested maxCrawlPages is ${maxCrawlPages}`);
console.log(`Requested datasetName is ${datasetName}`);
console.log(
`Requested includeGlobPatterns is ${JSON.stringify(includeGlobPatterns)}`
);
console.log(
`Requested excludeGlobPatterns is ${JSON.stringify(excludeGlobPatterns)}`
);
// Note: Deep inside Crawlee, the `minimatch` library is used for glob matching,
// with `{ nocase: true }` as the default options.
// https://github.com/isaacs/minimatch
if (!includeGlobPatterns || includeGlobPatterns.length == 0) {
// Apify requires that glob patterns be non-empty, so the only way to express
// an empty include-glob set is to set excludePatterns to "**".
console.warn(
'Empty includeGlobPatterns - setting excludeGlobPatterns to "**"'
);
excludeGlobPatterns = [{ glob: "**" }];
// We need to set includeGlobPatterns to a nonempty value for enqueueLinks() to consider
// excludeGlobPatterns as well.
includeGlobPatterns = [{ glob: "unused" }];
}
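// For reference, glob patterns are passed as objects of the form { glob: "..." };
// a hypothetical input of [{ glob: "https://example.com/docs/**" }] would restrict
// the crawl to pages under /docs on example.com.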
const dataset = await Actor.openDataset(datasetName);
/** Download the raw file from the given URL and save to the dataset. */
async function downloadFile(crawler, url) {
console.log(`Downloading file: ${url}`);
try {
const response = await got(url);
if (!response || !response.ok) {
      throw new Error(`Error fetching ${url}: HTTP ${response.statusCode}`);
}
const rawData = response.rawBody;
const b64Data = rawData.toString("base64");
console.log(`Successfully downloaded ${url}: ${rawData.length} bytes`);
await dataset.pushData({
public_url: url,
content: b64Data,
mime_type: response.headers["content-type"],
content_length: response.headers["content-length"],
encoding: "base64",
timestamp: new Date().toISOString(),
});
} catch (error) {
console.error(
`There was a problem with the fetch operation for ${url}: ${error}`
);
}
}
/** Return the value of the given meta tag. */
async function getMetaTag(page, name) {
try {
return await page.$eval(
`meta[name="${name}"]`,
(element) => element.content
);
} catch (e) {}
return undefined;
}
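// Note: Open Graph tags are typically written as <meta property="og:..."> rather
// than <meta name="og:...">, so this selector may miss them on some pages.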
/** Get metadata description for this page. */
async function getDescription(page) {
  return (
    (await getMetaTag(page, "description")) ||
    (await getMetaTag(page, "og:description")) ||
    (await getMetaTag(page, "twitter:description"))
  );
}
/** Get language for this page. */
async function getLanguage(page, response) {
  const header = response.headers["content-language"];
  const htmlTagLang = await page.$eval("html", (element) => element.lang);
  return (
    header ||
    htmlTagLang ||
    (await getMetaTag(page, "og:locale")) ||
    (await getMetaTag(page, "docusaurus_locale")) ||
    (await getMetaTag(page, "docsearch:language"))
  );
}
/** Get publication date of this page. */
async function getPublished(page) {
  return (
    (await getMetaTag(page, "article:published_time")) ||
    (await getMetaTag(page, "book:release_date"))
  );
}
/** Get the MIME type of this response. */
async function getMimeType(response) {
  return response.headers["content-type"];
}
// Configure Apify proxy.
const proxyConfiguration = await Actor.createProxyConfiguration({
groups: ['AUTO'],
countryCode: 'US',
});
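// 'AUTO' asks Apify Proxy to pick from its available datacenter proxy groups
// automatically, and countryCode restricts exit IPs to the US.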
/** This is the main crawler. */
const crawler = new PlaywrightCrawler({
proxyConfiguration,
minConcurrency: MIN_CONCURRENCY,
// Maximum number of pages to crawl.
maxRequestsPerCrawl: maxCrawlPages,
// This is useful for debugging.
// preNavigationHooks: [
// (crawlingContext, gotoOptions) => {
// console.log(`Navigating to ${crawlingContext.request.url}`);
// console.log(`proxyInfo is ${JSON.stringify(crawlingContext.proxyInfo)}`);
// },
// ],
// This handler is called on each page navigation.
async requestHandler({ request, response, page, enqueueLinks, proxyInfo }) {
const state = await crawler.useState({ downloadedUrls: [] });
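    // useState() returns a state object that is shared across request handlers in
    // this crawler run and persisted with it, so it serves as a simple dedupe cache.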
if (
state.downloadedUrls &&
state.downloadedUrls.indexOf(request.loadedUrl) != -1
) {
console.warn(`Skipping already downloaded page: ${request.loadedUrl}`);
return;
}
const title = await page.title();
if (request.url != request.loadedUrl) {
console.log(
`Crawled ${request.loadedUrl} (redirected from ${request.url})`
);
} else {
console.log(`Crawled ${request.loadedUrl}`);
}
await dataset.pushData({
// This *must* be the request.url (as opposed to request.loadedUrl) because we
// need a *unique* key to ensure we load all records from the result data set.
// While the code above attempts to deduplicate based on loadedUrls, it can't
// account for concurrent instances. The request.url on the other hand is
// guaranteed to be unique because Apify uses it as the request deduplication
// key itself by default: https://crawlee.dev/api/core/class/Request
public_url: request.url,
      title,
description: await getDescription(page),
language: await getLanguage(page, response),
published: await getPublished(page),
mime_type: await getMimeType(response),
content_length: response.headers["content-length"],
content: await page.content(),
timestamp: new Date().toISOString(),
});
state.downloadedUrls.push(request.loadedUrl);
// Only follow links if we have not reached the max crawl depth.
const curDepth = request.userData?.depth || 0;
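    // Crawl depth is tracked manually via request.userData: start URLs carry no
    // depth value (treated as 0), and each enqueued link carries depth + 1.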
if (curDepth < maxCrawlDepth) {
await enqueueLinks({
strategy: "all",
globs: includeGlobPatterns,
exclude: excludeGlobPatterns,
userData: { depth: curDepth + 1 },
});
} else {
      console.warn(
        `Reached max crawl depth ${curDepth} - not following links`
      );
}
},
  // This handler is called when the headless browser encounters an error while
  // navigating to a URL.
async errorHandler({ crawler, request }) {
// If there is an error fetching a URL, it might be because the underlying
// headless browser does not support file downloads. For now, we try to download
// any file that might be a PDF and add it to the dataset.
// Avoid downloading files multiple times.
const state = await crawler.useState({ downloadedUrls: [] });
if (
state.downloadedUrls &&
state.downloadedUrls.indexOf(request.url) != -1
) {
console.warn(`Skipping already downloaded file: ${request.url}`);
return;
}
if (request.url.match(DOWNLOAD_FILE_REGEX)) {
await downloadFile(crawler, request.url);
state.downloadedUrls.push(request.url);
request.noRetry = true; // Don't retry this request.
} else {
      console.log(
        `Not downloading ${request.url} because its extension is not whitelisted by the Fixie actor.`
      );
}
},
});
console.log(`Starting crawl with startUrls: ${JSON.stringify(startUrls)}`);
await crawler.run(startUrls);
await Actor.exit();