-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetArticlesFromURLs.js
50 lines (45 loc) · 1.74 KB
/
getArticlesFromURLs.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import { ArticleParser } from './articleParser.js';
import { randDelay } from './delay.js';
import { bigDelay } from './globals.js';
/**
 * Parses a list of article URLs batch by batch and collects the results.
 *
 * The input is copied into a local queue first, so the caller's array is never
 * modified. Batches are processed strictly one after another: each batch is
 * handed to `parseArticleBatch` and awaited before the next one starts.
 *
 * @param {Iterable<string>} [articleURLs=[]] - URLs of the articles to parse.
 * @returns {Promise<Array>} Resolves with everything the batches returned, in
 *   the order the batches were processed.
 *   On failure the promise rejects with a plain object:
 *   - `parsedArticles`: results of the batches that completed before the failure,
 *   - `unparsedURLs`: the URLs of the failing batch followed by all URLs that
 *     were never attempted (so the caller can retry them),
 *   - `error`: the original error that aborted the run.
 */
export default async function getArticlesFromURLs(articleURLs = []) {
  // Number of URLs handed to parseArticleBatch at once.
  const batchSize = 1;

  let parsedArticles = [];
  let pendingURLs = [];
  let urlBatch = [];

  try {
    // Work on a copy: splice() below would otherwise empty the caller's array.
    pendingURLs = [...articleURLs];
    const initialLen = pendingURLs.length;

    while (pendingURLs.length) {
      urlBatch = pendingURLs.splice(0, batchSize);
      const articleBatch = await parseArticleBatch(urlBatch);
      parsedArticles = parsedArticles.concat(articleBatch);
      // The batch is done; clear it so it can no longer be reported as unparsed.
      urlBatch = [];
      console.log('Number of articles parsed: ', parsedArticles.length + '/' + initialLen);
    }

    return parsedArticles;
  } catch (err) {
    console.error(err);
    // Hand back the partial progress together with the cause, so the caller
    // can both resume the work and see why the run stopped.
    const dataUpToError = {
      parsedArticles,
      unparsedURLs: [...urlBatch, ...pendingURLs],
      error: err,
    };
    return Promise.reject(dataUpToError);
  }
}
/**
 * Parses one batch of article URLs.
 *
 * A fresh `ArticleParser` is created for every URL and `generateArticle(url)`
 * is called on it; all calls are issued before any of them is awaited. The
 * batch is awaited together with `randDelay(...bigDelay)`, so the batch never
 * completes before that delay has resolved. This is done to (hopefully) avoid
 * triggering Google Scholar's bot detection.
 *
 * `Promise.all` is fail-fast: if any `generateArticle` call (or the delay)
 * rejects, the returned promise rejects with that same reason.
 *
 * @param {Iterable<string>} articleURLs - URLs belonging to this batch.
 * @returns {Promise<Array>} The values produced by `generateArticle`, in the
 *   same order as `articleURLs`.
 */
async function parseArticleBatch(articleURLs) {
  const pending = [...articleURLs].map((url) => new ArticleParser().generateArticle(url));

  // The delay goes last so its result can be popped off again below.
  pending.push(randDelay(...bigDelay));
  const results = await Promise.all(pending);

  // Presumably randDelay resolves with the waited time in ms (confirm in
  // delay.js); it is logged here in seconds.
  const waited = results.pop();
  console.log("Batch delay: ", Math.floor(waited) / 1000);

  return results;
}