-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
66 lines (61 loc) · 2.27 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
'use strict';

const rp = require('request-promise');
// objects-to-csv is currently unused: scraped data is saved to the DB, not to a CSV file.
const tocsv = require('objects-to-csv');
const cheerio = require('cheerio');
// Module-wide request-promise configuration.
// NOTE(review): presumably this mirrors request-promise's `simple: false` option
// (do not reject on non-2xx responses) - confirm that the installed version
// honours the module-level `rp.options` hook.
rp.options.simple = false;
// Per-source scraping configuration: each entry supplies the name, base URL,
// page range and DOM selectors read by the scraper below.
const newsData = require("./newsHtmlStructure");
// Retrieve the list pages from the base URL and map each linked article to a plain object.
/**
 * Scrape every configured list page of one news source and collect its articles.
 *
 * Looks up `kindNews` in the `newsData` table, walks the page numbers from
 * `to` down to `from` in steps of `interval`, fetches each list page, then
 * fetches every article linked from it (elements matching `dom`) and extracts
 * the title text, the image `src` and the content text.
 *
 * A list page that cannot be fetched or processed is logged and skipped, so a
 * single failure does not discard the pages that were already collected.
 *
 * @param {string} kindNews - Value compared against the `name` of each `newsData` entry.
 * @returns {Promise<Array<Array<{titleArticle: string, innerImage: (string|undefined), newsContent: string, link: string, source: string}>>>}
 *   One inner array per successfully scraped list page, in the order the pages
 *   were visited; an empty array when no entry matches `kindNews`.
 */
const getScrapingNews = async (kindNews) => {
  let dataSelector = {};
  const pages = [];
  const allNews = [];

  for (const news of newsData) {
    if (news.name !== kindNews) {
      continue;
    }
    dataSelector = news;
    // Page numbers are walked downwards: to, to - interval, ... while >= from.
    for (let page = news.to; page >= news.from; page -= news.interval) {
      pages.push(page);
      console.log(page);
      // A zero, negative or missing interval can never reach `from`;
      // stop after the first page instead of looping forever.
      if (!(news.interval > 0)) {
        break;
      }
    }
  }

  for (const page of pages) {
    const url = dataSelector.baseurl + dataSelector.page_param + page;
    console.log(url);
    try {
      // The list-page request lives inside the try so that an unreachable
      // page is skipped like any other per-page failure.
      const html = await rp(url);
      const articleRequests = cheerio(dataSelector.dom, html)
        .map(async (_index, element) => {
          // Each matched element is expected to be a link to one article.
          const link = element.attribs.href;
          const innerHtml = await rp(link);
          const titleArticle = cheerio(dataSelector.domtitle, innerHtml).text();
          const innerImage = cheerio(dataSelector.domimg, innerHtml).attr('src');
          const newsContent = cheerio(dataSelector.domcontent, innerHtml).text();
          const source = dataSelector.name;
          // Combine link, title, image and content in one object.
          return {
            titleArticle,
            innerImage,
            newsContent,
            link,
            source,
          };
        })
        .get();
      // If any article of this page fails, the whole page is dropped (see catch).
      const newsByPage = await Promise.all(articleRequests);
      allNews.push(newsByPage);
    } catch (e) {
      console.log(e);
    }
  }

  return allNews;
};
// Export the scraper so other modules can call it.
module.exports = getScrapingNews;