Skip to content

Commit

Permalink
done viblo post
Browse files Browse the repository at this point in the history
  • Loading branch information
tranphuquy19 committed Sep 5, 2020
1 parent 00b4bb8 commit 889930c
Show file tree
Hide file tree
Showing 12 changed files with 389 additions and 19 deletions.
14 changes: 11 additions & 3 deletions .env
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
NODE_ENV=production

# giaphiep is not supported!
CRAWL_FROM="kipalog,daynhauhoc,viblopost"

KIPALOG_LAST_PAGE=76
DAYNHAUHOC_LAST_PAGE=1358
GIAPHIEP_LAST_PAGE=1375
VIBLOQUESTION_LAST_PAGE=64
VIBLOPOST_LAST_PAGE=1350
# GIAPHIEP_LAST_PAGE=1375

## For debug
# DAYNHAUHOC_LAST_PAGE=2
# KIPALOG_LAST_PAGE=2
# KIPALOG_LAST_PAGE=76
# DAYNHAUHOC_LAST_PAGE=1358
# VIBLOQUESTION_LAST_PAGE=64
# VIBLOPOST_LAST_PAGE=1350
# GIAPHIEP_LAST_PAGE=1375
24 changes: 17 additions & 7 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,16 @@ const chalk = require('chalk');

const { kipalogCrawler, kipalogPaginateUrl } = require('./crawlers/kipalogCrawler');
const { daynhauhocCrawler, daynhauhocPaginateUrl } = require('./crawlers/daynhauhocCrawler');
const { giaphiepCrawler, giaphiepPaginateUrl } = require('./crawlers/giaphiepCrawler');
const { viblopostCrawler, viblopostPaginateUrl } = require('./crawlers/viblopostCrawler');

const { kipalogLastPage, daynhauhocLastPage, giaphiepLastPage, vibloquestionLastPage } = require('./config/app.config');
const { kipalogLastPage, daynhauhocLastPage, giaphiepLastPage, vibloquestionLastPage, viblopostLastPage, crawlFrom } = require('./config/app.config');

const crawlers = { kipalogCrawler, daynhauhocCrawler };
const paginateUrls = { kipalogPaginateUrl, daynhauhocPaginateUrl };
const lastPages = { kipalogLastPage, daynhauhocLastPage, giaphiepLastPage, vibloquestionLastPage };
const { getCurrentTime, sleep, getRandomInt } = require('./shared');

const crawlers = { kipalogCrawler, daynhauhocCrawler, giaphiepCrawler, viblopostCrawler };
const paginateUrls = { kipalogPaginateUrl, daynhauhocPaginateUrl, giaphiepPaginateUrl, viblopostPaginateUrl };
const lastPages = { kipalogLastPage, daynhauhocLastPage, giaphiepLastPage, vibloquestionLastPage, viblopostLastPage };

const paginateCrawler = async (pageUrls) => {
const browser = await puppeteer.launch({ headless: true });
Expand All @@ -20,6 +24,10 @@ const paginateCrawler = async (pageUrls) => {
for (let i = 0; i < pageUrls.length; i++) {
const page = await browser.newPage();
const { url, type } = pageUrls[i];
await page.setDefaultNavigationTimeout(0);
let delay = getRandomInt(500, 10_000);
console.log(getCurrentTime() + chalk.yellow('Delay... ') + chalk.white.bgRed(`${delay / 1000}s\t`) + chalk.green(url));
await sleep(delay);
await page.goto(url);

let articlesJSON = await page.evaluate(() => {
Expand All @@ -30,26 +38,28 @@ const paginateCrawler = async (pageUrls) => {

if (type === 'daynhauhoc') {
articles = articles.topic_list.topics;
} else if (type === 'giaphiep' || type === 'viblopost') {
articles = articles.data;
}

articleCounter += articles.length;
const resPromise = articles.map(async (article) => {
return await crawlers[`${type}Crawler`](browser, article);
});

const res = await Promise.all(resPromise);
await Promise.all(resPromise);

page.close();
}

console.log(chalk.yellow('Crawled successfully: ') + chalk.white.bgRed(`${articleCounter} articles`));
console.log(getCurrentTime() + chalk.yellow('Crawled successfully: ') + chalk.white.bgRed(`${articleCounter} articles`));
browser.close();
}

const crawler = () => {
let pageUrls = [];

const types = ['kipalog', 'daynhauhoc'];
const types = crawlFrom;

types.map(type => {
const end = lastPages[`${type}LastPage`];
Expand Down
4 changes: 3 additions & 1 deletion config/app.config.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
module.exports = {
app :{
app: {

},
kipalogLastPage: process.env.KIPALOG_LAST_PAGE,
daynhauhocLastPage: process.env.DAYNHAUHOC_LAST_PAGE,
giaphiepLastPage: process.env.GIAPHIEP_LAST_PAGE,
vibloquestionLastPage: process.env.VIBLOQUESTION_LAST_PAGE,
viblopostLastPage: process.env.VIBLOPOST_LAST_PAGE,
crawlFrom: String(process.env.CRAWL_FROM).split(',')
}
3 changes: 2 additions & 1 deletion config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
},
"production": {
"storage": "development.db",
"dialect": "sqlite"
"dialect": "sqlite",
"logging": false
}
}
9 changes: 8 additions & 1 deletion crawlers/daynhauhocCrawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ const chalk = require('chalk');

const { articleModel } = require('../models');

const { getCurrentTime, sleep, getRandomInt } = require('../shared');

const daynhauhocHomePage = 'https://daynhauhoc.com';

const type = 'daynhauhoc';
Expand All @@ -20,12 +22,16 @@ const daynhauhocCrawler = async (browser, article) => {

let path = `/t/${slug}/${id}`;
const pageUrl = `${daynhauhocHomePage}${path}`;
console.log(chalk.yellow('Crawling... ') + chalk.green(pageUrl));

const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
let delay = getRandomInt(500, 10_000);
console.log(getCurrentTime() + chalk.yellow('Delay... ') + chalk.white.bgRed(`${delay / 1000}s\t`) + chalk.green(pageUrl));
await sleep(delay);
await page.goto(pageUrl);

console.log(getCurrentTime() + chalk.yellow('Crawling...\t') + chalk.green(pageUrl));

const { htmlContent, textContent } = await page.evaluate(() => {
const raw = document.querySelectorAll('#post_1 > div > div.topic-body.clearfix > div.regular.contents')[0];
const htmlContent = raw.outerHTML;
Expand All @@ -36,6 +42,7 @@ const daynhauhocCrawler = async (browser, article) => {
page.close();
const articleData = { title, path, tags, htmlContent, textContent, from: `${type}` };
await articleModel.create(articleData);
console.log(getCurrentTime() + chalk.yellow('Done:\t\t') + chalk.green(pageUrl));
return articleData;
}

Expand Down
99 changes: 99 additions & 0 deletions crawlers/giaphiepCrawler.js
Original file line number Diff line number Diff line change
@@ -1,2 +1,101 @@
const chalk = require('chalk');

const { articleModel } = require('../models');
const { sleep, getRandomInt, getCurrentTime } = require('../shared');

// Origin of the giaphiep site; article page URLs are this value plus `/blog/<link>`.
const giaphiepHomePage = 'https://giaphiep.com';

// Source identifier exported by this module.
// NOTE(review): stored articles are tagged from: `viblo` rather than this value — presumably
// because each giaphiep post links back to its viblo.asia original; confirm.
const type = 'giaphiep';

/**
 * Builds the URL of one page of the giaphiep posts API listing.
 *
 * @param {number} [page=0] Page number placed in the `page` query parameter.
 * @returns {string} The listing URL for that page.
 */
const giaphiepPaginateUrl = (page = 0) => {
  const endpoint = 'https://api.giaphiep.com/posts';
  return `${endpoint}?page=${page}`;
};

/**
 * Crawls one giaphiep.com blog post and stores it as an article record.
 *
 * The post page is opened in a new tab after a random 5-30 s delay. The article
 * body and the link to the original Viblo post are then read from the rendered
 * DOM. When the expected elements are not there yet, the extraction throws; the
 * page is then reloaded from the Node side after another random delay and the
 * extraction is retried, up to `maxAttempts` times in total.
 *
 * The retry has to live outside `page.evaluate`: reloading the document from
 * inside the evaluated function destroys the context that is running it, so an
 * in-page retry loop can never observe the reloaded page.
 *
 * @param {object} browser Puppeteer browser used to open the tab.
 * @param {object} article Listing entry providing `title`, `link` and `tags`
 *   (an array of objects with a `name`).
 * @param {number} [maxAttempts=5] Maximum number of extraction attempts before
 *   the last extraction error is rethrown.
 * @returns {Promise<object>} The record passed to `articleModel.create`.
 * @throws The last extraction error once `maxAttempts` is exhausted, or any
 *   navigation / persistence error.
 */
const giaphiepCrawler = async (browser, article, maxAttempts = 5) => {
  const { title, link, tags } = article;

  // Lower-case each tag name and turn every run of whitespace into a hyphen
  // (a string pattern in replace() would only touch the first space).
  const tagList = Array.isArray(tags) ? tags : [];
  const joinedTags = tagList.length !== 0
    ? tagList
      .map((tag) => String(tag.name).toLocaleLowerCase().trim().replace(/\s+/g, '-'))
      .join(';')
    : 'none';

  const pageUrl = `${giaphiepHomePage}/blog/${link}`;

  const page = await browser.newPage();
  try {
    await page.setDefaultNavigationTimeout(0);

    await sleep(getRandomInt(5000, 30_000));

    await page.goto(pageUrl);

    let extracted = null;
    for (let attempt = 1; extracted === null; attempt++) {
      console.log(getCurrentTime() + chalk.yellow(' Crawling... ') + chalk.green(pageUrl));
      try {
        // Single extraction attempt; property access on a missing element
        // throws, which rejects the evaluate() promise and triggers a retry.
        extracted = await page.evaluate(() => {
          const raw = document.querySelectorAll('#__layout > div > div.el-col.el-col-20 > div:nth-child(1) > div.el-col.el-col-24.el-col-md-18 > div:nth-child(2) > div > div > div.content')[0];
          const realPathRaw = document.querySelectorAll('#__layout > div > div.el-col.el-col-20 > div:nth-child(1) > div.el-col.el-col-24.el-col-md-18 > div:nth-child(2) > div > div > div:nth-child(6) > h3 > a')[0];
          const htmlContent = raw.outerHTML;
          const { textContent } = raw;
          const realPath = realPathRaw.textContent.replace('https://viblo.asia', '');
          return { htmlContent, textContent, realPath };
        });
      } catch (err) {
        if (attempt >= maxAttempts) {
          throw err;
        }
        const delay = getRandomInt(5000, 30_000);
        console.log(getCurrentTime() + chalk.yellow('Re-try crawling... ') + chalk.green(pageUrl) + chalk.white.bgRed(` after ${delay / 1000}s`));
        await sleep(delay);
        await page.reload({ waitUntil: ['networkidle0', 'domcontentloaded'] });
      }
    }

    const { htmlContent, textContent, realPath } = extracted;
    const articleData = { title, path: realPath, tags: joinedTags, htmlContent, textContent, from: `viblo` };
    await articleModel.create(articleData);
    return articleData;
  } finally {
    // Always release the tab, including when navigation or extraction failed.
    await page.close();
  }
};

// Public API of this module: the listing-URL builder, the per-article crawler,
// the site origin and the source identifier.
module.exports = {
giaphiepPaginateUrl,
giaphiepCrawler,
giaphiepHomePage,
type
}

// max=1375x20
// https://api.giaphiep.com/posts?page=1375
9 changes: 8 additions & 1 deletion crawlers/kipalogCrawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ const chalk = require('chalk');

const { articleModel } = require('../models');

const { getCurrentTime, sleep, getRandomInt } = require('../shared');

const kipalogHomePage = 'https://kipalog.com';

const type = 'kipalog';
Expand All @@ -19,12 +21,16 @@ const kipalogCrawler = async (browser, article) => {
tags = 'none';

const pageUrl = `${kipalogHomePage}${path}`;
console.log(chalk.yellow('Crawling... ') + chalk.green(pageUrl));

const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
let delay = getRandomInt(500, 10_000);
console.log(getCurrentTime() + chalk.yellow('Delay... ') + chalk.white.bgRed(`${delay / 1000}s\t`) + chalk.green(pageUrl));
await sleep(delay);
await page.goto(pageUrl);

console.log(getCurrentTime() + chalk.yellow('Crawling...\t') + chalk.green(pageUrl));

const { htmlContent, textContent } = await page.evaluate(() => {
const raw = document.getElementById('content');
const htmlContent = raw.outerHTML;
Expand All @@ -35,6 +41,7 @@ const kipalogCrawler = async (browser, article) => {
page.close();
const articleData = { title, path, tags, htmlContent, textContent, from: `${type}` };
await articleModel.create(articleData);
console.log(getCurrentTime() + chalk.yellow('Done:\t\t') + chalk.green(pageUrl));
return articleData;
}

Expand Down
48 changes: 48 additions & 0 deletions crawlers/viblopostCrawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
const chalk = require('chalk');
const showdown = require('showdown');

const converter = new showdown.Converter();

const { articleModel } = require('../models');

const { getCurrentTime } = require('../shared');

// Origin of the Viblo site: prefixes the listing API URL and is stripped from
// each post URL to obtain a site-relative path.
const viblopostHomePage = 'https://viblo.asia';

// Source identifier written to the `from` field of every stored article.
const type = 'viblopost';

/**
 * Builds the URL of one page of Viblo's "newest posts" API listing.
 *
 * @param {number} [page=0] Page number placed in the `page` query parameter.
 * @returns {string} The listing URL, with the page size fixed at `limit=20`.
 */
const viblopostPaginateUrl = (page = 0) => {
  const endpoint = `${viblopostHomePage}/api/posts/newest`;
  return `${endpoint}?page=${page}&limit=20`;
};

/**
 * Stores one Viblo post, taken straight from the listing API entry, as an
 * article record.
 *
 * No page is opened: the entry already carries the post body in `contents`,
 * which is stored as-is in `textContent` and run through the showdown
 * converter to produce `htmlContent`.
 *
 * @param {object} browser Not used by this crawler; accepted so it can be
 *   called with the same `(browser, article)` arguments as the others.
 * @param {object} article Listing entry providing `title`, `url`, `contents`
 *   and `tags.data` (an array of objects with a `name`).
 * @returns {Promise<object>} The record passed to `articleModel.create`.
 */
const viblopostCrawler = async (browser, article) => {
  const { title, url, tags, contents } = article;

  console.log(getCurrentTime() + chalk.yellow('Crawling...\t') + chalk.green(url));

  // Tolerate entries without a tag list instead of throwing on `tags.data`.
  const tagEntries = tags && Array.isArray(tags.data) ? tags.data : [];

  // Lower-case each tag name and turn every run of whitespace into a hyphen
  // (a string pattern in replace() would only touch the first space).
  const joinedTags = tagEntries.length !== 0
    ? tagEntries
      .map((tag) => String(tag.name).toLocaleLowerCase().trim().replace(/\s+/g, '-'))
      .join(';')
    : 'none';

  const path = url.replace(viblopostHomePage, '');
  const textContent = contents;
  const htmlContent = converter.makeHtml(contents);

  const articleData = { title, path, tags: joinedTags, htmlContent, textContent, from: `${type}` };
  await articleModel.create(articleData);
  console.log(getCurrentTime() + chalk.yellow('Done:\t\t') + chalk.green(url));
  return articleData;
};

// Public API of this module: the listing-URL builder, the per-article crawler,
// the site origin and the source identifier.
module.exports = {
viblopostPaginateUrl,
viblopostCrawler,
viblopostHomePage,
type
}

// max = 1350
// https://viblo.asia/api/posts/newest?page=4&limit=20
6 changes: 5 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,17 @@
"main": "index.js",
"license": "MIT",
"scripts": {
"start": "node app.js"
"start": "node app.js",
"db:migrate": "sequelize db:migrate",
"db:new": "rm -f development.db && sequelize db:migrate"
},
"dependencies": {
"chalk": "^4.1.0",
"dotenv": "^8.2.0",
"moment": "^2.27.0",
"puppeteer": "^5.2.1",
"sequelize": "^6.3.5",
"showdown": "^1.9.1",
"sqlite3": "^5.0.0"
}
}
8 changes: 7 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,13 @@ yarn install
## Migration

```bash
sequelize db:migrate
yarn db:migrate
```

Or remove the old database and re-run the migrations:

```bash
yarn db:new # removes the development.db file, then re-runs the migrations
```

## Start crawling data
Expand Down
21 changes: 21 additions & 0 deletions shared/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
const moment = require('moment');

/**
 * Builds the bracketed timestamp prefix placed at the start of log lines.
 *
 * The stamp is formatted by moment as `DD/MM/YYYY, h:m:s A`. Hours, minutes
 * and seconds are not zero-padded, so the stamp's length varies; a stamp
 * shorter than 24 characters is followed by two tabs, any other by one
 * (presumably to keep the text after it aligned).
 *
 * @returns {string} The timestamp in square brackets plus its tab padding.
 */
function getCurrentTime() {
  const stamp = `[${moment().format('DD/MM/YYYY, h:m:s A')}]`;
  const padding = stamp.length < 24 ? '\t\t' : '\t';
  return `${stamp}${padding}`;
}

/**
 * Returns a promise that resolves, with no value, after `ms` milliseconds.
 *
 * @param {number} [ms=1000] Delay before the promise resolves, in milliseconds.
 * @returns {Promise<void>} Promise settled by a `setTimeout` callback.
 */
function sleep(ms = 1000) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Picks a pseudo-random integer between the two bounds, both inclusive.
 *
 * Fractional bounds are tightened inwards first: `min` is rounded up and
 * `max` is rounded down.
 *
 * @param {number} min Lower bound.
 * @param {number} max Upper bound.
 * @returns {number} An integer in `[ceil(min), floor(max)]`, drawn with `Math.random`.
 */
function getRandomInt(min, max) {
  const lowest = Math.ceil(min);
  const highest = Math.floor(max);
  const span = highest - lowest + 1;
  return lowest + Math.floor(Math.random() * span);
}

// Helpers shared across the project: delay, random integer and log timestamp.
module.exports = {
sleep, getRandomInt, getCurrentTime
}
Loading

0 comments on commit 889930c

Please sign in to comment.