-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
00b4bb8
commit 889930c
Showing
12 changed files
with
389 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,16 @@ | ||
NODE_ENV=production | ||
|
||
CRAWL_FROM="kipalog,daynhauhoc,viblopost" # giaphiep is not supported! | ||
|
||
KIPALOG_LAST_PAGE=76 | ||
DAYNHAUHOC_LAST_PAGE=1358 | ||
GIAPHIEP_LAST_PAGE=1375 | ||
VIBLOQUESTION_LAST_PAGE=64 | ||
VIBLOPOST_LAST_PAGE=1350 | ||
# GIAPHIEP_LAST_PAGE=1375 | ||
|
||
## For debug | ||
# DAYNHAUHOC_LAST_PAGE=2 | ||
# KIPALOG_LAST_PAGE=2 | ||
# KIPALOG_LAST_PAGE=76 | ||
# DAYNHAUHOC_LAST_PAGE=1358 | ||
# VIBLOQUESTION_LAST_PAGE=64 | ||
# VIBLOPOST_LAST_PAGE=1350 | ||
# GIAPHIEP_LAST_PAGE=1375 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
module.exports = { | ||
app :{ | ||
app: { | ||
|
||
}, | ||
kipalogLastPage: process.env.KIPALOG_LAST_PAGE, | ||
daynhauhocLastPage: process.env.DAYNHAUHOC_LAST_PAGE, | ||
giaphiepLastPage: process.env.GIAPHIEP_LAST_PAGE, | ||
vibloquestionLastPage: process.env.VIBLOQUESTION_LAST_PAGE, | ||
viblopostLastPage: process.env.VIBLOPOST_LAST_PAGE, | ||
crawlFrom: String(process.env.CRAWL_FROM).split(',') | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
}, | ||
"production": { | ||
"storage": "development.db", | ||
"dialect": "sqlite" | ||
"dialect": "sqlite", | ||
"logging": false | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,101 @@ | ||
const chalk = require('chalk'); | ||
|
||
const { articleModel } = require('../models'); | ||
const { sleep, getRandomInt, getCurrentTime } = require('../shared'); | ||
|
||
const giaphiepHomePage = 'https://giaphiep.com'; | ||
|
||
const type = 'giaphiep'; | ||
|
||
/**
 * Build the giaphiep API URL for one page of the post listing.
 *
 * @param {number} [page=0] - Page number placed in the `page` query parameter.
 * @returns {string} The listing URL for that page.
 */
const giaphiepPaginateUrl = (page = 0) => `https://api.giaphiep.com/posts?page=${page}`;
|
||
/**
 * Crawl a single giaphiep article page and store it through `articleModel`.
 *
 * Opens a new page on `browser`, waits a random 5-30 s, loads
 * `${giaphiepHomePage}/blog/${link}` and extracts the article body together
 * with the link to the original post. When extraction fails (for example the
 * content has not rendered yet) the page is reloaded from the Node side after
 * a random delay and extraction is attempted again, without an upper bound on
 * the number of attempts.
 *
 * @param {object} browser - Object exposing `newPage()`; presumably a
 *   Puppeteer browser (confirm against the caller).
 * @param {{title: string, link: string, tags: Array<{name: string}>}} article
 *   Listing entry describing the article to crawl.
 * @returns {Promise<object>} The record passed to `articleModel.create`.
 */
const giaphiepCrawler = async (browser, article) => {
    const { title, link } = article;

    // Normalise tag names: lower-case, trimmed, every whitespace run turned
    // into '-'. (A plain string pattern would only replace the first space.)
    const tagList = Array.isArray(article.tags) ? article.tags : [];
    const tags = tagList.length !== 0
        ? tagList
            .map((tag) => String(tag.name).toLocaleLowerCase().trim().replace(/\s+/g, '-'))
            .join(';')
        : 'none';

    const pageUrl = `${giaphiepHomePage}/blog/${link}`;

    const page = await browser.newPage();
    try {
        // 0 disables the navigation timeout.
        await page.setDefaultNavigationTimeout(0);

        // Random pause before each request.
        await sleep(getRandomInt(5000, 30_000));

        await page.goto(pageUrl);

        let extracted = null;
        while (extracted === null) {
            console.log(getCurrentTime() + chalk.yellow(' Crawling... ') + chalk.green(pageUrl));
            try {
                extracted = await page.evaluate(() => {
                    const raw = document.querySelectorAll('#__layout > div > div.el-col.el-col-20 > div:nth-child(1) > div.el-col.el-col-24.el-col-md-18 > div:nth-child(2) > div > div > div.content')[0];
                    const realPathRaw = document.querySelectorAll('#__layout > div > div.el-col.el-col-20 > div:nth-child(1) > div.el-col.el-col-24.el-col-md-18 > div:nth-child(2) > div > div > div:nth-child(6) > h3 > a')[0];
                    if (!raw || !realPathRaw) {
                        throw new Error('article content not found on page');
                    }
                    return {
                        htmlContent: raw.outerHTML,
                        textContent: raw.textContent,
                        realPath: realPathRaw.textContent.replace('https://viblo.asia', ''),
                    };
                });
            } catch (e) {
                // Reload from the Node side: reloading from inside
                // page.evaluate() tears down the execution context the
                // in-page code is running in, so an in-page retry loop
                // can never reach its next iteration.
                const delay = getRandomInt(5000, 30_000);
                console.log(getCurrentTime() + chalk.yellow(' Re-try crawling... ') + chalk.green(pageUrl) + ` after ${delay / 1000}s (${e.message})`);
                await sleep(delay);
                await page.reload({ waitUntil: ['networkidle0', 'domcontentloaded'] });
            }
        }

        const { htmlContent, textContent, realPath } = extracted;
        // NOTE(review): `from` is 'viblo' rather than 'giaphiep'; presumably
        // because realPath is taken from the viblo.asia link - confirm.
        const articleData = { title, path: realPath, tags, htmlContent, textContent, from: `viblo` };
        await articleModel.create(articleData);
        return articleData;
    } finally {
        // Always release the page, including when crawling or saving fails.
        await page.close();
    }
};
|
||
module.exports = { | ||
giaphiepPaginateUrl, | ||
giaphiepCrawler, | ||
giaphiepHomePage, | ||
type | ||
} | ||
|
||
// max=1375x20 | ||
// https://api.giaphiep.com/posts?page=1375 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
const chalk = require('chalk'); | ||
const showdown = require('showdown'); | ||
|
||
const converter = new showdown.Converter(); | ||
|
||
const { articleModel } = require('../models'); | ||
|
||
const { getCurrentTime } = require('../shared'); | ||
|
||
const viblopostHomePage = 'https://viblo.asia'; | ||
|
||
const type = 'viblopost'; | ||
|
||
/**
 * Build the Viblo API URL for one page of the newest-posts listing
 * (20 posts per page).
 *
 * @param {number} [page=0] - Page number placed in the `page` query parameter.
 * @returns {string} The listing URL for that page.
 */
const viblopostPaginateUrl = (page = 0) => `${viblopostHomePage}/api/posts/newest?page=${page}&limit=20`;
|
||
/**
 * Store one Viblo post taken from the listing API response.
 *
 * No page is opened: the listing entry already carries the post contents,
 * which are kept as-is for `textContent` and passed through
 * `converter.makeHtml` for `htmlContent`.
 *
 * @param {object} browser - Unused; kept so the signature matches the other crawlers.
 * @param {{title: string, url: string, contents: string, tags: {data: Array<{name: string}>}}} article
 *   Listing entry describing the post.
 * @returns {Promise<object>} The record passed to `articleModel.create`.
 */
const viblopostCrawler = async (browser, article) => {
    const { title, url, contents } = article;

    console.log(getCurrentTime() + chalk.yellow('Crawling...\t') + chalk.green(url));

    // Tolerate entries without a tag list instead of throwing on `tags.data`.
    const tagList = article.tags && Array.isArray(article.tags.data) ? article.tags.data : [];

    // Normalise tag names: lower-case, trimmed, every whitespace run turned
    // into '-'. (A plain string pattern would only replace the first space.)
    const tags = tagList.length !== 0
        ? tagList
            .map((tag) => String(tag.name).toLocaleLowerCase().trim().replace(/\s+/g, '-'))
            .join(';')
        : 'none';

    const path = url.replace(viblopostHomePage, '');
    const textContent = contents;
    const htmlContent = converter.makeHtml(contents);

    const articleData = { title, path, tags, htmlContent, textContent, from: `${type}` };
    await articleModel.create(articleData);
    console.log(getCurrentTime() + chalk.yellow('Done:\t\t') + chalk.green(url));
    return articleData;
};
|
||
module.exports = { | ||
viblopostPaginateUrl, | ||
viblopostCrawler, | ||
viblopostHomePage, | ||
type | ||
} | ||
|
||
// max = 1350 | ||
// https://viblo.asia/api/posts/newest?page=4&limit=20 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
const moment = require('moment'); | ||
|
||
/**
 * Format the current time as a bracketed log prefix followed by tab padding.
 *
 * The 'h:m:s' pattern is not zero-padded, so the stamp varies in length;
 * stamps shorter than 24 characters get two tabs and longer ones get one.
 *
 * @returns {string} For example `[25/12/2020, 9:5:3 PM]` followed by two tabs.
 */
function getCurrentTime() {
    const stamp = `[${moment().format('DD/MM/YYYY, h:m:s A')}]`;
    const padding = stamp.length < 24 ? '\t\t' : '\t';
    return stamp + padding;
}
|
||
/**
 * Return a promise that resolves after a delay.
 *
 * @param {number} [ms=1000] - Delay in milliseconds, passed to `setTimeout`.
 * @returns {Promise<void>} Resolves with no value once the timer fires.
 */
function sleep(ms = 1000) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
||
/**
 * Pick a random integer between two bounds, both inclusive.
 *
 * The bounds may be given in either order; the smaller one is rounded up and
 * the larger one rounded down before drawing. Uses `Math.random`, so the
 * result is not suitable for security-sensitive purposes.
 *
 * @param {number} min - One bound of the range.
 * @param {number} max - The other bound of the range.
 * @returns {number} An integer within the rounded bounds.
 */
function getRandomInt(min, max) {
    // Normalise the order first: with reversed bounds the original formula
    // produced a skewed range that never reached the lower bound.
    const lower = Math.ceil(Math.min(min, max));
    const upper = Math.floor(Math.max(min, max));
    return Math.floor(Math.random() * (upper - lower + 1)) + lower;
}
|
||
module.exports = { | ||
sleep, getRandomInt, getCurrentTime | ||
} |
Oops, something went wrong.