Skip to content

Commit

Permalink
done crawler daynhauhoc
Browse files Browse the repository at this point in the history
  • Loading branch information
tranphuquy19 committed Sep 5, 2020
1 parent b1cfa3b commit 00b4bb8
Show file tree
Hide file tree
Showing 11 changed files with 133 additions and 14 deletions.
8 changes: 8 additions & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
KIPALOG_LAST_PAGE=76
DAYNHAUHOC_LAST_PAGE=1358
GIAPHIEP_LAST_PAGE=1375
VIBLOQUESTION_LAST_PAGE=64

## For debug
# DAYNHAUHOC_LAST_PAGE=2
# KIPALOG_LAST_PAGE=2
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ web_modules/
.yarn-integrity

# dotenv environment variables file
.env
# .env
.env.test

# parcel-bundler cache (https://parceljs.org/)
Expand Down
17 changes: 17 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Launch Program",
"skipFiles": [
"<node_internals>/**"
],
"program": "${workspaceFolder}/app.js"
}
]
}
41 changes: 32 additions & 9 deletions app.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
require('dotenv').config();

const puppeteer = require('puppeteer');
const chalk = require('chalk');

const { kipalogCrawler, kipalogPaginateUrl } = require('./crawlers/kipalogCrawler');
const { daynhauhocCrawler, daynhauhocPaginateUrl } = require('./crawlers/daynhauhocCrawler');

const { kipalogLastPage, daynhauhocLastPage, giaphiepLastPage, vibloquestionLastPage } = require('./config/app.config');

const crawlers = { kipalogCrawler, daynhauhocCrawler };
const paginateUrls = { kipalogPaginateUrl, daynhauhocPaginateUrl };
const lastPages = { kipalogLastPage, daynhauhocLastPage, giaphiepLastPage, vibloquestionLastPage };

const paginateCrawler = async (pageUrls) => {
const browser = await puppeteer.launch({ headless: true });
Expand All @@ -10,16 +19,22 @@ const paginateCrawler = async (pageUrls) => {

for (let i = 0; i < pageUrls.length; i++) {
const page = await browser.newPage();
await page.goto(pageUrls[i]);
const { url, type } = pageUrls[i];
await page.goto(url);

let articlesJSON = await page.evaluate(() => {
return document.body.textContent
return document.body.textContent;
});

const articles = JSON.parse(articlesJSON);
let articles = JSON.parse(articlesJSON);

if (type === 'daynhauhoc') {
articles = articles.topic_list.topics;
}

articleCounter += articles.length;
const resPromise = articles.map(async (article) => {
return await kipalogCrawler(browser, article);
return await crawlers[`${type}Crawler`](browser, article);
});

const res = await Promise.all(resPromise);
Expand All @@ -31,13 +46,21 @@ const paginateCrawler = async (pageUrls) => {
browser.close();
}

const crawler = (start = 0, end = 0) => {
const crawler = () => {
let pageUrls = [];
for (let i = start; i <= end; i++) {
pageUrls.push(kipalogPaginateUrl(i));
}

const types = ['kipalog', 'daynhauhoc'];

types.map(type => {
const end = lastPages[`${type}LastPage`];
for (let i = 0; i <= end; i++) {
const url = paginateUrls[`${type}PaginateUrl`](i);
pageUrls.push({ url, type });
}
})

paginateCrawler(pageUrls);
}

crawler(0, 76);
crawler();

9 changes: 9 additions & 0 deletions config/app.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
module.exports = {
app :{

},
kipalogLastPage: process.env.KIPALOG_LAST_PAGE,
daynhauhocLastPage: process.env.DAYNHAUHOC_LAST_PAGE,
giaphiepLastPage: process.env.GIAPHIEP_LAST_PAGE,
vibloquestionLastPage: process.env.VIBLOQUESTION_LAST_PAGE,
}
49 changes: 49 additions & 0 deletions crawlers/daynhauhocCrawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
const chalk = require('chalk');

const { articleModel } = require('../models');

// Root URL of the daynhauhoc site; listing and topic URLs are built on top of it.
const daynhauhocHomePage = 'https://daynhauhoc.com';

// Source label used to tag articles produced by this crawler.
const type = 'daynhauhoc';

/**
 * Builds the URL of one page of the site's `latest.json` listing.
 *
 * @param {number} [page=0] - Page index to request.
 * @returns {string} Absolute URL of the listing page.
 */
const daynhauhocPaginateUrl = (page = 0) => {
  const query = `no_definitions=true&page=${page}`;
  return `${daynhauhocHomePage}/latest.json?${query}`;
};

/**
 * Crawls a single daynhauhoc topic and stores it as an article.
 *
 * Opens the topic page in a new tab, extracts the content of the first post,
 * persists the record through `articleModel.create`, and returns it.
 *
 * @param {object} browser - Browser handle exposing `newPage()`.
 * @param {object} article - Topic entry from the listing; `id`, `title`, `slug` and `tags` are read.
 * @returns {Promise<object>} The stored record `{ title, path, tags, htmlContent, textContent, from }`.
 * @throws {Error} If navigation fails, the first post cannot be located, or persisting fails.
 */
const daynhauhocCrawler = async (browser, article) => {
  const { id, title, slug, tags: rawTags } = article;

  // A topic entry without a `tags` array is treated the same as one with no tags.
  const tagList = Array.isArray(rawTags) ? rawTags : [];
  // Global regex: a string pattern would only replace the first space of a multi-word tag.
  const tags = tagList.length !== 0
    ? tagList.map((tag) => String(tag).toLocaleLowerCase().trim().replace(/ /g, '-')).join(';')
    : 'none';

  const path = `/t/${slug}/${id}`;
  const pageUrl = `${daynhauhocHomePage}${path}`;
  console.log(chalk.yellow('Crawling... ') + chalk.green(pageUrl));

  const page = await browser.newPage();
  let content;
  try {
    // 0 disables the navigation timeout, so slow topics do not abort the crawl.
    await page.setDefaultNavigationTimeout(0);
    await page.goto(pageUrl);

    content = await page.evaluate(() => {
      const raw = document.querySelector('#post_1 > div > div.topic-body.clearfix > div.regular.contents');
      if (!raw) {
        throw new Error('First post content not found on topic page');
      }
      return { htmlContent: raw.outerHTML, textContent: raw.textContent };
    });
  } finally {
    // Always release the tab, even when navigation or extraction fails.
    await page.close();
  }

  const { htmlContent, textContent } = content;
  const articleData = { title, path, tags, htmlContent, textContent, from: `${type}` };
  await articleModel.create(articleData);
  return articleData;
};

module.exports = {
daynhauhocPaginateUrl,
daynhauhocCrawler,
daynhauhocHomePage,
type
}
// max = 1358x20
// https://daynhauhoc.com/latest.json?no_definitions=true&page=200
2 changes: 2 additions & 0 deletions crawlers/giaphiepCrawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// max=1375x20
// https://api.giaphiep.com/posts?page=1375
11 changes: 7 additions & 4 deletions crawlers/kipalogCrawler.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
const chalk = require('chalk');

const articleModel = require('../models').articleModel;
const { articleModel } = require('../models');

const kipalogHomePage = 'https://kipalog.com';

const type = 'kipalog';

const kipalogPaginateUrl = (page = 0) => {
return `${kipalogHomePage}/posts/pagination?filter=top&page=${page}`;
}
Expand All @@ -12,7 +14,7 @@ const kipalogCrawler = async (browser, article) => {
let { title, path, tags } = article;

if (tags.length !== 0)
tags = tags.map(tag => String(tag.name).toLocaleLowerCase().trim()).join(';');
tags = tags.map(tag => String(tag.name).toLocaleLowerCase().trim().replace(' ', '-')).join(';');
else
tags = 'none';

Expand All @@ -31,13 +33,14 @@ const kipalogCrawler = async (browser, article) => {
});

page.close();
const articleData = { title, path, tags, htmlContent, textContent, from: 'kipalog' };
const articleData = { title, path, tags, htmlContent, textContent, from: `${type}` };
await articleModel.create(articleData);
return articleData;
}

module.exports = {
kipalogPaginateUrl,
kipalogCrawler,
kipalogHomePage
kipalogHomePage,
type
}
2 changes: 2 additions & 0 deletions crawlers/vibloquestionCrawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// max=64x20
// https://viblo.asia/api/questions?feed=newest&limit=20&page=64
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
},
"dependencies": {
"chalk": "^4.1.0",
"dotenv": "^8.2.0",
"puppeteer": "^5.2.1",
"sequelize": "^6.3.5",
"sqlite3": "^5.0.0"
Expand Down
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ [email protected]:
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.781568.tgz#4cdca90a952d2c77831096ff6cd32695d8715a04"
integrity sha512-9Uqnzy6m6zEStluH9iyJ3iHyaQziFnMnLeC8vK0eN6smiJmIx7+yB64d67C2lH/LZra+5cGscJAJsNXO+MdPMg==

dotenv@^8.2.0:
version "8.2.0"
resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-8.2.0.tgz#97e619259ada750eea3e4ea3e26bceea5424b16a"
integrity sha512-8sJ78ElpbDJBHNeBzUbUVLsqKdccaa/BXF1uPTw3GrvQTBgrQrtObr2mUrE38vzYd8cEv+m/JBfDLioYcfXoaw==

dottie@^2.0.0:
version "2.0.2"
resolved "https://registry.yarnpkg.com/dottie/-/dottie-2.0.2.tgz#cc91c0726ce3a054ebf11c55fbc92a7f266dd154"
Expand Down

0 comments on commit 00b4bb8

Please sign in to comment.