-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6cffe81
commit 2d459ad
Showing
10 changed files
with
1,139 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,43 @@ | ||
const puppeteer = require('puppeteer'); | ||
const chalk = require('chalk'); | ||
|
||
const kipalogHomePage = 'https://kipalog.com'; | ||
const { kipalogCrawler, kipalogPaginateUrl } = require('./crawlers/kipalogCrawler'); | ||
|
||
const kipalogPaginateUrl = (page = 0) => { | ||
return `${kipalogHomePage}/posts/pagination?filter=top&page=${page}`; | ||
} | ||
|
||
const kipalogCrawler = async (browser, article) => { | ||
let { title, path, tags } = article; | ||
|
||
tags = tags.map(tag => String(tag.name).toLocaleLowerCase()); | ||
const pageUrl = `${kipalogHomePage}${path}`; | ||
const paginateCrawler = async (pageUrls) => { | ||
const browser = await puppeteer.launch({ headless: true }); | ||
|
||
const page = await browser.newPage(); | ||
await page.setDefaultNavigationTimeout(0); | ||
await page.goto(pageUrl); | ||
let articleCounter = 0; | ||
|
||
const content = await page.evaluate(() => { | ||
const raw = document.getElementById('content'); | ||
const htmlContent = raw.outerHTML; | ||
const { textContent } = raw; | ||
return { htmlContent, textContent }; | ||
}); | ||
for (let i = 0; i < pageUrls.length; i++) { | ||
const page = await browser.newPage(); | ||
await page.goto(pageUrls[i]); | ||
|
||
return { title, path, tags, content, home: 'kipalog' }; | ||
} | ||
let articlesJSON = await page.evaluate(() => { | ||
return document.body.textContent | ||
}); | ||
|
||
const paginateCrawler = async (pageUrl) => { | ||
const browser = await puppeteer.launch({ headless: true }); | ||
const page = await browser.newPage(); | ||
await page.goto(pageUrl); | ||
const articles = JSON.parse(articlesJSON); | ||
articleCounter += articles.length; | ||
const resPromise = articles.map(async (article) => { | ||
return await kipalogCrawler(browser, article); | ||
}); | ||
|
||
let articlesJSON = await page.evaluate(() => { | ||
return document.body.textContent | ||
}); | ||
const res = await Promise.all(resPromise); | ||
|
||
const articles = JSON.parse(articlesJSON); | ||
const resPromise = articles.map(async (article) => { | ||
return await kipalogCrawler(browser, article); | ||
}); | ||
page.close(); | ||
} | ||
|
||
const res = await Promise.all(resPromise); | ||
console.log(chalk.yellow('Crawled successfully: ') + chalk.white.bgRed(`${articleCounter} articles`)); | ||
browser.close(); | ||
} | ||
|
||
console.log('DATA', res); | ||
/**
 * Builds the pagination URLs for pages `start` through `end` (inclusive) and
 * hands the whole list to `paginateCrawler`.
 *
 * @param {number} [start=0] - First pagination index to crawl.
 * @param {number} [end=0] - Last pagination index to crawl (inclusive).
 * @returns {*} Whatever `paginateCrawler` returns (a promise, since it is an
 *   async function), so callers can await completion or handle a failure.
 */
const crawler = (start = 0, end = 0) => {
  const pageUrls = [];
  for (let pageIndex = start; pageIndex <= end; pageIndex++) {
    pageUrls.push(kipalogPaginateUrl(pageIndex));
  }

  // Return the pending crawl instead of dropping it: a floating promise makes
  // a failed crawl impossible for the caller to observe or handle.
  return paginateCrawler(pageUrls);
};
|
||
paginateCrawler(kipalogPaginateUrl(0)); | ||
crawler(0, 76); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"development": { | ||
"storage": "development.db", | ||
"dialect": "sqlite" | ||
}, | ||
"test": { | ||
"storage": "development.db", | ||
"dialect": "sqlite" | ||
}, | ||
"production": { | ||
"storage": "development.db", | ||
"dialect": "sqlite" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
const chalk = require('chalk'); | ||
|
||
const articleModel = require('../models').articleModel; | ||
|
||
// Origin of the Kipalog site; article paths and API routes are resolved against it.
const kipalogHomePage = 'https://kipalog.com';

/**
 * Builds the URL of one page of the "top" posts pagination endpoint.
 *
 * The query string is assembled with `URL`/`searchParams` rather than string
 * concatenation so that an unexpected `page` value is percent-encoded instead
 * of being spliced into the query verbatim.
 *
 * @param {number|string} [page=0] - Pagination index.
 * @returns {string} Absolute URL, e.g.
 *   `https://kipalog.com/posts/pagination?filter=top&page=0`.
 */
const kipalogPaginateUrl = (page = 0) => {
  const url = new URL('/posts/pagination', kipalogHomePage);
  url.searchParams.set('filter', 'top');
  url.searchParams.set('page', String(page));
  return url.toString();
};
|
||
/**
 * Crawls one Kipalog article page and stores it through `articleModel`.
 *
 * @param {object} browser - Puppeteer browser; one page is opened on it and is
 *   always closed again before this function settles.
 * @param {{title: string, path: string, tags: Array<{name: *}>}} article -
 *   Article entry taken from the pagination JSON.
 * @returns {Promise<object>} The record that was stored:
 *   `{ title, path, tags, htmlContent, textContent, from }`.
 * @throws {Error} When the page has no `#content` element. Navigation and
 *   database errors propagate unchanged.
 */
const kipalogCrawler = async (browser, article) => {
  const { title, path, tags: rawTags } = article;

  // Tags are flattened into a single lower-cased, `;`-separated string;
  // 'none' marks an article without tags (including a missing `tags` field).
  const tags = Array.isArray(rawTags) && rawTags.length !== 0
    ? rawTags.map((tag) => String(tag.name).toLocaleLowerCase().trim()).join(';')
    : 'none';

  const pageUrl = `${kipalogHomePage}${path}`;
  console.log(chalk.yellow('Crawling... ') + chalk.green(pageUrl));

  const page = await browser.newPage();
  let content;
  try {
    // 0 disables Puppeteer's navigation timeout for this page.
    await page.setDefaultNavigationTimeout(0);
    await page.goto(pageUrl);

    // Runs inside the browser page: only serialisable data can be returned.
    content = await page.evaluate(() => {
      const raw = document.getElementById('content');
      if (raw === null) {
        return null;
      }
      return { htmlContent: raw.outerHTML, textContent: raw.textContent };
    });
  } finally {
    // Close the tab on success and on failure; otherwise every failed
    // article leaks an open page for the lifetime of the browser.
    await page.close();
  }

  if (content === null) {
    throw new Error(`No #content element found at ${pageUrl}`);
  }

  const { htmlContent, textContent } = content;
  const articleData = { title, path, tags, htmlContent, textContent, from: 'kipalog' };
  await articleModel.create(articleData);
  return articleData;
};
|
||
module.exports = { | ||
kipalogPaginateUrl, | ||
kipalogCrawler, | ||
kipalogHomePage | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
'use strict'; | ||
module.exports = { | ||
up: async (queryInterface, Sequelize) => { | ||
await queryInterface.createTable('articleModels', { | ||
id: { | ||
allowNull: false, | ||
autoIncrement: true, | ||
primaryKey: true, | ||
type: Sequelize.INTEGER | ||
}, | ||
title: { | ||
type: Sequelize.STRING | ||
}, | ||
path: { | ||
type: Sequelize.STRING | ||
}, | ||
tags: { | ||
type: Sequelize.STRING | ||
}, | ||
htmlContent: { | ||
type: Sequelize.STRING | ||
}, | ||
textContent: { | ||
type: Sequelize.STRING | ||
}, | ||
from: { | ||
type: Sequelize.STRING | ||
}, | ||
createdAt: { | ||
allowNull: false, | ||
type: Sequelize.DATE | ||
}, | ||
updatedAt: { | ||
allowNull: false, | ||
type: Sequelize.DATE | ||
} | ||
}); | ||
}, | ||
down: async (queryInterface, Sequelize) => { | ||
await queryInterface.dropTable('articleModels'); | ||
} | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
'use strict'; | ||
const { | ||
Model | ||
} = require('sequelize'); | ||
module.exports = (sequelize, DataTypes) => { | ||
class articleModel extends Model { | ||
/** | ||
* Helper method for defining associations. | ||
* This method is not a part of Sequelize lifecycle. | ||
* The `models/index` file will call this method automatically. | ||
*/ | ||
static associate(models) { | ||
// define association here | ||
} | ||
}; | ||
articleModel.init({ | ||
title: DataTypes.STRING, | ||
path: DataTypes.STRING, | ||
tags: DataTypes.STRING, | ||
htmlContent: DataTypes.STRING, | ||
textContent: DataTypes.STRING, | ||
from: DataTypes.STRING | ||
}, { | ||
sequelize, | ||
modelName: 'articleModel', | ||
}); | ||
return articleModel; | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
'use strict'; | ||
|
||
const fs = require('fs'); | ||
const path = require('path'); | ||
const Sequelize = require('sequelize'); | ||
const basename = path.basename(__filename); | ||
const env = process.env.NODE_ENV || 'development'; | ||
const config = require(__dirname + '/../config/config.json')[env]; | ||
const db = {}; | ||
|
||
// Connect either through the connection string held in the environment
// variable named by `config.use_env_variable`, or through the discrete
// database / username / password settings of the selected config entry.
const sequelize = config.use_env_variable
  ? new Sequelize(process.env[config.use_env_variable], config)
  : new Sequelize(config.database, config.username, config.password, config);
|
||
// Every non-hidden `.js` file in this directory, except this index file
// itself, is treated as a model definition module.
const modelFiles = fs.readdirSync(__dirname).filter((file) => {
  const isHidden = file.startsWith('.');
  const isThisFile = file === basename;
  return !isHidden && !isThisFile && file.endsWith('.js');
});

// Each module exports a factory; call it and register the result under the model's name.
for (const file of modelFiles) {
  const defineModel = require(path.join(__dirname, file));
  const model = defineModel(sequelize, Sequelize.DataTypes);
  db[model.name] = model;
}
|
||
// Once every model is registered, let each one that defines `associate`
// wire up its relations against the full registry.
for (const modelName of Object.keys(db)) {
  const model = db[modelName];
  if (model.associate) {
    model.associate(db);
  }
}
|
||
db.sequelize = sequelize; | ||
db.Sequelize = Sequelize; | ||
|
||
module.exports = db; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Crawler Tools | ||
|
||
## Install yarn and sequelize-cli | ||
|
||
```bash | ||
[sudo] npm install yarn sequelize-cli -g | ||
``` | ||
|
||
## Clone project | ||
|
||
```bash | ||
git clone https://github.com/DoraMatching/crawler-tools.git | ||
``` | ||
|
||
## Install dependencies | ||
|
||
```bash | ||
cd crawler-tools | ||
yarn install | ||
``` | ||
|
||
## Migration | ||
|
||
```bash | ||
sequelize db:migrate | ||
``` | ||
|
||
## Start crawling data | ||
|
||
```bash | ||
yarn start | ||
``` |
Oops, something went wrong.