Skip to content

Commit

Permalink
release crawler kipalog
Browse files Browse the repository at this point in the history
  • Loading branch information
tranphuquy19 committed Sep 4, 2020
1 parent 6cffe81 commit 2d459ad
Show file tree
Hide file tree
Showing 10 changed files with 1,139 additions and 40 deletions.
65 changes: 30 additions & 35 deletions app.js
Original file line number Diff line number Diff line change
@@ -1,48 +1,43 @@
const puppeteer = require('puppeteer');
const chalk = require('chalk');

const kipalogHomePage = 'https://kipalog.com';
const { kipalogCrawler, kipalogPaginateUrl } = require('./crawlers/kipalogCrawler');

/**
 * Crawl a list of Kipalog pagination URLs.
 *
 * Each pagination URL returns a JSON array of article descriptors as the
 * page body; every descriptor is handed to `kipalogCrawler` (imported from
 * ./crawlers/kipalogCrawler) which fetches and persists the full article.
 *
 * NOTE(review): this span of the scraped diff interleaved the removed
 * pre-commit functions with the added post-commit code; this is the
 * reconstructed post-commit version.
 *
 * @param {string[]} pageUrls - Absolute pagination URLs to visit in order.
 * @returns {Promise<void>} Resolves when every page has been crawled.
 */
const paginateCrawler = async (pageUrls) => {
    const browser = await puppeteer.launch({ headless: true });

    let articleCounter = 0;

    // Listing pages are visited sequentially; articles within one listing
    // page are crawled in parallel below.
    for (let i = 0; i < pageUrls.length; i++) {
        const page = await browser.newPage();
        await page.goto(pageUrls[i]);

        // The pagination endpoint responds with raw JSON, so the whole
        // document body is the serialized article list.
        const articlesJSON = await page.evaluate(() => {
            return document.body.textContent;
        });

        const articles = JSON.parse(articlesJSON);
        articleCounter += articles.length;

        // Crawl every article of this listing page concurrently, sharing
        // the one browser instance.
        await Promise.all(articles.map((article) => kipalogCrawler(browser, article)));

        // page.close() returns a Promise — await it so the tab is really
        // gone before the next iteration opens another one.
        await page.close();
    }

    console.log(chalk.yellow('Crawled successfully: ') + chalk.white.bgRed(`${articleCounter} articles`));
    await browser.close();
}

console.log('DATA', res);
/**
 * Build the pagination URLs for page indexes [start, end] (inclusive) and
 * kick off the crawl.
 *
 * @param {number} [start=0] - First page index.
 * @param {number} [end=0] - Last page index (inclusive).
 */
const crawler = (start = 0, end = 0) => {
    const pageUrls = [];
    for (let i = start; i <= end; i++) {
        pageUrls.push(kipalogPaginateUrl(i));
    }
    // paginateCrawler is async; attach a rejection handler so a crawl
    // failure is reported instead of becoming an unhandled rejection.
    paginateCrawler(pageUrls).catch((err) => {
        console.error('Crawl failed:', err);
    });
}

crawler(0, 76);

14 changes: 14 additions & 0 deletions config/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"development": {
"storage": "development.db",
"dialect": "sqlite"
},
"test": {
"storage": "development.db",
"dialect": "sqlite"
},
"production": {
"storage": "development.db",
"dialect": "sqlite"
}
}
43 changes: 43 additions & 0 deletions crawlers/kipalogCrawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
const chalk = require('chalk');

const articleModel = require('../models').articleModel;

// Base URL of the site being crawled; also exported for callers.
const kipalogHomePage = 'https://kipalog.com';

/**
 * Build the absolute URL of the paginated "top posts" JSON listing.
 *
 * @param {number} [page=0] - Zero-based page index.
 * @returns {string} Pagination endpoint URL for that page.
 */
const kipalogPaginateUrl = (page = 0) =>
    `${kipalogHomePage}/posts/pagination?filter=top&page=${page}`;

const kipalogCrawler = async (browser, article) => {
let { title, path, tags } = article;

if (tags.length !== 0)
tags = tags.map(tag => String(tag.name).toLocaleLowerCase().trim()).join(';');
else
tags = 'none';

const pageUrl = `${kipalogHomePage}${path}`;
console.log(chalk.yellow('Crawling... ') + chalk.green(pageUrl));

const page = await browser.newPage();
await page.setDefaultNavigationTimeout(0);
await page.goto(pageUrl);

const { htmlContent, textContent } = await page.evaluate(() => {
const raw = document.getElementById('content');
const htmlContent = raw.outerHTML;
const { textContent } = raw;
return { htmlContent, textContent };
});

page.close();
const articleData = { title, path, tags, htmlContent, textContent, from: 'kipalog' };
await articleModel.create(articleData);
return articleData;
}

module.exports = {
kipalogPaginateUrl,
kipalogCrawler,
kipalogHomePage
}
Binary file added development.db
Binary file not shown.
42 changes: 42 additions & 0 deletions migrations/20200904194014-create-article-model.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
'use strict';
module.exports = {
up: async (queryInterface, Sequelize) => {
await queryInterface.createTable('articleModels', {
id: {
allowNull: false,
autoIncrement: true,
primaryKey: true,
type: Sequelize.INTEGER
},
title: {
type: Sequelize.STRING
},
path: {
type: Sequelize.STRING
},
tags: {
type: Sequelize.STRING
},
htmlContent: {
type: Sequelize.STRING
},
textContent: {
type: Sequelize.STRING
},
from: {
type: Sequelize.STRING
},
createdAt: {
allowNull: false,
type: Sequelize.DATE
},
updatedAt: {
allowNull: false,
type: Sequelize.DATE
}
});
},
down: async (queryInterface, Sequelize) => {
await queryInterface.dropTable('articleModels');
}
};
28 changes: 28 additions & 0 deletions models/articlemodel.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
'use strict';
const {
Model
} = require('sequelize');
module.exports = (sequelize, DataTypes) => {
class articleModel extends Model {
/**
* Helper method for defining associations.
* This method is not a part of Sequelize lifecycle.
* The `models/index` file will call this method automatically.
*/
static associate(models) {
// define association here
}
};
articleModel.init({
title: DataTypes.STRING,
path: DataTypes.STRING,
tags: DataTypes.STRING,
htmlContent: DataTypes.STRING,
textContent: DataTypes.STRING,
from: DataTypes.STRING
}, {
sequelize,
modelName: 'articleModel',
});
return articleModel;
};
37 changes: 37 additions & 0 deletions models/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
'use strict';

const fs = require('fs');
const path = require('path');
const Sequelize = require('sequelize');

// This file's own name, used to skip it when loading model definitions.
const basename = path.basename(__filename);
const env = process.env.NODE_ENV || 'development';
const config = require(__dirname + '/../config/config.json')[env];
const db = {};

// Connect either via an environment-variable URL or explicit credentials,
// depending on what the config for this environment declares.
let sequelize;
if (config.use_env_variable) {
  sequelize = new Sequelize(process.env[config.use_env_variable], config);
} else {
  sequelize = new Sequelize(config.database, config.username, config.password, config);
}

// Every other .js file in this directory is a model definition module.
const modelFiles = fs
  .readdirSync(__dirname)
  .filter((file) => {
    const isHidden = file.indexOf('.') === 0;
    const isSelf = file === basename;
    const isJsFile = file.slice(-3) === '.js';
    return !isHidden && !isSelf && isJsFile;
  });

// Register each model on the db object under its model name.
for (const file of modelFiles) {
  const model = require(path.join(__dirname, file))(sequelize, Sequelize.DataTypes);
  db[model.name] = model;
}

// Wire up associations only after every model has been registered.
for (const modelName of Object.keys(db)) {
  if (db[modelName].associate) {
    db[modelName].associate(db);
  }
}

db.sequelize = sequelize;
db.Sequelize = Sequelize;

module.exports = db;
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
"start": "node app.js"
},
"dependencies": {
"puppeteer": "^5.2.1"
"chalk": "^4.1.0",
"puppeteer": "^5.2.1",
"sequelize": "^6.3.5",
"sqlite3": "^5.0.0"
}
}
32 changes: 32 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Crawler Tools

## Install yarn and sequelize-cli

```bash
[sudo] npm install yarn sequelize-cli -g
```

## Clone project

```bash
git clone https://github.com/DoraMatching/crawler-tools.git
```

## Install dependencies

```bash
cd crawler-tools
yarn install
```

## Migration

```bash
sequelize db:migrate
```

## Start crawling data

```bash
yarn start
```
Loading

0 comments on commit 2d459ad

Please sign in to comment.