-
Notifications
You must be signed in to change notification settings - Fork 0
/
Crawl.js
88 lines (78 loc) · 3.35 KB
/
Crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
module.exports = class Crawl {
constructor() {
this.Crawler = require('crawler')
this.wordArray = []
this.wordCount = 0
this.fs = require('fs')
}
random(length) {
var result = '';
var characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
var charactersLength = characters.length;
for (var i = 0; i < length; i++) {
result += characters.charAt(Math.floor(Math.random() * charactersLength));
}
return result;
}
async work(url, callback, time_sleep) {
let fs = require('fs')
let md5 = require('md5')
let newWordArray = this.wordArray
let newWordCount = this.wordCount
let c = new this.Crawler({
maxConnections: 1,
// This will be called for each crawled page
callback: async function (error, res, done) {
console.log('-------------Started crawl----------------')
console.log(newWordCount, ' => ', newWordArray[newWordCount])
let date = new Date()
let objResult = null
if (error) {
console.log(error)
fs.writeFileSync('./logs/' + date.getTime(), error + '\nbreak from line ' + newWordCount + ' from words.txt')
} else {
const cheerio = require('cheerio')
var $ = res.$
let US = null
let UK = null
let pos = null
// If have element .uk and .us
if ($('.entry-body').find('.uk').length > 0) {
//console.log('element is exists')
let UkObj = cheerio.load($(".uk").html())
UK = UkObj('.ipa').text()
// console.log(word, ' => ', UK)
}
if ($('.entry-body').find('.us').length > 0) {
let UsObj = cheerio.load($(".us").html())
US = UsObj('.ipa').text()
// console.log(word, '=>', US)
}
if ($('.entry-body').find('.pos').length > 0) {
let posObj = cheerio.load($('.pos').html())
pos = posObj.text()
console.log('pos => ', pos)
}
objResult = {
[newWordArray[newWordCount]]: {
uk: UK,
us: US,
pos: pos
}
}
console.log(objResult)
fs.writeFileSync('./files/' + md5(newWordArray[newWordCount]), JSON.stringify(objResult))
fs.writeFileSync('point.txt', newWordCount + '\n')
console.log('-------------Finished crawl----------------')
callback(objResult)
done()
let sleep = require('sleep')
sleep.sleep(time_sleep)
newWordCount++
c.queue(url + newWordArray[newWordCount] + '?q=' + newWordArray[newWordCount])
}
}
})
c.queue(url + newWordArray[newWordCount] + '?q=' + newWordArray[newWordCount])
}
}