-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathscraper.js
83 lines (66 loc) · 2.41 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
var request = require('request');
/**
 * Runs a global regex over `text` and returns the first capture group of
 * every match, in document order.
 *
 * @param {RegExp} pattern - Must carry the `g` flag so exec() advances.
 * @param {string} text - Text to scan.
 * @returns {string[]} Captured group 1 of each match.
 */
function captureAll(pattern, text) {
    var found = [];
    var match;
    pattern.lastIndex = 0;
    while ((match = pattern.exec(text)) !== null) {
        found.push(match[1]);
    }
    return found;
}

/**
 * Extracts the IP, port and type columns from one proxy-list page.
 *
 * The page hides decoy digits inside each IP cell: spans whose CSS class is
 * declared `display:none` in an inline <style>, plus spans/divs carrying an
 * inline `display:none` style. Those are removed before tags are stripped.
 *
 * @param {string} body - Raw HTML of the page.
 * @returns {{ips: string[], ports: string[], types: string[]}} Parallel
 *     column arrays; callers must verify that the lengths agree.
 */
function parseProxyPage(body) {
    // Prototype-less map: with a plain {} a class named "toString" or
    // "constructor" would look like a decoy and its octet would be dropped.
    var decoyClasses = Object.create(null);
    captureAll(/\.(.*?)\{display\:none\}/g, body).forEach(function (className) {
        decoyClasses[className] = true;
    });

    var types = captureAll(/<td>([\S\s]*?)<\/td>/g, body)
        .map(function (cell) {
            return cell.trim();
        })
        .filter(function (cell) {
            return cell === "HTTP" || cell === "HTTPS" || cell === "socks4/5";
        });

    // Port cells contain line breaks, so match them on a whitespace-free copy.
    var ports = captureAll(/<td>([0-9]+)<\/td>/g, body.replace(/\s/g, ''));

    var ips = captureAll(/\{display:inline\}[\S\s]?<\/style>([\S\s]*?)<\/td>/g, body)
        .map(function (cell) {
            return cell
                .replace(/<span class\=\"(.*?)\">.*?<\/span>/g, function (whole, className) {
                    return decoyClasses[className] ? '' : whole;
                })
                .replace(/<span style\=\"display\:none\">(.*?)<\/span>/g, "")
                .replace(/<div style\=\"display\:none\">(.*?)<\/div>/g, "")
                .replace(/<(.*?)>/g, '')
                .trim();
        });

    return {ips: ips, ports: ports, types: types};
}

/**
 * Scrapes HTTP/HTTPS proxies from the paginated hidemyass proxy list.
 *
 * Pages are fetched one after another, starting at `pageNum`, until a page
 * yields no IP cells; the accumulated map is then handed to `callback`.
 *
 * @param {function(*, Object=)} callback - Called once. On success with
 *     `(null, proxies)` where `proxies` maps IP string to port string. On
 *     failure with a single error argument: the transport error reported by
 *     `request`, or a message string for a non-200 response or a page whose
 *     columns could not be parsed consistently.
 * @param {number} [pageNum=1] - Page to start from.
 * @param {Object} [proxiesScraped={}] - Accumulator shared across pages;
 *     entries are added to it in place.
 */
var getProxies = function (callback, pageNum, proxiesScraped) {
    if (!proxiesScraped) {
        proxiesScraped = {};
    }
    if (!pageNum) {
        pageNum = 1;
    }
    request('http://www.hidemyass.com/proxy-list/' + pageNum, function (err, res, body) {
        if (err) {
            // Surface the real transport failure instead of masking it as a
            // status-code problem.
            callback(err);
            return;
        }
        if (!res || res.statusCode !== 200) {
            callback("Response code was not 200");
            return;
        }

        var page = parseProxyPage(String(body || ''));
        var ips = page.ips;
        var ports = page.ports;
        var types = page.types;

        if (ips.length === 0) {
            // A page without any IP cell marks the end of the list.
            callback(null, proxiesScraped);
            return;
        }
        if (ports.length === 0 || ports.length !== ips.length || ips.length !== types.length) {
            callback("Regex parsing has failed.");
            return;
        }

        var count = 0;
        for (var i = 0; i < ips.length; i++) {
            // SOCKS entries are parsed (to keep the columns aligned) but skipped.
            if (types[i] === 'HTTP' || types[i] === 'HTTPS') {
                count++;
                proxiesScraped[ips[i]] = ports[i];
            }
        }
        console.log('collected ' + count + ' http proxies from page ' + pageNum);
        getProxies(callback, pageNum + 1, proxiesScraped);
    });
};
// Public API of this module: getProxies is the only export.
module.exports = {getProxies: getProxies};