-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCrawler.php
137 lines (107 loc) · 3.79 KB
/
Crawler.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
<?php
require_once 'Queue.php';
/**
 * Simple queue-driven web crawler.
 *
 * Starting from a single URL, the crawler pops URLs off a Queue, fetches each
 * one with a Zend_Http_Client, extracts the <a> links of textual responses,
 * pushes the crawlable ones back into the queue and finally hands the
 * response to every registered task.
 *
 * NOTE(review): this class does not track which URLs it has already visited;
 * de-duplication is presumably handled by Queue - confirm against Queue.php.
 */
class Crawler
{
    /**
     * Queue of URLs that are still waiting to be crawled.
     *
     * @var Queue
     */
    protected $queue;

    /**
     * When true, links that point at a host other than $domain are skipped.
     *
     * @var bool
     */
    protected $stayOnDomain = true;

    /**
     * Tasks that are run against every crawled (textual) response.
     *
     * @var CrawlTaskInterface[]
     */
    protected $tasks = array();

    /**
     * When truthy, progress information is echoed while crawling.
     *
     * @var mixed
     */
    protected $debugMode;

    /**
     * Host component of the URL the crawl was started from.
     *
     * @var string|null
     */
    protected $domain;

    /**
     * Maximum number of URLs to fetch, or null for no limit.
     *
     * @var int|null
     */
    protected $limit;

    /**
     * Number of URLs fetched so far (accumulates across calls to run()).
     *
     * @var int
     */
    protected $counter = 0;

    /**
     * @var Zend_Http_Response
     */
    protected $currentResponse;

    /**
     * @var Zend_Http_Client
     */
    protected $client;

    /**
     * Creates a crawler seeded with a single start URL.
     *
     * @param string $startFrom URL to start crawling from; its host becomes
     *                          the domain the crawl is restricted to.
     */
    public function __construct($startFrom)
    {
        $this->queue = new Queue();
        $this->queue->push($startFrom);
        $this->client = new Zend_Http_Client();
        $this->domain = parse_url($startFrom, PHP_URL_HOST);
    }

    /**
     * Registers a task to be run against every crawled page.
     *
     * @param CrawlTaskInterface $crawlTask
     * @return void
     */
    public function registerTask(CrawlTaskInterface $crawlTask)
    {
        $this->tasks[] = $crawlTask;
    }

    /**
     * Switches debug output on or off.
     *
     * @param mixed $val Truthy to echo progress information during run().
     * @return void
     */
    public function setDebugMode($val)
    {
        $this->debugMode = $val;
    }

    /**
     * Replaces the queue of URLs to crawl.
     *
     * @param Queue $queue
     * @return void
     */
    public function setQueue(Queue $queue)
    {
        $this->queue = $queue;
    }

    /**
     * Returns the queue of URLs to crawl.
     *
     * @return Queue
     */
    public function getQueue()
    {
        return $this->queue;
    }

    /**
     * Sets the maximum number of URLs to fetch.
     *
     * @param int $num
     * @return void
     */
    public function setLimit($num)
    {
        $this->limit = $num;
    }

    /**
     * Runs the crawl until the queue is empty or the limit is reached, then
     * calls the shutdown hook of every registered task.
     *
     * Exceptions thrown by the HTTP client are not caught here and abort the
     * crawl.
     *
     * @return void
     */
    public function run()
    {
        if ($this->debugMode) {
            echo "Restricting crawl to $this->domain\n";
        }

        //loop across available items in the queue of pages to crawl
        while (!$this->queue->isEmpty()) {
            if (isset($this->limit) && ($this->counter >= $this->limit)) {
                break;
            }
            $this->counter++;

            //get a new url to crawl
            $url = $this->queue->pop();
            if ($this->debugMode) {
                echo "Queue Length: " . $this->queue->queueLength() . "\n";
                echo "Crawling " . $url . "\n";
            }

            //fetch the page
            $this->client->setUri($url);
            $this->currentResponse = $this->client->request();

            //don't bother trying to parse this if it's not text
            if (!$this->isTextResponse()) {
                continue;
            }

            //search for <a> tags in the document
            $body = $this->currentResponse->getBody();
            $linksQuery = new Zend_Dom_Query($body);
            $links = $linksQuery->query('a');
            if ($this->debugMode) {
                echo "\tFound " . count($links) . " links...\n";
            }

            foreach ($links as $link) {
                //turn the href into an absolute, crawlable URL (or skip it)
                $target = $this->resolveHref($link->getAttribute('href'), $url);
                if ($target !== null) {
                    $this->queue->push($target);
                }
            }

            //for each page that we see, run every registered task across it
            foreach ($this->tasks as $task) {
                $task->task($this->currentResponse, $this->client);
            }
        }

        //after we're done with everything, call the shutdown hook on all the tasks
        $this->shutdownTasks();
    }

    /**
     * Calls the shutdown hook of every registered task.
     *
     * @return void
     */
    public function shutdownTasks()
    {
        foreach ($this->tasks as $task) {
            $task->shutdown();
        }
    }

    /**
     * Tells whether the current response declares a textual content type.
     *
     * The header value is normalised to a string first so that a missing
     * header or a repeated header (returned as an array) cannot trip up
     * stripos().
     *
     * @return bool
     */
    protected function isTextResponse()
    {
        $contentType = $this->currentResponse->getHeader('Content-type');
        if (is_array($contentType)) {
            $contentType = implode(', ', $contentType);
        }

        return stripos((string) $contentType, 'text') !== false;
    }

    /**
     * Converts the href of a link into an absolute URL that can be crawled.
     *
     * Returns null for links that must not be queued: empty or fragment-only
     * hrefs, unparsable hrefs, schemes other than http/https (mailto:,
     * javascript:, ...) and, while $stayOnDomain is set, links to other hosts.
     * Fragments are dropped because they never reach the server.
     *
     * @param string      $href    Raw href attribute value.
     * @param string|null $baseUrl URL of the page the link was found on; used
     *                             to resolve relative links. Falls back to
     *                             http://<domain>/ when it has no host.
     * @return string|null Absolute URL, or null when the link is skipped.
     */
    protected function resolveHref($href, $baseUrl = null)
    {
        $href = trim((string) $href);
        $fragmentAt = strpos($href, '#');
        if ($fragmentAt !== false) {
            $href = (string) substr($href, 0, $fragmentAt);
        }
        if ($href === '') {
            return null;
        }

        $parts = parse_url($href);
        if (!is_array($parts)) {
            return null;
        }

        if (isset($parts['scheme'])) {
            $scheme = strtolower($parts['scheme']);
            //only http(s) URLs with a host can be fetched by the HTTP client
            if (($scheme !== 'http' && $scheme !== 'https') || !isset($parts['host'])) {
                return null;
            }
        }

        $base = parse_url((string) $baseUrl);
        if (!is_array($base)) {
            $base = array();
        }
        $baseScheme = isset($base['scheme']) ? $base['scheme'] : 'http';

        if (isset($parts['host'])) {
            if ($this->isOffDomain($parts['host'])) {
                return null;
            }
            //scheme-relative links ("//host/path") inherit the page's scheme
            return isset($parts['scheme']) ? $href : $baseScheme . ':' . $href;
        }

        //relative link: rebuild it on top of the page it was found on
        $host = isset($base['host']) ? $base['host'] : (string) $this->domain;
        if ($this->isOffDomain($host)) {
            return null;
        }
        $authority = $baseScheme . '://' . $host;
        if (isset($base['host']) && isset($base['port'])) {
            $authority .= ':' . $base['port'];
        }

        $basePath = (isset($base['path']) && strpos($base['path'], '/') === 0) ? $base['path'] : '/';
        if (!isset($parts['path']) || $parts['path'] === '') {
            //query-only link such as "?page=2" keeps the page's own path
            $path = $basePath;
        } elseif ($parts['path'][0] === '/') {
            $path = $parts['path'];
        } else {
            //path-relative link: resolve against the page's directory
            $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $parts['path'];
        }

        $resolved = $authority . $this->removeDotSegments($path);
        if (isset($parts['query'])) {
            $resolved .= '?' . $parts['query'];
        }

        return $resolved;
    }

    /**
     * Tells whether a host is excluded by the stay-on-domain restriction.
     *
     * Host names are compared case-insensitively.
     *
     * @param string $host
     * @return bool
     */
    protected function isOffDomain($host)
    {
        return $this->stayOnDomain
            && strcasecmp((string) $host, (string) $this->domain) !== 0;
    }

    /**
     * Collapses "." and ".." segments of an absolute URL path.
     *
     * @param string $path Path starting with "/".
     * @return string
     */
    protected function removeDotSegments($path)
    {
        $segments = explode('/', $path);
        $last = count($segments) - 1;
        $kept = array();

        foreach ($segments as $index => $segment) {
            if ($segment === '.' || $segment === '..') {
                //never pop the leading empty segment that stands for the root "/"
                if ($segment === '..' && count($kept) > 1) {
                    array_pop($kept);
                }
                //a trailing dot segment refers to a directory: keep the final "/"
                if ($index === $last) {
                    $kept[] = '';
                }
                continue;
            }
            $kept[] = $segment;
        }

        return implode('/', $kept);
    }
}