-
Notifications
You must be signed in to change notification settings - Fork 13
/
spizer-cli.php
137 lines (111 loc) · 3.54 KB
/
spizer-cli.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
<?php
/**
* Spizer main executable file. The current implementation is a stopgap - in
* the final Spizer release, execution should instead be driven by a
* configuration file
*
* This file can be used as an example for running Spizer
*
* @todo Implement configuration file parsing based on Zend_Config for the
* final bundled runner
*
* @todo Consider other runners (GTK, web based, etc.)
*
* @package Spizer
* @subpackage Runner
* @author Shahar Evron, [email protected]
* @license Licensed under the Apache License 2.0, see COPYING for details
*/
set_include_path(dirname(__FILE__) . '/lib' . PATH_SEPARATOR . get_include_path());
require_once 'Zend/Console/Getopt.php';
require_once 'Spizer/Engine.php';
require_once 'Spizer/Handler/LinkAppender.php';
require_once 'Spizer/Handler/StringMatch.php';
require_once 'Spizer/Logger/Sqlite.php';
// Command line options accepted by this runner
$opts = new Zend_Console_Getopt(array(
    'delay|d=i'     => 'Delay between requests',
    'log|l=s'       => 'Log output file (defaults to spizerlog.sq3)',
    'savecookies|s' => 'Save and resend cookies throughout session',
    'help|h'        => 'Show this help text'
));

// Parse command line options
try {
    $opts->parse();
} catch (Zend_Console_Getopt_Exception $e) {
    fwrite(STDERR, "Error parsing command line options: {$e->getMessage()}\n");
    exit(1);
}

// If help, show usage and exit
if ($opts->h) {
    spizer_usage();
    exit(0);
}

$delay = (int) $opts->delay;

// Fall back to the default log file when none was given
$log = $opts->log;
if (! $log) {
    $log = 'spizerlog.sq3';
}

// Get URL - the first non-option argument. It is looked up with isset() so
// that running without any URL shows the usage text instead of first raising
// an "undefined offset" notice.
$args = $opts->getRemainingArgs();
$url = isset($args[0]) ? $args[0] : null;
if (! $url) {
    spizer_usage();
    exit(1);
}

// Instantiate Spizer engine
$spizer = new Spizer_Engine(array(
    'delay'       => $delay,
    'savecookies' => $opts->savecookies,
    'lifo'        => true
));

// Set logger
$logger = new Spizer_Logger_Sqlite(array('dbfile' => $log));
$spizer->setLogger($logger);

// Set the spider to follow links, hrefs, images and script references
$spizer->addHandler(new Spizer_Handler_LinkAppender(array(
    'domain' => parse_url($url, PHP_URL_HOST)
)));

// Add some handlers to be executed on 200 OK + text/html pages
$spizer->addHandler(new Spizer_Handler_StringMatch(array(
    'match'        => 'error',
    'matchcase'    => false,
    'status'       => 200,
    'content-type' => 'text/html')));

$spizer->addHandler(new Spizer_Handler_StringMatch(array(
    'match'        => 'warning',
    'matchcase'    => false,
    'status'       => 200,
    'content-type' => 'text/html')));

// If we have pcntl - set up handlers for the termination signals. This is
// done only now, after $spizer has been fully set up, because do_exit() reads
// the request counter from the global $spizer.
if (function_exists('pcntl_signal')) {
    declare(ticks = 1);

    // On PHP 7.0 and later declare(ticks) only applies to this file, so a
    // signal received while code from the included library files is running
    // would not be dispatched until control returns here. Asynchronous signal
    // dispatch (available since PHP 7.1) does not depend on ticks.
    if (function_exists('pcntl_async_signals')) {
        pcntl_async_signals(true);
    }

    pcntl_signal(SIGABRT, 'do_exit');
    pcntl_signal(SIGHUP, 'do_exit');
    pcntl_signal(SIGQUIT, 'do_exit');
    pcntl_signal(SIGINT, 'do_exit');
    pcntl_signal(SIGTERM, 'do_exit');
}

// Go!
$spizer->run($url);

// Report the number of requests and terminate
do_exit();
// -- end here --
// Some functions
/**
 * Print the command line usage text to standard output.
 *
 * The script name shown in the "Usage:" line is taken from the first entry of
 * $_SERVER['argv']; if that entry is not available a fixed name is used
 * instead. No trailing newline is printed after the last line.
 *
 * @return void
 */
function spizer_usage()
{
    // $argv is not visible inside a function, so read the script name from
    // $_SERVER and guard against it being absent.
    $script = isset($_SERVER['argv'][0]) ? $_SERVER['argv'][0] : 'spizer-cli.php';

    // The --log description matches the option definition passed to
    // Zend_Console_Getopt: the log is always written to a file.
    echo <<<USAGE
Spizer - the flexible web spider, v. 0.1
Usage: {$script} [options] <Start URL>
Where [options] can be:
--delay | -d <seconds> Number of seconds to delay between requests
--log | -l <log file> Log output file (defaults to spizerlog.sq3)
--savecookies | -s Save and resend cookies throughout session
--help | -h Show this help message
USAGE;
}
/**
 * Report how many HTTP requests were performed and terminate the script.
 *
 * Reads the request counter from the global $spizer engine, writes a one-line
 * summary to php://stdout and exits with status 0. The function is called at
 * the end of a normal run and is also registered as a pcntl signal handler;
 * any arguments passed by the signal dispatcher are ignored.
 *
 * @return void This function does not return - it always calls exit()
 */
function do_exit()
{
    global $spizer;

    // Guard against being invoked when no engine has been created (for
    // example from a signal handler early during start-up): report zero
    // requests instead of failing with a call on a non-object.
    $c = 0;
    if ($spizer instanceof Spizer_Engine) {
        $c = $spizer->getRequestCounter();
    }

    // Note: unsetting a variable imported with "global" only removes the
    // local alias; the global $spizer itself is released when the script
    // terminates below.
    unset($spizer);

    file_put_contents('php://stdout', "Spizer preformed a total of $c HTTP requests.\n");
    exit(0);
}