From ab3c6553832d7023573c20eaee0f1e304b8a7fac Mon Sep 17 00:00:00 2001 From: seatle Date: Wed, 18 Jul 2018 19:28:55 +0700 Subject: [PATCH] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E7=BC=93=E5=AD=98=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E9=87=8F=E5=A4=AA=E5=A4=A7=E6=B8=85=E7=A9=BAredis?= =?UTF-8?q?=E7=9A=84=E6=80=A7=E8=83=BD=E9=97=AE=E9=A2=98=EF=BC=8C=E8=A7=A3?= =?UTF-8?q?=E5=86=B3url=E5=8C=B9=E9=85=8D=E4=B8=8D=E5=A4=9F=E5=AE=8C?= =?UTF-8?q?=E7=BE=8E=E9=97=AE=E9=A2=98=EF=BC=8C=E5=A2=9E=E5=8A=A0redis?= =?UTF-8?q?=E9=9B=86=E5=90=88=EF=BC=8C=E5=8F=AF=E4=BB=A5=E9=9A=8F=E6=9C=BA?= =?UTF-8?q?=E8=AF=BB=E5=8F=96url=E8=BF=9B=E8=A1=8C=E9=87=87=E9=9B=86?= =?UTF-8?q?=EF=BC=8C=E8=BF=9B=E8=A1=8C=E5=A4=9A=E5=9F=9F=E5=B9=B6=E5=8F=91?= =?UTF-8?q?=EF=BC=8C=E7=BC=A9=E7=9F=ADredis=E5=89=8D=E7=BC=80=EF=BC=8C?= =?UTF-8?q?=E5=87=8F=E5=B0=91redis=E5=8D=A0=E7=94=A8=E7=A9=BA=E9=97=B4?= =?UTF-8?q?=EF=BC=8C=E5=A2=9E=E5=8A=A0=E8=AE=B0=E5=BD=95=E9=87=87=E9=9B=86?= =?UTF-8?q?=E6=85=A2=E7=9A=84URL=EF=BC=8C=E5=A2=9E=E5=8A=A0=E6=B3=9B?= =?UTF-8?q?=E5=9F=9F=E5=90=8D=E9=87=87=E9=9B=86=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E6=AF=8F=E4=B8=AA=E8=BF=9B=E7=A8=8B=E9=87=87=E9=9B=86=E5=9F=9F?= =?UTF-8?q?=E5=90=8D=E6=95=B0=E9=87=8F=E9=85=8D=E7=BD=AE=EF=BC=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/phpspider.php | 1260 ++++++++++++++++++++++++++++++++---------- core/queue.php | 150 ++++- core/requests.php | 189 ++++--- demo/qiushibaike.php | 18 +- 4 files changed, 1236 insertions(+), 381 deletions(-) diff --git a/core/phpspider.php b/core/phpspider.php index bff6748..b9b7fd5 100755 --- a/core/phpspider.php +++ b/core/phpspider.php @@ -11,26 +11,31 @@ //---------------------------------- // PHPSpider核心类文件 +// *********** +// 泛域名抓取优化版 BY KEN a-site@foxmail.com +// *********** +// * 泛域名设置:domain = array('*') +// * 增加子域名数量限制 $max_sub_num = 100 //---------------------------------- namespace phpspider\core; -require_once __DIR__ . '/constants.php'; +require_once __DIR__.'/constants.php'; +use Exception; +use phpspider\core\db; +use phpspider\core\log; +use phpspider\core\queue; use phpspider\core\requests; use phpspider\core\selector; -use phpspider\core\queue; -use phpspider\core\db; use phpspider\core\util; -use phpspider\core\log; -use Exception; // 启动的时候生成data目录 util::path_exists(PATH_DATA); -util::path_exists(PATH_DATA."/lock"); -util::path_exists(PATH_DATA."/log"); -util::path_exists(PATH_DATA."/cache"); -util::path_exists(PATH_DATA."/status"); +util::path_exists(PATH_DATA.'/lock'); +util::path_exists(PATH_DATA.'/log'); +util::path_exists(PATH_DATA.'/cache'); +util::path_exists(PATH_DATA.'/status'); class phpspider { @@ -38,12 +43,12 @@ class phpspider * 版本号 * @var string */ - const VERSION = '2.1.4'; + const VERSION = '2.1.5'; /** * 爬虫爬取每个网页的时间间隔,0表示不延时, 单位: 毫秒 */ - const INTERVAL = 0; + const INTERVAL = 100; /** * 爬虫爬取每个网页的超时时间, 单位: 秒 @@ -56,12 +61,12 @@ class phpspider const MAX_TRY = 0; /** - * 爬虫爬取网页所使用的浏览器类型: pc、ios、android + * 爬虫爬取网页所使用的浏览器类型: pc/Mac、ios、android * 默认类型是PC */ - const AGENT_PC = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"; - const AGENT_IOS = "Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1"; - const AGENT_ANDROID = "Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S"; + const AGENT_PC = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'; + const AGENT_IOS = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1'; + const AGENT_ANDROID = 'Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S'; /** * pid文件的路径及名称 @@ -195,7 +200,49 @@ class phpspider public static $fields_num = 0; /** - * 采集深度 + * 【KEN】提取到的页面数按域名计数容器 结构为 domain => number + */ + public static $pages_num = array(); + + /** + * 【KEN】单域名允许抓取的最大页面数,0为不限制 + */ + public static $max_pages = 0; + + /** + * 【KEN】花费的抓取时长计数容器 结构为 domain => number + */ + public static $duration = array(); + + /** + * 【KEN】单域名允许抓取的最大时长,单位秒,0为不限制 + */ + public static $max_duration = 0; + + /** + * 【KEN】单域名最大子域名发现数量 防止掉进蜘蛛池,推荐值:3000(多数大型网站上限) + */ + public static $max_sub_num = 3000; //建议值 3000 + + /** + * 【KEN】子进程未获取任务,超时退出前,等待计时器 + */ + + public static $stand_by_time = 0; + + /** + * 【KEN】子进程未获取任务,超时退出前,最大等待时长/秒,全部任务束后,子进程将会等待的时间,以便有缓冲时间,获得新的任务 + */ + public static $max_stand_by_time = 60; //建议值 60 + + /** + * 【KEN】每个主机并发上限,降低对方网站流量压力和减少被阻挡概率,建议值 6 ,须与 queue_order = rand 一起使用 + */ + public static $max_task_per_host = 0; //0值和非0值会使用不同类型的队列缓存库,从0改为非0值或从非0值改为0需清空队列缓存库再运行,否则任务无法添加 + public static $task_per_host_counter = array(); //计数容器 + + /** + * 采集深度 */ public static $depth_num = 0; @@ -205,14 +252,14 @@ class phpspider public static $time_start = 0; /** - * 任务状态 + * 任务状态 */ public static $task_status = array(); // 导出类型配置 - public static $export_type = ''; - public static $export_file = ''; - public static $export_conf = ''; + public static $export_type = ''; + public static $export_file = ''; + public static $export_conf = ''; public static $export_table = ''; // 数据库配置 @@ -221,13 +268,13 @@ class phpspider public static $queue_config = array(); // 运行面板参数长度 - public static $server_length = 10; + public static $server_length = 10; public static $tasknum_length = 8; - public static $taskid_length = 8; - public static $pid_length = 8; - public static $mem_length = 8; - public static $urls_length = 15; - public static $speed_length = 6; + public static $taskid_length = 8; + public static $pid_length = 8; + public static $mem_length = 8; + public static $urls_length = 15; + public static $speed_length = 6; /** * 爬虫初始化时调用, 用来指定一些爬取前的操作 @@ -349,7 +396,7 @@ class phpspider */ public $on_attachment_file = null; - function __construct($configs = array()) + public function __construct($configs = array()) { // 产生时钟云,解决php7下面ctrl+c无法停止bug declare(ticks = 1); @@ -369,16 +416,31 @@ function __construct($configs = array()) exit; } - $configs['name'] = isset($configs['name']) ? $configs['name'] : 'phpspider'; - $configs['proxy'] = isset($configs['proxy']) ? $configs['proxy'] : false; - $configs['user_agent'] = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC; - $configs['client_ip'] = isset($configs['client_ip']) ? $configs['client_ip'] : array(); - $configs['interval'] = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL; - $configs['timeout'] = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT; - $configs['max_try'] = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY; - $configs['max_depth'] = isset($configs['max_depth']) ? $configs['max_depth'] : 0; - $configs['max_fields'] = isset($configs['max_fields']) ? $configs['max_fields'] : 0; - $configs['export'] = isset($configs['export']) ? $configs['export'] : array(); + $configs['name'] = isset($configs['name']) ? $configs['name'] : 'phpspider'; + $configs['proxy'] = isset($configs['proxy']) ? $configs['proxy'] : false; + $configs['user_agent'] = isset($configs['user_agent']) ? $configs['user_agent'] : self::AGENT_PC; + $configs['client_ip'] = isset($configs['client_ip']) ? $configs['client_ip'] : array(); + $configs['interval'] = isset($configs['interval']) ? $configs['interval'] : self::INTERVAL; + $configs['timeout'] = isset($configs['timeout']) ? $configs['timeout'] : self::TIMEOUT; + $configs['max_try'] = isset($configs['max_try']) ? $configs['max_try'] : self::MAX_TRY; + $configs['max_depth'] = isset($configs['max_depth']) ? $configs['max_depth'] : 0; + $configs['max_fields'] = isset($configs['max_fields']) ? $configs['max_fields'] : 0; + $configs['export'] = isset($configs['export']) ? $configs['export'] : array(); + //新增参数 BY KEN + $configs['max_pages'] = isset($configs['max_pages']) ? $configs['max_pages'] : self::$max_pages; + $configs['max_duration'] = isset($configs['max_duration']) ? $configs['max_duration'] : self::$max_duration; + $configs['max_sub_num'] = isset($configs['max_sub_num']) ? $configs['max_sub_num'] : self::$max_sub_num; + $configs['max_stand_by_time'] = isset($configs['max_stand_by_time']) ? $configs['max_stand_by_time'] : self::$max_stand_by_time; + $configs['max_task_per_host'] = isset($configs['max_task_per_host']) ? $configs['max_task_per_host'] : self::$max_task_per_host; + //启用 host并发上限时,队列参数强制为随机 + if ($configs['max_task_per_host'] > 0) + { + $configs['queue_order'] = 'rand'; + } + else + { + $configs['queue_order'] = isset($configs['queue_order']) ? $configs['queue_order'] : 'list'; + } // csv、sql、db self::$export_type = isset($configs['export']['type']) ? $configs['export']['type'] : ''; @@ -411,12 +473,12 @@ function __construct($configs = array()) self::$serverid = $configs['serverid']; } - // 不同项目的采集以采集名称作为前缀区分 - if (isset(self::$queue_config['prefix'])) + // 不同项目的采集以采集名称作为前缀区分 缩短 spider name md5长度到4位,减少内存占用 + if (isset(self::$queue_config['prefix'])) { - self::$queue_config['prefix'] = self::$queue_config['prefix'].'-'.md5($configs['name']); + self::$queue_config['prefix'] = self::$queue_config['prefix'].'-'.substr(md5($configs['name']), 0, 4); } - + self::$configs = $configs; } @@ -429,40 +491,51 @@ public function add_scan_url($url, $options = array(), $allowed_repeat = true) { // 投递状态 $status = false; + //限制最大子域名数量 + if ( ! empty(self::$configs['max_sub_num'])) + { + //抓取到的子域名超过指定数量,就丢掉此域名 + $sub_domain_count = $this->sub_domain_count($url); + if ($sub_domain_count > self::$configs['max_sub_num']) + { + log::debug('Task('.self::$taskid.') subdomin = '.$sub_domain_count.' more than '.self::$configs['max_sub_num'].",add_scan_url $url [Skip]"); + return $status; + } + } - $link = $options; - $link['url'] = $url; + $link = $options; + $link['url'] = $url; $link['url_type'] = 'scan_page'; - $link = $this->link_uncompress($link); + $link = $this->link_uncompress($link); - if ($this->is_list_page($url)) + if ($this->is_content_page($url)) { - $link['url_type'] = 'list_page'; - $status = $this->queue_lpush($link, $allowed_repeat); + $link['url_type'] = 'content_page'; + $status = $this->queue_lpush($link, $allowed_repeat); } - elseif ($this->is_content_page($url)) + elseif ($this->is_list_page($url)) { - $link['url_type'] = 'content_page'; - $status = $this->queue_lpush($link, $allowed_repeat); + $link['url_type'] = 'list_page'; + $status = $this->queue_lpush($link, $allowed_repeat); } else { $status = $this->queue_lpush($link, $allowed_repeat); } - if ($status) + if ($status) { - if ($link['url_type'] == 'scan_page') + if ($link['url_type'] == 'scan_page') { log::debug("Find scan page: {$url}"); } - elseif ($link['url_type'] == 'list_page') + elseif ($link['url_type'] == 'content_page') { - log::debug("Find list page: {$url}"); + log::debug("Find content page: {$url}"); } - elseif ($link['url_type'] == 'content_page') + elseif ($link['url_type'] == 'list_page') { - log::debug("Find content page: {$url}"); + log::debug("Find list page: {$url}"); } } @@ -483,37 +556,47 @@ public function add_url($url, $options = array(), $depth = 0) { // 投递状态 $status = false; - - $link = $options; - $link['url'] = $url; - $link['depth'] = $depth; - $link = $this->link_uncompress($link); - - if ($this->is_list_page($url)) + //限制最大子域名数量 + if ( ! empty(self::$configs['max_sub_num'])) { - $link['url_type'] = 'list_page'; - $status = $this->queue_lpush($link); + //抓取超过 max_sub_num 子域名的,就丢掉 + $sub_domain_count = $this->sub_domain_count($url); + if ($sub_domain_count > self::$configs['max_sub_num']) + { + log::debug('Task('.self::$taskid.') subdomin = '.$sub_domain_count.' more than '.self::$configs['max_sub_num'].",add_url $url [Skip]"); + //echo '[on_download_page] ' . $domain . "'s subdomin > 1000 ,Skip!\n"; + return $status; + } } + $link = $options; + $link['url'] = $url; + $link['depth'] = $depth; + $link = $this->link_uncompress($link); if ($this->is_content_page($url)) { $link['url_type'] = 'content_page'; - $status = $this->queue_lpush($link); + $status = $this->queue_lpush($link); + } + elseif ($this->is_list_page($url)) + { + $link['url_type'] = 'list_page'; + $status = $this->queue_lpush($link); } - if ($status) + if ($status) { - if ($link['url_type'] == 'scan_page') + if ($link['url_type'] == 'scan_page') { log::debug("Find scan page: {$url}"); } - elseif ($link['url_type'] == 'list_page') + elseif ($link['url_type'] == 'content_page') { - log::debug("Find list page: {$url}"); + log::debug("Find content page: {$url}"); } - elseif ($link['url_type'] == 'content_page') + elseif ($link['url_type'] == 'list_page') { - log::debug("Find content page: {$url}"); + log::debug("Find list page: {$url}"); } } @@ -531,7 +614,13 @@ public function add_url($url, $options = array(), $depth = 0) public function is_scan_page($url) { $parse_url = parse_url($url); - if (empty($parse_url['host']) || !in_array($parse_url['host'], self::$configs['domains'])) + //2018-1-3 通配所有域名 + if ( ! empty($parse_url['host']) and self::$configs['domains'][0] == '*') + { + return true; + } + //限定域名 + if (empty($parse_url['host']) || ! in_array($parse_url['host'], self::$configs['domains'])) { return false; } @@ -549,7 +638,37 @@ public function is_scan_page($url) public function is_list_page($url) { $result = false; - if (!empty(self::$configs['list_url_regexes'])) + //过滤下载类型文件 20180209 + if (preg_match('/\.(zip|7z|cab|rar|iso|gho|jar|ace|tar|gz|bz2|z|xml|pdf|doc|txt|rtf|snd|xls|xlsx|docx|apk|ipa|flv|midi|mps|pls|pps|ppa|pwz|mp3|mp4|mpeg|mpe|asf|asx|mpg|3gp|mov|m4v|mkv|vob|vod|mod|ogg|rm|rmvb|wmv|avi|dat|exe|wps|js|css|bmp|jpg|png|gif|ico|tiff|jpeg|svg|webp|mpa|mdb|bin)$/iu', $url)) + { + return false; + } + + //增加 要排除的列表页特征正则 BY KEN + if ( ! empty(self::$configs['list_url_regexes_remove'])) + { + foreach (self::$configs['list_url_regexes_remove'] as $regex) + { + if (preg_match("#{$regex}#i", $url)) + { + return false; + } + } + } + + //增加无列表页选项,即所有页面都要抓取内容,包含列表页 + if (empty(self::$configs['list_url_regexes']) or self::$configs['list_url_regexes'][0] == 'x') + { + return false; + } + + //增加泛列表页,即所有页面都是列表页,只抓取链接,不抓取内容 + if (self::$configs['list_url_regexes'][0] == '*') + { + return true; + } + + if ( ! empty(self::$configs['list_url_regexes'])) { foreach (self::$configs['list_url_regexes'] as $regex) { @@ -574,7 +693,36 @@ public function is_list_page($url) public function is_content_page($url) { $result = false; - if (!empty(self::$configs['content_url_regexes'])) + //过滤下载类型文件 20180209 + if (preg_match('/\.(zip|7z|cab|rar|iso|gho|jar|ace|tar|gz|bz2|z|xml|pdf|doc|txt|rtf|snd|xls|xlsx|docx|apk|ipa|flv|midi|mps|pls|pps|ppa|pwz|mp3|mp4|mpeg|mpe|asf|asx|mpg|3gp|mov|m4v|mkv|vob|vod|mod|ogg|rm|rmvb|wmv|avi|dat|exe|wps|js|css|bmp|jpg|png|gif|ico|tiff|jpeg|svg|webp|mpa|mdb|bin)$/iu', $url)) + { + return false; + } + + //增加 要排除的内容页特征正则 BY KEN + if ( ! empty(self::$configs['content_url_regexes_remove'])) + { + foreach (self::$configs['content_url_regexes_remove'] as $regex) + { + if (preg_match("#{$regex}#i", $url)) + { + return false; + } + } + } + + //增加泛内容模式,即所有页面都要提取内容 + if (empty(self::$configs['content_url_regexes']) or self::$configs['content_url_regexes'][0] == '*') + { + return true; + } + //无内容,泛列表模式,即所有页面都不提取内容 + if (self::$configs['content_url_regexes'][0] == 'x') + { + return false; + } + + if ( ! empty(self::$configs['content_url_regexes'])) { foreach (self::$configs['content_url_regexes'] as $regex) { @@ -649,16 +797,17 @@ public function parse_command() */ public function signal_handler($signal) { - switch ($signal) { + switch ($signal) + { // Stop. - case SIGINT: - log::warn("Program stopping..."); - self::$terminate = true; - break; + case SIGINT: + log::warn('Program stopping...'); + self::$terminate = true; + break; // Show status. - case SIGUSR2: - echo "show status\n"; - break; + case SIGUSR2: + echo "show status\n"; + break; } } @@ -709,15 +858,15 @@ protected static function daemonize() } if (-1 === posix_setsid()) { - throw new Exception("setsid fail"); + throw new Exception('setsid fail'); } // Fork again avoid SVR4 system regain the control of terminal. $pid = pcntl_fork(); - if (-1 === $pid) + if (-1 === $pid) { - throw new Exception("fork fail"); - } - elseif (0 !== $pid) + throw new Exception('fork fail'); + } + elseif (0 !== $pid) { exit(0); } @@ -755,13 +904,13 @@ public function check_terminate() $all_stop = false; } } - if ($all_stop) + if ($all_stop) { break; } - else + else { - log::warn("Task stop waiting..."); + log::warn('Task stop waiting...'); } sleep(1); } @@ -789,7 +938,7 @@ public function start() // 当前任务ID self::$taskid = 1; // 当前任务进程ID - self::$taskpid = function_exists('posix_getpid') ? posix_getpid() : 1; + self::$taskpid = function_exists('posix_getpid') ? posix_getpid() : 1; self::$collect_succ = 0; self::$collect_fail = 0; @@ -807,47 +956,47 @@ public function start() // 检查CURL扩展 if(!function_exists('curl_init')) { - log::error("The curl extension was not found"); + log::error('The curl extension was not found'); exit; } // 多任务需要pcntl扩展支持 if (self::$tasknum > 1 && !function_exists('pcntl_fork')) { - log::error("Multitasking needs pcntl, the pcntl extension was not found"); + log::error('Multitasking needs pcntl, the pcntl extension was not found'); exit; } // 守护进程需要pcntl扩展支持 if (self::$daemonize && !function_exists('pcntl_fork')) { - log::error("Daemonize needs pcntl, the pcntl extension was not found"); + log::error('Daemonize needs pcntl, the pcntl extension was not found'); exit; } // 集群、保存运行状态、多任务都需要Redis支持 - if (self::$multiserver || self::$save_running_state || self::$tasknum > 1) + if ( self::$multiserver || self::$save_running_state || self::$tasknum > 1 ) { self::$use_redis = true; queue::set_connect('default', self::$queue_config); if (!queue::init()) { - if (self::$multiserver) + if ( self::$multiserver ) { - log::error("Multiserver needs Redis support, ".queue::$error); + log::error('Multiserver needs Redis support, '.queue::$error); exit; } - if (self::$tasknum > 1) + if ( self::$tasknum > 1 ) { - log::error("Multitasking needs Redis support, ".queue::$error); + log::error('Multitasking needs Redis support, '.queue::$error); exit; } - if (self::$save_running_state) + if ( self::$save_running_state ) { - log::error("Spider kept running state needs Redis support, ".queue::$error); + log::error('Spider kept running state needs Redis support, '.queue::$error); exit; } } @@ -862,7 +1011,7 @@ public function start() // 检查 scan_urls if (empty(self::$configs['scan_urls'])) { - log::error("No scan url to start"); + log::error('No scan url to start'); exit; } @@ -879,8 +1028,8 @@ public function start() // windows 下没法显示面板, 强制显示日志 if (util::is_win()) { - self::$configs['name'] = iconv("UTF-8", "GB2312//IGNORE", self::$configs['name']); - log::$log_show = true; + self::$configs['name'] = iconv('UTF-8', 'GB2312//IGNORE', self::$configs['name']); + log::$log_show = true; } // 守护进程下也显示日志 elseif (self::$daemonize) @@ -897,14 +1046,22 @@ public function start() global $argv; $start_file = $argv[0]; - $header = ""; - if (!util::is_win()) $header .= "\033[33m"; + $header = ''; + if ( ! util::is_win()) + { + $header .= "\033[33m"; + } + $header .= "\n[ ".self::$configs['name']." Spider ] is started...\n\n"; - $header .= " * PHPSpider Version: ".self::VERSION."\n"; + $header .= ' * PHPSpider Version: '.self::VERSION."\n"; $header .= " * Documentation: https://doc.phpspider.org\n"; - $header .= " * Task Number: ".self::$tasknum."\n\n"; + $header .= ' * Task Number: '.self::$tasknum."\n\n"; $header .= "Input \"php $start_file stop\" to quit. Start success.\n"; - if (!util::is_win()) $header .= "\033[0m"; + if ( ! util::is_win()) + { + $header .= "\033[0m"; + } + log::note($header); } @@ -976,23 +1133,35 @@ public function fork_one_task($taskid) //self::$taskpids[$taskid] = $pid; } // 子进程运行 - elseif(0 === $pid) + elseif (0 === $pid) { log::warn("Fork children task({$taskid}) successful..."); // 初始化子进程参数 - self::$time_start = microtime(true); - self::$taskid = $taskid; - self::$taskmaster = false; - self::$taskpid = posix_getpid(); + self::$time_start = microtime(true); + self::$taskid = $taskid; + self::$taskmaster = false; + self::$taskpid = posix_getpid(); self::$collect_succ = 0; self::$collect_fail = 0; queue::set_connect('default', self::$queue_config); - queue::init(); - $this->do_collect_page(); + queue::init(); + + //退出前计时,等待1分钟,如果获取不到新任务,再退出 + self::$stand_by_time = 0; + while (self::$stand_by_time < self::$configs['max_stand_by_time']) + { + $this->do_collect_page(); + log::warn('Task('.self::$taskid.') Stand By '.self::$stand_by_time.'/'.self::$configs['max_stand_by_time'].' s'); + self::$stand_by_time++; + sleep(1); + } + $queue_lsize = $this->queue_lsize(); + log::warn('Task('.self::$taskid.') exit : queue_lsize = '.$queue_lsize); + $this->del_task_status(self::$serverid, $taskid); - // 这里用0表示正常退出 + // 这里用0表示子进程正常退出 exit(0); } else @@ -1012,8 +1181,8 @@ public function do_collect_page() // 多任务下主任务未准备就绪 if (self::$tasknum > 1 && !self::$fork_task_complete) { - // 主进程采集到两倍于任务数时, 生成子任务一起采集 - if ( $queue_lsize > self::$tasknum*2 ) + // 主进程采集到多于任务数2个时, 生成子任务一起采集 + if ($queue_lsize > self::$tasknum + 2) { self::$fork_task_complete = true; @@ -1026,7 +1195,11 @@ public function do_collect_page() } } } - + //在主进程中,保存当前配置到缓存,以使子进程可实时读取动态修改后的配置 20180209 + if (self::$use_redis and ! empty(self::$configs)) + { + queue::set('configs_'.self::$configs['name'], json_encode(self::$configs)); + } // 抓取页面 $this->collect_page(); // 保存任务状态 @@ -1041,9 +1214,17 @@ public function do_collect_page() // 如果是子任务 else { - // 如果队列中的网页比任务数2倍多, 子任务可以采集, 否则等待... - if ( $queue_lsize > self::$tasknum*2 ) + // 主进程采集到多于任务数2个时, 子任务可以采集, 否则等待... + if ($queue_lsize > self::$taskid + 2) { + //在子进程中,从内存中实时读取当前最新配置,用于适应主进程常驻内存模式,无限循环后的配置变动 20180209 + if (self::$use_redis and ! empty(self::$configs)) + { + if ($configs_active = queue::get('configs_'.self::$configs['name'])) + { + self::$configs = json_decode($configs_active, true); + } + } // 抓取页面 $this->collect_page(); // 保存任务状态 @@ -1051,7 +1232,7 @@ public function do_collect_page() } else { - log::warn("Task(".self::$taskid.") waiting..."); + log::warn('Task('.self::$taskid.') waiting...reason: queue_lsize = '.$queue_lsize.' < tasknum = '.self::$tasknum); sleep(1); } } @@ -1071,27 +1252,82 @@ public function do_collect_page() */ public function collect_page() { - $get_collect_url_num = $this->get_collect_url_num(); - log::info("Find pages: {$get_collect_url_num} "); + //减少非必要 queue_lsize 查询 20180214 + if (isset(self::$configs['log_type']) and strstr(self::$configs['log_type'], 'info')) + { + $get_collect_url_num = $this->get_collect_url_num(); + log::info('task id: '.self::$taskid." Find pages: {$get_collect_url_num} "); - $queue_lsize = $this->queue_lsize(); - log::info("Waiting for collect pages: {$queue_lsize} "); + $queue_lsize = $this->queue_lsize(); + log::info('task id: '.self::$taskid." Waiting for collect pages: {$queue_lsize} "); - $get_collected_url_num = $this->get_collected_url_num(); - log::info("Collected pages: {$get_collected_url_num} "); + $get_collected_url_num = $this->get_collected_url_num(); + log::info('task id: '.self::$taskid." Collected pages: {$get_collected_url_num} "); - // 多任务的时候输出爬虫序号 - if (self::$tasknum > 1) - { - log::info("Current task id: ".self::$taskid); + // 多任务的时候输出爬虫序号 + if (self::$tasknum > 1) + { + log::info('Current task id: '.self::$taskid); + } } - - // 先进先出 + //顺序提取任务,先进先出(当配置 queue_order = rand ,先进先出无效,都为随机提取任务) $link = $this->queue_rpop(); + + if (empty($link)) + { + log::warn('Task('.self::$taskid.') Get Task link Fail...Stand By...'); + return false; + } $link = $this->link_uncompress($link); + if (empty($link['url'])) + { + log::warn('Task('.self::$taskid.') Get Task url Fail...Stand By...'); + return false; + } + self::$stand_by_time = 0; //接到任务,则超时退出计时重置 + $url = $link['url']; - // 标记为已爬取网页 + //限制单域名最大url数量 20180213 + if (isset(self::$configs['max_pages']) and self::$configs['max_pages'] > 0) + { + $domain_pages_num = $this->incr_pages_num($url); + if ($domain_pages_num > self::$configs['max_pages']) + { + log::debug('Task('.self::$taskid.') pages = '.$domain_pages_num.' more than '.self::$configs['max_pages'].", $url [Skip]"); + return false; + } + } + + //限制单域名最大花费时长 20180213 + if (isset(self::$configs['max_duration']) and self::$configs['max_duration'] > 0) + { + $domain_duration = $this->get_duration_num($url); + if ($domain_duration > self::$configs['max_duration']) + { + log::debug('Task('.self::$taskid.') duration = '.$domain_duration.' more than '.self::$configs['max_duration'].", $url [Skip]"); + return false; + } + } + + //当前 host 并发检测 2018-5 BY KEN + if (self::$configs['max_task_per_host'] > 0) + { + $task_per_host = $this->get_task_per_host_num($url); + if ($task_per_host < self::$configs['max_task_per_host']) + { + $task_per_host = $this->incr_task_per_host($url); + } + else + { + log::warn('Task('.self::$taskid.') task_per_host = '.$task_per_host.' > '.self::$configs['max_task_per_host'].' ; URL: '.$url.' will be retry later...'); + $this->queue_lpush($link); //放回队列 + usleep(100000); + return false; + } + } + + // 已采集页面数量 +1 $this->incr_collected_url_num($url); // 爬取页面开始时间 @@ -1108,6 +1344,16 @@ public function collect_page() requests::$input_encoding = null; $html = $this->request_url($url, $link); + //记录速度较慢域名花费抓取时间 20180213 + $time_run = round(microtime(true) - $page_time_start); + if ($time_run > 1) + { + $this->incr_duration_num($url, $time_run); + } + + // 爬完页面开始处理时间 + $page_time_start = microtime(true); + if (!$html) { return false; @@ -1148,38 +1394,55 @@ public function collect_page() // 在一个网页下载完成之后调用. 主要用来对下载的网页进行处理. // 比如下载了某个网页, 希望向网页的body中添加html标签 - if ($this->on_download_page) + if ($this->on_download_page) { $return = call_user_func($this->on_download_page, $page, $this); // 针对那些老是忘记return的人 - if (isset($return)) $page = $return; + if (isset($return)) + { + $page = $return; + } + unset($return); } // 是否从当前页面分析提取URL // 回调函数如果返回false表示不需要再从此网页中发现待爬url $is_find_url = true; - if ($link['url_type'] == 'scan_page') + if ($link['url_type'] == 'scan_page') { - if ($this->on_scan_page) + if ($this->on_scan_page) { $return = call_user_func($this->on_scan_page, $page, $page['raw'], $this); - if (isset($return)) $is_find_url = $return; + if (isset($return)) + { + $is_find_url = $return; + } + + unset($return); } } - elseif ($link['url_type'] == 'list_page') + elseif ($link['url_type'] == 'content_page') { - if ($this->on_list_page) + if ($this->on_content_page) { - $return = call_user_func($this->on_list_page, $page, $page['raw'], $this); - if (isset($return)) $is_find_url = $return; + $return = call_user_func($this->on_content_page, $page, $page['raw'], $this); + if (isset($return)) + { + $is_find_url = $return; + } + unset($return); } } - elseif ($link['url_type'] == 'content_page') + elseif ($link['url_type'] == 'list_page') { - if ($this->on_content_page) + if ($this->on_list_page) { - $return = call_user_func($this->on_content_page, $page, $page['raw'], $this); - if (isset($return)) $is_find_url = $return; + $return = call_user_func($this->on_list_page, $page, $page['raw'], $this); + if (isset($return)) + { + $is_find_url = $return; + } + unset($return); } } @@ -1206,10 +1469,10 @@ public function collect_page() // 处理页面耗时时间 $time_run = round(microtime(true) - $page_time_start, 3); - log::debug("Success process page {$url} in {$time_run} s"); + log::debug('task id: '.self::$taskid." Success process page {$url} in {$time_run} s"); $spider_time_run = util::time2second(intval(microtime(true) - self::$time_start)); - log::info("Spider running in {$spider_time_run}"); + log::info('task id: '.self::$taskid." Spider running in {$spider_time_run}"); // 爬虫爬取每个网页的时间间隔, 单位: 毫秒 if (!isset(self::$configs['interval'])) @@ -1269,8 +1532,9 @@ public function request_url($url, $link = array()) requests::set_header($k, $v); } } - - $method = empty($link['method']) ? 'get' : strtolower($link['method']); + //限制 http 请求模式为 get 或 post + $method = trim(strtolower($link['method'])); + $method = ($method == 'post') ? 'post' : 'get'; $params = empty($link['params']) ? array() : $link['params']; $html = requests::$method($url, $params); // 此url附加的数据不为空, 比如内容页需要列表页一些数据, 拼接到后面去 @@ -1281,14 +1545,21 @@ public function request_url($url, $link = array()) $http_code = requests::$status_code; - if ($this->on_status_code) + //请求完成 host 的并发计数减 1 2018-5 BY KEN + if (self::$configs['max_task_per_host'] > 0) + { + $this->incr_task_per_host($url, 'decr'); + } + + if ($this->on_status_code) { $return = call_user_func($this->on_status_code, $http_code, $url, $html, $this); if (isset($return)) { $html = $return; } - if (!$html) + unset($return); + if ( ! $html) { return false; } @@ -1322,14 +1593,14 @@ public function request_url($url, $link = array()) } else { - if ($http_code == 407) + if ( ! empty(self::$configs['max_try']) and $http_code == 407) { // 扔到队列头部去, 继续采集 $this->queue_rpush($link); log::error("Failed to download page {$url}"); self::$collect_fail++; } - elseif (in_array($http_code, array('0','502','503','429'))) + elseif ( ! empty(self::$configs['max_try']) and in_array($http_code, array('0', '502', '503', '429'))) { // 采集次数加一 $link['try_num']++; @@ -1397,13 +1668,25 @@ public function get_urls($html, $collect_url, $depth = 0) foreach ($urls as $key=>$url) { - $urls[$key] = str_replace(array("\"", "'",'&'), array("",'','&'), $url); + //限制最大子域名数量 + if ( ! empty(self::$configs['max_sub_num'])) + { + //抓取子域名超过超过指定值,就丢掉 + $sub_domain_count = $this->sub_domain_count($url); + if ($sub_domain_count > self::$configs['max_sub_num']) + { + unset($urls[$key]); + log::debug('Task('.self::$taskid.') subdomin = '.$sub_domain_count.' more than '.self::$configs['max_sub_num'].",get_urls $url [Skip]"); + continue; + } + } + $urls[$key] = str_replace(array('"', "'", '&'), array('', '', '&'), $url); } //-------------------------------------------------------------------------------- // 过滤和拼凑URL //-------------------------------------------------------------------------------- - // 去除重复的RUL + // 去除重复的URL $urls = array_unique($urls); foreach ($urls as $k=>$url) { @@ -1414,7 +1697,18 @@ public function get_urls($html, $collect_url, $depth = 0) } $val = $this->fill_url($url, $collect_url); - if ($val) + + //限制单域名最大url数量 20180213 + if ($val and isset(self::$configs['max_pages']) and self::$configs['max_pages'] > 0) + { + $domain_pages_num = $this->incr_pages_num($val); + if ($domain_pages_num > self::$configs['max_pages']) + { + continue; + } + } + + if ($val) { $urls[$k] = $val; } @@ -1468,17 +1762,18 @@ public function get_urls($html, $collect_url, $depth = 0) */ public function fill_url($url, $collect_url) { - $url = trim($url); + $url = trim($url); $collect_url = trim($collect_url); // 排除JavaScript的连接 - //if (strpos($url, "javascript:") !== false) - if( preg_match("@^(javascript:|#|'|\")@i", $url) || $url == '') + //if (strpos($url, "javascript:") !== false) + if (preg_match("@^(mailto|javascript:|#|'|\")@i", $url) || $url == '') { return false; } // 排除没有被解析成功的语言标签 - if(substr($url, 0, 3) == '<%=') + if (substr($url, 0, 3) == '<%=' or substr($url, 0, 1) == '{' or substr($url, 0, 2) == ' {') + // if(substr($url, 0, 3) == '<%=') { return false; } @@ -1489,30 +1784,29 @@ public function fill_url($url, $collect_url) return false; } // 过滤mailto、tel、sms、wechat、sinaweibo、weixin等协议 - if (!in_array($parse_url['scheme'], array("http", "https"))) + if ( ! in_array($parse_url['scheme'], array('http', 'https'))) { return false; } - $scheme = $parse_url['scheme']; - $domain = $parse_url['host']; - $path = empty($parse_url['path']) ? '' : $parse_url['path']; + $scheme = $parse_url['scheme']; + $domain = $parse_url['host']; + $path = empty($parse_url['path']) ? '' : $parse_url['path']; $base_url_path = $domain.$path; - $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/","/",$base_url_path); - $base_url_path = preg_replace("/\/$/",'',$base_url_path); - - $i = $path_step = 0; - $dstr = $pstr = ''; - $pos = strpos($url,'#'); - if($pos > 0) + $base_url_path = preg_replace("/\/([^\/]*)\.(.*)$/", '/', $base_url_path); + $base_url_path = preg_replace("/\/$/", '', $base_url_path); + $i = $path_step = 0; + $dstr = $pstr = ''; + $pos = strpos($url, '#'); + if ($pos > 0) { // 去掉#和后面的字符串 $url = substr($url, 0, $pos); } - // 京东变态的都是 //www.jd.com/111.html - if(substr($url, 0, 2) == '//') + // 修正url格式为 //www.jd.com/111.html 为正确的http + if (substr($url, 0, 2) == '//') { - $url = str_replace("//", "", $url); + $url = preg_replace('/^\/\//iu', '', $url); } // /1234.html elseif($url[0] == '/') @@ -1563,8 +1857,8 @@ public function fill_url($url, $collect_url) { if( strtolower(substr($url, 0, 7))=='http://' ) { - $url = preg_replace('#^http://#i','',$url); - $scheme = "http"; + $url = preg_replace('#^http://#i', '', $url); + $scheme = 'http'; } else if( strtolower(substr($url, 0, 8))=='https://' ) { @@ -1582,14 +1876,19 @@ public function fill_url($url, $collect_url) } } // 两个 / 或以上的替换成一个 / - $url = preg_replace('@/{1,}@i', '/', $url); + $url = preg_replace('/\/{1,}/i', '/', $url); $url = $scheme.'://'.$url; $parse_url = @parse_url($url); - $domain = empty($parse_url['host']) ? $domain : $parse_url['host']; + $domain = empty($parse_url['host']) ? $domain : $parse_url['host']; // 如果host不为空, 判断是不是要爬取的域名 - if (!empty($parse_url['host'])) + if ( ! empty($parse_url['host'])) { + //2018-1-3 通配所有域名 + if (empty(self::$configs['domains']) or self::$configs['domains'][0] == '*') + { + return $url; + } //排除非域名下的url以提高爬取速度 if (!in_array($parse_url['host'], self::$configs['domains'])) { @@ -1712,7 +2011,7 @@ public function get_html_fields($html, $url, $page) } elseif (!is_array($return)) { - log::warn("on_extract_page return value must be an array"); + log::warn('on_extract_page return value must be an array'); } else { @@ -1731,10 +2030,11 @@ public function get_html_fields($html, $url, $page) if (version_compare(PHP_VERSION,'5.4.0','<')) { $fields_str = json_encode($fields); - $fields_str = preg_replace_callback( "#\\\u([0-9a-f]{4})#i", function($matchs) { - return iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1])); - }, $fields_str ); - } + $fields_str = preg_replace_callback("#\\\u([0-9a-f]{4})#i", function ($matchs) + { + return @iconv('UCS-2BE', 'UTF-8', pack('H4', $matchs[1])); + }, $fields_str); + } else { $fields_str = json_encode($fields, JSON_UNESCAPED_UNICODE); @@ -1809,10 +2109,7 @@ public function get_fields($confs, $html, $url, $page) $link['url'] = $collect_url; $link = $this->link_uncompress($link); requests::$input_encoding = null; - //$method = empty($link['method']) ? 'get' : strtolower($link['method']); - //$params = empty($link['params']) ? array() : $link['params']; - //$html = requests::$method($collect_url, $params); - $html = $this->request_url($collect_url, $link); + $html = $this->request_url($collect_url, $link); // 在一个attached_url对应的网页下载完成之后调用. 主要用来对下载的网页进行处理. if ($this->on_download_attached_page) { @@ -1910,7 +2207,7 @@ public function get_fields($confs, $html, $url, $page) { foreach ($fields as $fieldname => $data) { - $pattern = "/ \r\n\t]{1,}/isU"; + $pattern = "/ \r\n\t]{1,}/isu"; /*$pattern = "//i"; */ // 在抽取到field内容之后调用, 对其中包含的img标签进行回调处理 if ($this->on_handle_img && preg_match($pattern, $data)) @@ -1963,7 +2260,7 @@ public function check_export() { if (empty(self::$export_file)) { - log::error("Export data into CSV files need to Set the file path."); + log::error('Export data into CSV files need to Set the file path.'); exit; } } @@ -1971,7 +2268,7 @@ public function check_export() { if (empty(self::$export_file)) { - log::error("Export data into SQL files need to Set the file path."); + log::error('Export data into SQL files need to Set the file path.'); exit; } } @@ -1979,13 +2276,13 @@ public function check_export() { if (!function_exists('mysqli_connect')) { - log::error("Export data to a database need Mysql support, unable to load mysqli extension."); + log::error('Export data to a database need Mysql support, unable to load mysqli extension.'); exit; } if (empty(self::$db_config)) { - log::error("Export data to a database need Mysql support, you have not set a config array for connect."); + log::error('Export data to a database need Mysql support, you have not set a config array for connect.'); exit; } @@ -1993,7 +2290,7 @@ public function check_export() @mysqli_connect($config['host'], $config['user'], $config['pass'], $config['name'], $config['port']); if(mysqli_connect_errno()) { - log::error("Export data to a database need Mysql support, ".mysqli_connect_error()); + log::error('Export data to a database need Mysql support, '.mysqli_connect_error()); exit; } @@ -2002,7 +2299,7 @@ public function check_export() if (!db::table_exists(self::$export_table)) { - log::error("Table ".self::$export_table." does not exist"); + log::error('Table '.self::$export_table.' does not exist'); exit; } } @@ -2011,31 +2308,36 @@ public function check_export() public function check_cache() { - if (!self::$use_redis || self::$save_running_state) + if ( !self::$use_redis) { return false; } - //if (queue::exists("collect_queue")) - $keys = queue::keys("*"); - $count = count($keys); - if ($count != 0) + // 这个位置要改 + //$keys = queue::keys("*"); + //$count = count($keys); + // 直接检查db,清空的时候整个db清空,所以注意db不要跟其他项目混用 + $count = queue::dbsize(); + if ( $count > 0 ) { // After this operation, 4,318 kB of additional disk space will be used. // Do you want to continue? [Y/n] //$msg = "发现Redis中有采集数据, 是否继续执行, 不继续则清空Redis数据重新采集\n"; $msg = "Found that the data of Redis, no continue will empty Redis data start again\n"; - $msg .= "Do you want to continue? [Y/n]"; + $msg .= 'Do you want to continue? [Y/n]'; fwrite(STDOUT, $msg); $arg = strtolower(trim(fgets(STDIN))); - $arg = empty($arg) || !in_array($arg, array('y','n')) ? 'y' : $arg; + $arg = empty($arg) || !in_array($arg, array('Y', 'N', 'y','n')) ? 'y' : strtolower($arg); if ($arg == 'n') { - foreach ($keys as $key) - { - $key = str_replace(self::$queue_config['prefix'].":", "", $key); - queue::del($key); - } + log::warn('Clear redis data...'); + queue::flushdb(); + // 下面这种性能太差了 + //foreach ($keys as $key) + //{ + //$key = str_replace(self::$queue_config['prefix'].':', '', $key); + //queue::del($key); + //} } } } @@ -2083,8 +2385,8 @@ public function set_task_status() if (self::$use_redis) { - $key = "server-".self::$serverid."-task_status-".self::$taskid; - queue::set($key, $task_status); + $key = 'server-'.self::$serverid.'-task_status-'.self::$taskid; + queue::set($key, $task_status); } else { @@ -2142,7 +2444,7 @@ public function get_task_status_list($serverid = 1, $tasknum) { for ($i = 1; $i <= $tasknum; $i++) { - $key = "server-{$serverid}-task_status-".$i; + $key = "server-{$serverid}-task_status-".$i; $task_status[] = queue::get($key); } } @@ -2168,9 +2470,9 @@ public function add_server_list($serverid, $tasknum) } // 更新服务器列表 - $server_list_json = queue::get("server_list"); - $server_list = array(); - if (!$server_list_json) + $server_list_json = queue::get('server_list'); + $server_list = array(); + if ( ! $server_list_json) { $server_list[$serverid] = array( 'serverid' => $serverid, @@ -2180,15 +2482,15 @@ public function add_server_list($serverid, $tasknum) } else { - $server_list = json_decode($server_list_json, true); + $server_list = json_decode($server_list_json, true); $server_list[$serverid] = array( 'serverid' => $serverid, - 'tasknum' => $tasknum, - 'time' => time(), + 'tasknum' => $tasknum, + 'time' => time(), ); ksort($server_list); } - queue::set("server_list", json_encode($server_list)); + queue::set('server_list', json_encode($server_list)); } /** @@ -2205,9 +2507,9 @@ public function del_server_list($serverid) return false; } - $server_list_json = queue::get("server_list"); - $server_list = array(); - if ($server_list_json) + $server_list_json = queue::get('server_list'); + $server_list = array(); + if ($server_list_json) { $server_list = json_decode($server_list_json, true); if (isset($server_list[$serverid])) @@ -2219,7 +2521,7 @@ public function del_server_list($serverid) if (!empty($server_list)) { ksort($server_list); - queue::set("server_list", json_encode($server_list)); + queue::set('server_list', json_encode($server_list)); } } } @@ -2236,7 +2538,7 @@ public function get_collect_url_num() { if (self::$use_redis) { - $count = queue::get("collect_urls_num"); + $count = queue::get('collect_urls_num'); } else { @@ -2257,7 +2559,7 @@ public function get_collected_url_num() { if (self::$use_redis) { - $count = queue::get("collected_urls_num"); + $count = queue::get('collected_urls_num'); } else { @@ -2278,7 +2580,7 @@ public function incr_collected_url_num($url) { if (self::$use_redis) { - queue::incr("collected_urls_num"); + queue::incr('collected_urls_num'); } else { @@ -2306,8 +2608,8 @@ public function queue_lpush($link = array(), $allowed_repeat = false) $status = false; if (self::$use_redis) { - $key = "collect_urls-".md5($url); - $lock = "lock-".$key; + $key = 'collect_urls-'.md5($url); + $lock = 'lock-'.$key; // 加锁: 一个进程一个进程轮流处理 if (queue::lock($lock)) { @@ -2316,12 +2618,20 @@ public function queue_lpush($link = array(), $allowed_repeat = false) if (!$exists || $allowed_repeat) { // 待爬取网页记录数加一 - queue::incr("collect_urls_num"); + queue::incr('collect_urls_num'); // 先标记为待爬取网页 queue::set($key, time()); // 入队列 $link = json_encode($link); - queue::lpush("collect_queue", $link); + //根据采集设置为顺序采集还是随机采集,使用列表或集合对象 2018-5 BY KEN + if (self::$configs['queue_order'] == 'rand') + { + queue::sadd('collect_queue', $link); + } + else + { + queue::lpush('collect_queue', $link); + } $status = true; } // 解锁 @@ -2344,9 +2654,9 @@ public function queue_lpush($link = array(), $allowed_repeat = false) /** * 从队列右边插入 - * + * * @return void - * @author seatle + * @author seatle * @created time :2016-09-23 17:13 */ public function queue_rpush($link = array(), $allowed_repeat = false) @@ -2361,22 +2671,30 @@ public function queue_rpush($link = array(), $allowed_repeat = false) $status = false; if (self::$use_redis) { - $key = "collect_urls-".md5($url); - $lock = "lock-".$key; + $key = 'collect_urls-'.md5($url); + $lock = 'lock-'.$key; // 加锁: 一个进程一个进程轮流处理 if (queue::lock($lock)) { - $exists = queue::exists($key); + $exists = queue::exists($key); // 不存在或者当然URL可重复入 - if (!$exists || $allowed_repeat) + if ( ! $exists || $allowed_repeat) { // 待爬取网页记录数加一 - queue::incr("collect_urls_num"); + queue::incr('collect_urls_num'); // 先标记为待爬取网页 - queue::set($key, time()); + queue::set($key, time()); // 入队列 $link = json_encode($link); - queue::rpush("collect_queue", $link); + //根据采集设置为顺序采集还是随机采集,使用列表或集合对象 2018-5 BY KEN + if (self::$configs['queue_order'] == 'rand') + { + queue::sadd('collect_queue', $link); //无序集合 + } + else + { + queue::rpush('collect_queue', $link); //有序列表 + } $status = true; } // 解锁 @@ -2411,7 +2729,15 @@ public function queue_lpop() { if (self::$use_redis) { - $link = queue::lpop("collect_queue"); + //根据采集设置为顺序采集还是随机采集,使用列表或集合对象 + if (self::$configs['queue_order'] == 'rand') + { + $link = queue::spop('collect_queue'); + } + else + { + $link = queue::lpop('collect_queue'); + } $link = json_decode($link, true); } else @@ -2432,7 +2758,15 @@ public function queue_rpop() { if (self::$use_redis) { - $link = queue::rpop("collect_queue"); + //根据采集设置为顺序采集还是随机采集,使用列表或集合对象 + if (self::$configs['queue_order'] == 'rand') + { + $link = queue::spop('collect_queue'); + } + else + { + $link = queue::rpop('collect_queue'); + } $link = json_decode($link, true); } else @@ -2453,7 +2787,15 @@ public function queue_lsize() { if (self::$use_redis) { - $lsize = queue::lsize("collect_queue"); + //根据采集设置为顺序采集还是随机采集,使用列表或集合对象 + if (self::$configs['queue_order'] == 'rand') + { + $lsize = queue::scard('collect_queue'); + } + else + { + $lsize = queue::lsize('collect_queue'); + } } else { @@ -2473,13 +2815,13 @@ public function incr_depth_num($depth) { if (self::$use_redis) { - $lock = "lock-depth_num"; + $lock = 'lock-depth_num'; // 锁2秒 if (queue::lock($lock, time(), 2)) { - if (queue::get("depth_num") < $depth) + if (queue::get('depth_num') < $depth) { - queue::set("depth_num", $depth); + queue::set('depth_num', $depth); } queue::unlock($lock); @@ -2505,7 +2847,7 @@ public function get_depth_num() { if (self::$use_redis) { - $depth_num = queue::get("depth_num"); + $depth_num = queue::get('depth_num'); return $depth_num ? $depth_num : 0; } else @@ -2525,7 +2867,7 @@ public function incr_fields_num() { if (self::$use_redis) { - $fields_num = queue::incr("fields_num"); + $fields_num = queue::incr('fields_num'); } else { @@ -2546,7 +2888,7 @@ public function get_fields_num() { if (self::$use_redis) { - $fields_num = queue::get("fields_num"); + $fields_num = queue::get('fields_num'); } else { @@ -2555,6 +2897,180 @@ public function get_fields_num() return $fields_num ? $fields_num : 0; } + /** + * 提取到的pages数目加一,用于限制单域名采集页数上限 + * + * @return void + * @author KEN + * @created time :2018-05 + */ + public function incr_pages_num($url = '') + { + if ( ! empty($url)) + { + $domain = $this->getRootDomain($url, 'host'); + } + if (empty($domain)) + { + $domain = 'all'; + } + if (self::$use_redis) + { + $pages_num[$domain] = queue::incr('pages_num:'.$domain); + } + else + { + if (empty(self::$pages_num[$domain])) + { + self::$pages_num[$domain] = 1; + } + else + { + self::$pages_num[$domain]++; + } + $pages_num[$domain] = self::$pages_num[$domain]; + } + return $pages_num[$domain]; + } + + /** + * 超过1秒的慢速采集时间计数,用于限制单域名总采集时间上限 + * + * @return void + * @author KEN + * @created time :2018-05 + */ + public function incr_duration_num($url = '', $time_run = 1) + { + if ( ! empty($url)) + { + $domain = $this->getRootDomain($url); + } + if (empty($domain)) + { + $domain = 'all'; + } + if (self::$use_redis) + { + $duration[$domain] = queue::incr('duration:'.$domain, $time_run); + } + else + { + if (empty(self::$duration[$domain])) + { + self::$duration[$domain] = $time_run; + } + else + { + self::$duration[$domain] += $time_run; + } + $duration[$domain] = self::$duration[$domain]; + } + return $duration[$domain]; + } + + /** + * 读取单域名总慢速采集(响应超过1秒)的时间 + * + * @return void + * @author KEN + * @created time :2018-04 + */ + public function get_duration_num($url = '') + { + if ( ! empty($url)) + { + $domain = $this->getRootDomain($url); + } + if (empty($domain)) + { + $domain = 'all'; + } + if (self::$use_redis) + { + $duration[$domain] = queue::get('duration:'.$domain); + } + else + { + $duration[$domain] = ! empty(self::$duration[$domain]) ? self::$duration[$domain] : 0; + } + return $duration[$domain] ? $duration[$domain] : 0; + } + + /** + * 单 host 当前并发计数 + * @return int + * @author KEN + * @created time :2018-05-28 16:40 + */ + public function incr_task_per_host($url = '', $type = 'incr') + { + if (empty($url)) + { + return false; + } + $domain = $this->getRootDomain($url, 'host'); + if (empty($domain)) + { + return false; + } + if (self::$use_redis) + { + if ($type == 'decr') + { + $task_per_host_counter[$domain] = queue::decr('task_per_host:'.$domain); + } + else + { + $task_per_host_counter[$domain] = queue::incr('task_per_host:'.$domain); + } + } + else + { + + if (empty(self::$task_per_host_counter[$domain])) + { + self::$task_per_host_counter[$domain] = 1; + } + else + { + if ($type == 'decr') + { + self::$task_per_host_counter[$domain]--; + } + else + { + self::$task_per_host_counter[$domain]++; + } + } + $task_per_host_counter[$domain] = self::$task_per_host_counter[$domain]; + } + return $task_per_host_counter[$domain]; + } + + //获取url所属 host 当前并发数量 KEN + public function get_task_per_host_num($url) + { + if (empty($url)) + { + return 0; + } + $domain = $this->getRootDomain($url, 'host'); + if (empty($domain)) + { + return 0; + } + if (self::$use_redis) + { + $count = queue::get('task_per_host:'.$domain); + } + else + { + $count = self::$task_per_host_counter[$domain]; + } + return $count; + } + /** * 采用xpath分析提取字段 * @@ -2699,17 +3215,17 @@ public function display_ui() } $display_str = "\033[1A\n\033[K-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m"; //$display_str = "-----------------------------\033[47;30m PHPSPIDER \033[0m-----------------------------\n\033[0m"; - $run_time_str = util::time2second(time()-self::$time_start, false); - $display_str .= 'PHPSpider version:' . self::VERSION . " PHP version:" . PHP_VERSION . "\n"; - $display_str .= 'start time:'. date('Y-m-d H:i:s', self::$time_start).' run ' . $run_time_str . " \n"; + $run_time_str = util::time2second(time() - self::$time_start, false); + $display_str .= 'PHPSpider version:'.self::VERSION.' PHP version:'.PHP_VERSION."\n"; + $display_str .= 'start time:'.date('Y-m-d H:i:s', self::$time_start).' run '.$run_time_str." \n"; - $display_str .= 'spider name: ' . self::$configs['name'] . "\n"; - if (self::$multiserver) + $display_str .= 'spider name: '.self::$configs['name']."\n"; + if (self::$multiserver) { - $display_str .= 'server id: ' . self::$serverid."\n"; + $display_str .= 'server id: '.self::$serverid."\n"; } - $display_str .= 'task number: ' . self::$tasknum . "\n"; - $display_str .= 'load average: ' . implode(", ", $loadavg) . "\n"; + $display_str .= 'task number: '.self::$tasknum."\n"; + $display_str .= 'load average: '.implode(', ', $loadavg)."\n"; $display_str .= "document: https://doc.phpspider.org\n"; $display_str .= $this->display_task_ui(); @@ -2726,8 +3242,8 @@ public function display_ui() // 返回到第一行,第一列 //echo "\033[0;0H"; $display_str .= "---------------------------------------------------------------------\n"; - $display_str .= "Press Ctrl-C to quit. Start success."; - if (self::$terminate) + $display_str .= 'Press Ctrl-C to quit. Start success.'.date('Y-m-d H:i:s').' - '.round(memory_get_usage() / 1024 / 1024, 2).'MB'."\n"; + if (self::$terminate) { $display_str .= "\n\033[33mWait for the process exits...\033[0m"; } @@ -2756,12 +3272,12 @@ public function display_task_ui() { continue; } - $display_str .= str_pad($task['id'], self::$taskid_length+2). - str_pad($task['pid'], self::$pid_length+2). - str_pad($task['mem']."MB", self::$mem_length+2). - str_pad($task['collect_succ'], self::$urls_length). - str_pad($task['collect_fail'], self::$urls_length). - str_pad($task['speed']."/s", self::$speed_length+2). + $display_str .= str_pad($task['id'], self::$taskid_length + 2). + str_pad($task['pid'], self::$pid_length + 2). + str_pad($task['mem'].'MB', self::$mem_length + 2). + str_pad($task['collect_succ'], self::$urls_length). + str_pad($task['collect_fail'], self::$urls_length). + str_pad($task['speed'].'/s', self::$speed_length + 2). "\n"; } //echo "\033[9;0H"; @@ -2780,20 +3296,20 @@ public function display_server_ui() "\033[47;30mspeed\033[0m". str_pad('', self::$speed_length+2-strlen('speed')). "\n"; - $server_list_json = queue::get("server_list"); - $server_list = json_decode($server_list_json, true); - foreach ($server_list as $server) + $server_list_json = queue::get('server_list'); + $server_list = json_decode($server_list_json, true); + foreach ($server_list as $server) { - $serverid = $server['serverid']; - $tasknum = $server['tasknum']; - $mem = 0; - $speed = 0; + $serverid = $server['serverid']; + $tasknum = $server['tasknum']; + $mem = 0; + $speed = 0; $collect_succ = $collect_fail = 0; - $task_status = $this->get_task_status_list($serverid, $tasknum); - foreach ($task_status as $json) + $task_status = $this->get_task_status_list($serverid, $tasknum); + foreach ($task_status as $json) { $task = json_decode($json, true); - if (empty($task)) + if (empty($task)) { continue; } @@ -2804,11 +3320,11 @@ public function display_server_ui() } $display_str .= str_pad($serverid, self::$server_length). - str_pad($tasknum, self::$tasknum_length+2). - str_pad($mem."MB", self::$mem_length+2). - str_pad($collect_succ, self::$urls_length). - str_pad($collect_fail, self::$urls_length). - str_pad($speed."/s", self::$speed_length+2). + str_pad($tasknum, self::$tasknum_length + 2). + str_pad($mem.'MB', self::$mem_length + 2). + str_pad($collect_succ, self::$urls_length). + str_pad($collect_fail, self::$urls_length). + str_pad($speed.'/s', self::$speed_length + 2). "\n"; } return $display_str; @@ -2899,6 +3415,184 @@ public function display_collect_ui() //return $fileinfo; //} -} + //返回当前是否是主进程 + public function is_taskmaster() + { + return self::$taskmaster; + } + + //返回当前是否进程ID + public function get_task_id() + { + return self::$taskid; + } + //检测子域名数量 + public function sub_domain_count($url) + { + if (empty($url)) + { + return 0; + } + $count = 0; + $domain = $this->getRootDomain($url, 'root'); + if (empty($domain)) + { + return 0; + } + $host = $this->getRootDomain($url, 'host'); + if (empty($host)) + { + return $count; + } + if (self::$use_redis) + { + $count = queue::get($domain); + if ( ! empty(self::$configs['max_sub_num']) and $count > self::$configs['max_sub_num']) + { + return $count; + } + if (strlen($host) > 32) + { + $host = md5($host); + } + $hostkey = 'sub_d-'.$host; + $exists = queue::exists($hostkey); + if ( ! $exists) + { + // 子域名数量加一 + $count = queue::incr($domain); + queue::set($hostkey, 1); + } + } + return $count; + } + //提取url的根域名 host domain subdomain name tld + public function getRootDomain($url = '', $type = 'root', $domain_check = false) + { + if (empty($url)) + { + return $url; + } + $url = trim($url); + if ( ! preg_match('/^http/i', $url)) + { + $url = 'http://'.$url; + } + //截取限定字符 + $arr = array(); + if (preg_match_all('/(^https?:\/\/[\p{Han}a-zA-Z0-9\-\.\/]+)/iu', $url, $arr)) + { + $url = $arr['0']['0']; + unset($arr); + } + $url_parse = parse_url(strtolower($url)); + if (empty($url_parse['host'])) + { + return ''; + } + //host判断快速返回 + if ($domain_check === false and $type == 'host') + { + return $url_parse['host']; + } + + //结束数组初始化 + $res = array( + 'scheme' => '', + 'host' => '', + 'path' => '', + 'name' => '', + 'domain' => '', + ); + + $urlarr = explode('.', $url_parse['host']); + $count = count($urlarr); + $res['scheme'] = $url_parse['scheme']; + $res['host'] = $url_parse['host']; + if ( ! empty($url_parse['path'])) + { + $res['path'] = $url_parse['path']; + } + #列举域名中固定元素 + $state_domain = array('com', 'edu', 'gov', 'int', 'mil', 'net', 'org', 'biz', 'info', 'pro', 'name', 'coop', 'aero', 'xxx', 'idv', 'mobi', 'cc', 'me', 'jp', 'uk', 'ws', 'eu', 'pw', 'kr', 'io', 'us', 'cn', 'al', 'dz', 'af', 'ar', 'ae', 'aw', 'om', 'az', 'eg', 'et', 'ie', 'ee', 'ad', 'ao', 'ai', 'ag', 'at', 'au', 'mo', 'bb', 'pg', 'bs', 'pk', 'py', 'ps', 'bh', 'pa', 'br', 'by', 'bm', 'bg', 'mp', 'bj', 'be', 'is', 'pr', 'ba', 'pl', 'bo', 'bz', 'bw', 'bt', 'bf', 'bi', 'bv', 'kp', 'gq', 'dk', 'de', 'tl', 'tp', 'tg', 'dm', 'do', 'ru', 'ec', 'er', 'fr', 'fo', 'pf', 'gf', 'tf', 'va', 'ph', 'fj', 'fi', 'cv', 'fk', 'gm', 'cg', 'cd', 'co', 'cr', 'gg', 'gd', 'gl', 'ge', 'cu', 'gp', 'gu', 'gy', 'kz', 'ht', 'nl', 'an', 'hm', 'hn', 'ki', 'dj', 'kg', 'gn', 'gw', 'ca', 'gh', 'ga', 'kh', 'cz', 'zw', 'cm', 'qa', 'ky', 'km', 'ci', 'kw', 'hr', 'ke', 'ck', 'lv', 'ls', 'la', 'lb', 'lt', 'lr', 'ly', 'li', 're', 'lu', 'rw', 'ro', 'mg', 'im', 'mv', 'mt', 'mw', 'my', 'ml', 'mk', 'mh', 'mq', 'yt', 'mu', 'mr', 'um', 'as', 'vi', 'mn', 'ms', 'bd', 'pe', 'fm', 'mm', 'md', 'ma', 'mc', 'mz', 'mx', 'nr', 'np', 'ni', 'ne', 'ng', 'nu', 'no', 'nf', 'na', 'za', 'aq', 'gs', 'pn', 'pt', 'se', 'ch', 'sv', 'yu', 'sl', 'sn', 'cy', 'sc', 'sa', 'cx', 'st', 'sh', 'kn', 'lc', 'sm', 'pm', 'vc', 'lk', 'sk', 'si', 'sj', 'sz', 'sd', 'sr', 'sb', 'so', 'tj', 'tw', 'th', 'tz', 'to', 'tc', 'tt', 'tn', 'tv', 'tr', 'tm', 'tk', 'wf', 'vu', 'gt', 've', 'bn', 'ug', 'ua', 'uy', 'uz', 'es', 'eh', 'gr', 'hk', 'sg', 'nc', 'nz', 'hu', 'sy', 'jm', 'am', 'ac', 'ye', 'iq', 'ir', 'il', 'it', 'in', 'id', 'vg', 'jo', 'vn', 'zm', 'je', 'td', 'gi', 'cl', 'cf', 'yr', 'arpa', 'museum', 'asia', 'ax', 'bl', 'bq', 'cat', 'cw', 'gb', 'jobs', 'mf', 'rs', 'su', 'sx', 'tel', 'travel', 'shop', 'ltd', 'store', 'vip', '网店', '中国', '公司', '网络', 'co.il', 'co.nz', 'co.uk', 'me.uk', 'org.uk', 'com.sb', '在线', '中文网', '移动', 'wang', 'club', 'ren', 'top', 'website', 'cool', 'company', 'city', 'email', 'market', 'software', 'ninja', '我爱你', 'bike', 'today', 'life', 'space', 'pub', 'site', 'help', 'link', 'photo', 'video', 'click', 'pics', 'sexy', 'audio', 'gift', 'tech', '网址', 'online', 'win', 'download', 'party', 'bid', 'loan', 'date', 'trade', 'red', 'blue', 'pink', 'poker', 'green', 'farm', 'zone', 'guru', 'tips', 'land', 'care', 'camp', 'cab', 'cash', 'limo', 'toys', 'tax', 'town', 'fish', 'fund', 'fail', 'house', 'shoes', 'media', 'guide', 'tools', 'solar', 'watch', 'cheap', 'rocks', 'news', 'live', 'lawyer', 'host', 'wiki', 'ink', 'design', 'lol', 'hiphop', 'hosting', 'diet', 'flowers', 'car', 'cars', 'auto', 'mom', 'cq', 'he', 'nm', 'ln', 'jl', 'hl', 'js', 'zj', 'ah', 'jx', 'ha', 'hb', 'gx', 'hi', 'gz', 'yn', 'xz', 'qh', 'nx', 'xj', 'xyz', 'xin', 'science', 'press', 'band', 'engineer', 'social', 'studio', 'work', 'game', 'kim', 'games', 'group', '集团'); + if ($count <= 2) + { + #当域名直接根形式不存在host部分直接输出 + $last = array_pop($urlarr); + $last_1 = array_pop($urlarr); + if (in_array($last, $state_domain)) + { + $res['domain'] = $last_1.'.'.$last; + $res['name'] = $last_1; + $res['tld'] = $last; + } + } + elseif ($count > 2) + { + $last = array_pop($urlarr); + $last_1 = array_pop($urlarr); + $last_2 = array_pop($urlarr); + $res['domain'] = $last_1.'.'.$last; //默认为n.com形式 + $res['name'] = $last_2; + + //排除非标准 ltd 域名 + if ( ! in_array($last, $state_domain)) + { + return false; + } + + if (in_array($last, $state_domain)) + { + $res['domain'] = $last_1.'.'.$last; //n.com形式 + $res['name'] = $last_1; + $res['tld'] = $last; + } + //排除顶级根二级后缀 + if ($last_1 !== $last and in_array($last_1, $state_domain) and ! in_array($last, array('com', 'net', 'org', 'edu', 'gov'))) + { + $res['domain'] = $last_2.'.'.$last_1.'.'.$last; //n.n.com形式 + $res['name'] = $last_2; + $res['tld'] = $last_1.'.'.$last; + } + //限定cn顶级根二级后缀为'com', 'net', 'org', 'edu', 'gov' + if (in_array($last, array('cn')) and $last_1 !== $last and strlen($last_1) > 2 and ! in_array($last_1, array('com', 'net', 'org', 'edu', 'gov'))) + { + $res['domain'] = $last_1.'.'.$last; //n.n.cn形式 + $res['name'] = $last_1; + $res['tld'] = $last; + } + } + + //检测和验证返回的是不是域名格式 + if ( ! empty($res['domain']) and preg_match('/^([\p{Han}a-zA-Z0-9])+([\p{Han}a-zA-Z0-9\-])*\.[a-zA-Z\.\p{Han}]+$/iu', $res['domain'])) + { + if ($type == 'arr') + { + return $res; + } + elseif ($type == 'host') + { + return $res['host']; + } + elseif ($type == 'tld') + { + return $res['tld']; + } + elseif ($type == 'subdomain') + { + return $res['name']; + } + else + { + return $res['domain']; + } + } + else + { + return ''; + } + } + +} diff --git a/core/queue.php b/core/queue.php index 8330651..3ea4149 100644 --- a/core/queue.php +++ b/core/queue.php @@ -15,8 +15,8 @@ namespace phpspider\core; -use Redis; use Exception; +use Redis; class queue { @@ -35,15 +35,15 @@ class queue /** * 默认redis前缀 */ - public static $prefix = "phpspider"; + public static $prefix = 'phpspider'; - public static $error = ""; + public static $error = ''; public static function init() { - if (!extension_loaded("redis")) + if ( ! extension_loaded('redis')) { - self::$error = "The redis extension was not found"; + self::$error = 'The redis extension was not found'; return false; } @@ -55,26 +55,38 @@ public static function init() if (empty(self::$links[self::$link_name])) { self::$links[self::$link_name] = new Redis(); - if (!self::$links[self::$link_name]->connect($config['host'], $config['port'], $config['timeout'])) + if (strstr($config['host'], '.sock')) { - self::$error = "Unable to connect to redis server"; - unset(self::$links[self::$link_name]); - return false; + if ( ! self::$links[self::$link_name]->connect($config['host'])) + { + self::$error = 'Unable to connect to redis server'; + unset(self::$links[self::$link_name]); + return false; + } + } + else + { + if ( ! self::$links[self::$link_name]->connect($config['host'], $config['port'], $config['timeout'])) + { + self::$error = 'Unable to connect to redis server'; + unset(self::$links[self::$link_name]); + return false; + } } // 验证 if ($config['pass']) { - if ( !self::$links[self::$link_name]->auth($config['pass']) ) + if ( ! self::$links[self::$link_name]->auth($config['pass'])) { - self::$error = "Redis Server authentication failed"; + self::$error = 'Redis Server authentication failed'; unset(self::$links[self::$link_name]); return false; } } $prefix = empty($config['prefix']) ? self::$prefix : $config['prefix']; - self::$links[self::$link_name]->setOption(Redis::OPT_PREFIX, $prefix . ":"); + self::$links[self::$link_name]->setOption(Redis::OPT_PREFIX, $prefix.':'); // 永不超时 // ini_set('default_socket_timeout', -1); 无效,要用下面的做法 self::$links[self::$link_name]->setOption(Redis::OPT_READ_TIMEOUT, -1); @@ -107,7 +119,7 @@ public static function set_connect($link_name, $config = array()) { if (empty(self::$configs[self::$link_name])) { - throw new Exception("You not set a config array for connect!"); + throw new Exception('You not set a config array for connect!'); } } //print_r(self::$configs); @@ -247,7 +259,10 @@ public static function setnx($key, $value, $expire = 0) */ public static function lock($name, $value = 1, $expire = 5, $interval = 100000) { - if ($name == null) return false; + if ($name == null) + { + return false; + } self::init(); try @@ -616,7 +631,7 @@ public static function dbsize() return self::dbsize(); } } - return NULL; + return 0; } /** @@ -1265,6 +1280,109 @@ public static function decode($value) { return json_decode($value, true); } -} + /** + * 集合操作 + */ + + /** + * sadd 将数据压入集合 + * + * @param mixed $key + * @param mixed $value + * @return void + * @author seatle + * @created time :2015-12-13 01:05 + */ + public static function sadd($key, $value) + { + self::init(); + try + { + if (self::$links[self::$link_name]) + { + return self::$links[self::$link_name]->sadd($key, $value); + } + } + catch (Exception $e) + { + $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; + log::warn($msg); + if ($e->getCode() == 0) + { + self::$links[self::$link_name]->close(); + self::$links[self::$link_name] = null; + usleep(100000); + return self::sadd($key, $value); + } + } + return null; + } + + /** + * spop 从集合中随机取出数据并移除 + * + * @param mixed $key + * @return void + * @author seatle + * @created time :2015-12-13 01:05 + */ + public static function spop($key) + { + self::init(); + try + { + if (self::$links[self::$link_name]) + { + return self::$links[self::$link_name]->spop($key); + } + } + catch (Exception $e) + { + $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; + log::warn($msg); + if ($e->getCode() == 0) + { + self::$links[self::$link_name]->close(); + self::$links[self::$link_name] = null; + usleep(100000); + return self::spop($key); + } + } + return null; + } + + /** + * Redis Scard 命令返回集合中元素的数量。 + * + * @param mixed $key + * @return void + * @author seatle + * @created time :2015-12-13 01:05 + */ + public static function scard($key) + { + self::init(); + try + { + if (self::$links[self::$link_name]) + { + return self::$links[self::$link_name]->scard($key); + } + } + catch (Exception $e) + { + $msg = "PHP Fatal error: Uncaught exception 'RedisException' with message '".$e->getMessage()."'\n"; + log::warn($msg); + if ($e->getCode() == 0) + { + self::$links[self::$link_name]->close(); + self::$links[self::$link_name] = null; + usleep(100000); + return self::scard($key); + } + } + return null; + } +} diff --git a/core/requests.php b/core/requests.php index c486abd..7b7526d 100644 --- a/core/requests.php +++ b/core/requests.php @@ -35,11 +35,11 @@ class requests /* user definable vars */ - public static $timeout = 5; - public static $encoding = null; - public static $input_encoding = null; - public static $output_encoding = 'utf-8'; // 默认输出utf-8,否则xpath提取不了 - public static $cookies = array(); // array of cookies to pass + public static $timeout = 15; + public static $encoding = null; + public static $input_encoding = null; + public static $output_encoding = null; + public static $cookies = array(); // array of cookies to pass // $cookies['username'] = "seatle"; public static $rawheaders = array(); // array of raw headers to send public static $domain_cookies = array(); // array of cookies for domain to pass @@ -147,17 +147,17 @@ public static function set_cookie($key, $value, $domain = '') */ public static function set_cookies($cookies, $domain = '') { - $cookies_arr = explode(";", $cookies); - if (empty($cookies_arr)) + $cookies_arr = explode(';', $cookies); + if (empty($cookies_arr)) { return false; } foreach ($cookies_arr as $cookie) { - $cookie_arr = explode("=", $cookie, 2); - $key = $cookie_arr[0]; - $value = empty($cookie_arr[1]) ? '' : $cookie_arr[1]; + $cookie_arr = explode('=', $cookie, 2); + $key = $cookie_arr[0]; + $value = empty($cookie_arr[1]) ? '' : $cookie_arr[1]; if (!empty($domain)) { @@ -383,43 +383,10 @@ public static function split_header_body() //$body = implode("\r\n\r\n", $array); //} - // 如果用户没有明确指定输入的页面编码格式(utf-8, gb2312),通过程序去判断 - if(self::$input_encoding == null) - { - // 从头部获取 - preg_match("/charset=([^\s]*)/i", $head, $out); - $encoding = empty($out[1]) ? '' : str_replace(array('"', '\''), '', strtolower(trim($out[1]))); - //$encoding = null; - if (empty($encoding)) - { - // 在某些情况下,无法再 response header 中获取 html 的编码格式 - // 则需要根据 html 的文本格式获取 - $encoding = self::get_encoding($body); - $encoding = strtolower($encoding); - if($encoding == false || $encoding == "ascii") - { - $encoding = 'gbk'; - } - } - - // 没有转码前 - self::$encoding = $encoding; - self::$input_encoding = $encoding; - } - // 设置了输出编码的转码,注意: xpath只支持utf-8,iso-8859-1 不要转,他本身就是utf-8 - if (self::$output_encoding && self::$input_encoding != self::$output_encoding && self::$input_encoding != 'iso-8859-1') - { - // 先将非utf8编码,转化为utf8编码 - $body = @mb_convert_encoding($body, self::$output_encoding, self::$input_encoding); - // 将页面中的指定的编码方式修改为utf8 - $body = preg_replace("/]*)charset=([^>]*)>/is", '', $body); - // 直接干掉头部,国外很多信息是在头部的 - //$body = self::_remove_head($body); - - // 转码后 - self::$encoding = self::$output_encoding; - } + $body = self::encoding($body); //自动转码 + // 转码后 + self::$encoding = self::$output_encoding; // The body after encoding self::$text = $body; @@ -444,11 +411,11 @@ public static function get_response_cookies($header, $domain) // 解析到Cookie if (!empty($cookies)) { - $cookies = implode(";", $cookies); - $cookies = explode(";", $cookies); - foreach ($cookies as $cookie) + $cookies = implode(';', $cookies); + $cookies = explode(';', $cookies); + foreach ($cookies as $cookie) { - $cookie_arr = explode("=", $cookie, 2); + $cookie_arr = explode('=', $cookie, 2); // 过滤 httponly、secure if (count($cookie_arr) < 2) { @@ -486,10 +453,10 @@ public static function get_response_headers($header) { foreach ($header_lines as $line) { - $header_arr = explode(":", $line, 2); - $key = empty($header_arr[0]) ? '' : trim($header_arr[0]); - $val = empty($header_arr[1]) ? '' : trim($header_arr[1]); - if (empty($key) || empty($val)) + $header_arr = explode(':', $line, 2); + $key = empty($header_arr[0]) ? '' : trim($header_arr[0]); + $val = empty($header_arr[1]) ? '' : trim($header_arr[1]); + if (empty($key) || empty($val)) { continue; } @@ -507,7 +474,7 @@ public static function get_response_headers($header) */ public static function get_encoding($string) { - $encoding = mb_detect_encoding($string, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5')); + $encoding = mb_detect_encoding($string, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1')); return strtolower($encoding); } @@ -557,9 +524,10 @@ public static function init() } else { - curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout ); - curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout); + curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, ceil(self::$timeout / 2)); + curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout); } + curl_setopt(self::$ch, CURLOPT_MAXREDIRS, 5); //maximum number of redirects allowed // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); } @@ -655,7 +623,7 @@ public static function request($url, $method = 'GET', $fields = array(), $files // 如果是 get 方式,直接拼凑一个 url 出来 if ($method == 'GET' && !empty($fields)) { - $url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields); + $url = $url.(strpos($url, '?') === false ? '?' : '&').http_build_query($fields); } $parse_url = parse_url($url); @@ -750,10 +718,10 @@ public static function request($url, $method = 'GET', $fields = array(), $files { foreach ($cookies as $key=>$value) { - $cookie_arr[] = $key."=".$value; + $cookie_arr[] = $key.'='.$value; } - $cookies = implode("; ", $cookie_arr); - curl_setopt( self::$ch, CURLOPT_COOKIE, $cookies ); + $cookies = implode('; ', $cookie_arr); + curl_setopt(self::$ch, CURLOPT_COOKIE, $cookies); } if (!empty(self::$useragents)) @@ -764,9 +732,9 @@ public static function request($url, $method = 'GET', $fields = array(), $files if (!empty(self::$client_ips)) { - $key = rand(0, count(self::$client_ips) - 1); - self::$rawheaders["CLIENT-IP"] = self::$client_ips[$key]; - self::$rawheaders["X-FORWARDED-FOR"] = self::$client_ips[$key]; + $key = rand(0, count(self::$client_ips) - 1); + self::$rawheaders['CLIENT-IP'] = self::$client_ips[$key]; + self::$rawheaders['X-FORWARDED-FOR'] = self::$client_ips[$key]; } if (self::$rawheaders) @@ -774,7 +742,7 @@ public static function request($url, $method = 'GET', $fields = array(), $files $headers = array(); foreach (self::$rawheaders as $k=>$v) { - $headers[] = $k.": ".$v; + $headers[] = $k.': '.$v; } curl_setopt( self::$ch, CURLOPT_HTTPHEADER, $headers ); } @@ -849,7 +817,7 @@ public static function get_mimetype($filepath) $fp = finfo_open(FILEINFO_MIME); $mime = finfo_file($fp, $filepath); finfo_close($fp); - $arr = explode(";", $mime); + $arr = explode(';', $mime); $type = empty($arr[0]) ? '' : $arr[0]; return $type; } @@ -872,8 +840,8 @@ public static function get_postfile_form($post_fields, $file_fields) // 表单数据 foreach ($post_fields as $name => $content) { - $data .= "--" . $delimiter . "\r\n"; - $data .= 'Content-Disposition: form-data; name = "' . $name . '"'; + $data .= '--'.$delimiter."\r\n"; + $data .= 'Content-Disposition: form-data; name = "'.$name.'"'; $data .= "\r\n\r\n"; $data .= $content; $data .= "\r\n"; @@ -881,9 +849,9 @@ public static function get_postfile_form($post_fields, $file_fields) foreach ($file_fields as $input_name => $file) { - $data .= "--" . $delimiter . "\r\n"; - $data .= 'Content-Disposition: form-data; name = "' . $input_name . '";' . - ' filename="' . $file['filename'] . '"' . "\r\n"; + $data .= '--'.$delimiter."\r\n"; + $data .= 'Content-Disposition: form-data; name = "'.$input_name.'";'. + ' filename="'.$file['filename'].'"'."\r\n"; $data .= "Content-Type: {$file['type']}\r\n"; $data .= "\r\n"; $data .= $file['content']; @@ -891,7 +859,7 @@ public static function get_postfile_form($post_fields, $file_fields) } // 结束符 - $data .= "--" . $delimiter . "--\r\n"; + $data .= '--'.$delimiter."--\r\n"; //return array( //CURLOPT_HTTPHEADER => array( @@ -903,6 +871,81 @@ public static function get_postfile_form($post_fields, $file_fields) //); return array($delimiter, $data); } -} + /** + * html encoding transform + * + * @param string $html + * @param string $in + * @param string $out + * @param string $content + * @param string $mode + * auto|iconv|mb_convert_encoding + * @return string + */ + public static function encoding($html, $in = null, $out = null, $mode = 'auto') + { + $valid = array( + 'auto', + 'iconv', + 'mb_convert_encoding', + ); + if (isset(self::$output_encoding)) + { + $out = self::$output_encoding; + } + if ( ! isset($out)) + { + $out = 'UTF-8'; + } + if ( ! in_array($mode, $valid)) + { + throw new Exception('invalid mode, mode='.$mode); + } + $if = function_exists('mb_convert_encoding'); + $if = $if && ($mode == 'auto' || $mode == 'mb_convert_encoding'); + if (function_exists('iconv') && ($mode == 'auto' || $mode == 'iconv')) + { + $func = 'iconv'; + } + elseif ($if) + { + $func = 'mb_convert_encoding'; + } + else + { + throw new Exception('charsetTrans failed, no function'); + } + + $pattern = '/(]*?charset=([\"\']?))([a-z\d_\-]*)(\2[^>]*?>)/is'; + if ( ! isset($in)) + { + $n = preg_match($pattern, $html, $in); + if ($n > 0) + { + $in = $in[3]; + } + else + { + $in = null; + } + if (empty($in) and function_exists('mb_detect_encoding')) + { + $in = mb_detect_encoding($html, array('UTF-8', 'GBK', 'GB2312', 'LATIN1', 'ASCII', 'BIG5', 'ISO-8859-1')); + } + } + if (isset($in)) + { + if ($in == 'ISO-8859-1') + { + $in = 'UTF-8'; + } + $old = error_reporting(error_reporting() & ~E_NOTICE); + $html = call_user_func($func, $in, $out.'//IGNORE', $html); + error_reporting($old); + $html = preg_replace($pattern, "\\1$out\\4", $html, 1); + } + return $html; + } +} diff --git a/demo/qiushibaike.php b/demo/qiushibaike.php index b424ce3..eecfc85 100644 --- a/demo/qiushibaike.php +++ b/demo/qiushibaike.php @@ -16,7 +16,7 @@ 'name' => '糗事百科', 'log_show' => true, 'tasknum' => 1, - //'save_running_state' => true, + 'save_running_state' => true, 'domains' => array( 'qiushibaike.com', 'www.qiushibaike.com' @@ -54,14 +54,14 @@ //'pass' => 'root', //'name' => 'qiushibaike', //), - //'queue_config' => array( - //'host' => '127.0.0.1', - //'port' => 6379, - //'pass' => '', - //'db' => 5, - //'prefix' => 'phpspider', - //'timeout' => 30, - //), + 'queue_config' => array( + 'host' => '127.0.0.1', + 'port' => 6379, + 'pass' => 'foobared', + 'db' => 5, + 'prefix' => 'phpspider', + 'timeout' => 30, + ), 'fields' => array( array( 'name' => "article_title",