From 97f35be88dbb51179fa71095d24d400c42804ea2 Mon Sep 17 00:00:00 2001 From: lvsong <201099101@qq.com> Date: Mon, 21 Nov 2016 10:49:59 +0800 Subject: [PATCH] =?UTF-8?q?Update,=20=E4=BF=AE=E6=94=B9=E5=A4=87=E6=B3=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 + library/cls_curl.php | 268 ++++++++++++++++++------------------------ library/cls_query.php | 177 ++++++++++++---------------- 3 files changed, 191 insertions(+), 256 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c0c12a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/.idea/ + diff --git a/library/cls_curl.php b/library/cls_curl.php index 55ec992..1e2f216 100644 --- a/library/cls_curl.php +++ b/library/cls_curl.php @@ -1,4 +1,5 @@ + * @author seatle * @copyright seatle - * @link http://www.epooll.com/ - * @license http://www.opensource.org/licenses/mit-license.php MIT License + * @link http://www.epooll.com/ + * @license http://www.opensource.org/licenses/mit-license.php MIT License */ - -class cls_curl -{ - protected static $timeout = 10; - protected static $ch = null; - protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'; - protected static $http_raw = false; - protected static $cookie = null; - protected static $cookie_jar = null; +class cls_curl { + protected static $timeout = 10; + protected static $ch = null; + protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'; + protected static $http_raw = false; + protected static $cookie = null; + protected static $cookie_jar = null; protected static $cookie_file = null; - protected static $referer = null; - protected static $ip = null; - protected static $proxy = null; - protected static $headers = array(); - protected static $hosts = array(); - protected static $gzip = false; - protected static $info = array(); + protected static $referer = null; + protected static $ip = null; + protected static $proxy = null; + protected static $headers = array(); + protected static $hosts = array(); + protected static $gzip = false; + protected static $info = array(); /** * set timeout * - * @param init $timeout - * @return + * @param int $timeout */ - public static function set_timeout($timeout) - { + public static function set_timeout($timeout) { self::$timeout = $timeout; } /** * 设置代理 - * + * * @param mixed $proxy * @return void - * @author seatle + * @author seatle * @created time :2016-09-18 10:17 */ - public static function set_proxy($proxy) - { + public static function set_proxy($proxy) { self::$proxy = $proxy; } @@ -57,8 +53,7 @@ public static function set_proxy($proxy) * set referer * */ - public static function set_referer($referer) - { + public static function set_referer($referer) { self::$referer = $referer; } @@ -68,8 +63,7 @@ public static function set_referer($referer) * @param string $useragent * @return void */ - public static function set_useragent($useragent) - { + public static function set_useragent($useragent) { self::$useragent = $useragent; } @@ -79,8 +73,7 @@ public static function set_useragent($useragent) * @param string $cookie * @return void */ - public static function set_cookie($cookie) - { + public static function set_cookie($cookie) { self::$cookie = $cookie; } @@ -90,8 +83,7 @@ public static function set_cookie($cookie) * @param string $cookie_jar * @return void */ - public static function set_cookie_jar($cookie_jar) - { + public static function set_cookie_jar($cookie_jar) { self::$cookie_jar = $cookie_jar; } @@ -101,21 +93,19 @@ public static function set_cookie_jar($cookie_jar) * @param string $cookie_file * @return void */ - public static function set_cookie_file($cookie_file) - { + public static function set_cookie_file($cookie_file) { self::$cookie_file = $cookie_file; } /** * 获取内容的时候是不是连header也一起获取 - * + * * @param mixed $http_raw * @return void - * @author seatle + * @author seatle * @created time :2016-09-18 10:17 */ - public static function set_http_raw($http_raw) - { + public static function set_http_raw($http_raw) { self::$http_raw = $http_raw; } @@ -125,8 +115,7 @@ public static function set_http_raw($http_raw) * @param string $ip * @return void */ - public static function set_ip($ip) - { + public static function set_ip($ip) { self::$ip = $ip; } @@ -136,8 +125,7 @@ public static function set_ip($ip) * @param string $headers * @return void */ - public static function set_headers($headers) - { + public static function set_headers($headers) { self::$headers = $headers; } @@ -147,19 +135,17 @@ public static function set_headers($headers) * @param string $hosts * @return void */ - public static function set_hosts($hosts) - { + public static function set_hosts($hosts) { self::$hosts = $hosts; } /** * 设置Gzip * - * @param string $hosts + * @param string $gzip * @return void */ - public static function set_gzip($gzip) - { + public static function set_gzip($gzip) { self::$gzip = $gzip; } @@ -167,19 +153,17 @@ public static function set_gzip($gzip) * 初始化 CURL * */ - public static function init() - { + public static function init() { //if (empty ( self::$ch )) - if (!is_resource ( self::$ch )) - { - self::$ch = curl_init (); - curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true ); - curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout ); - curl_setopt( self::$ch, CURLOPT_HEADER, false ); - curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent ); - curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5); + if (!is_resource(self::$ch)) { + self::$ch = curl_init(); + curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout); + curl_setopt(self::$ch, CURLOPT_HEADER, false); + curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent); + curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5); // 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生 - curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true); + curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true); } return self::$ch; } @@ -187,11 +171,12 @@ public static function init() /** * get * - * + * @param string $url + * @param array $fields + * @return mixed */ - public static function get($url, $fields = array()) - { - self::init (); + public static function get($url, $fields = array()) { + self::init(); return self::http_request($url, 'get', $fields); } @@ -199,119 +184,99 @@ public static function get($url, $fields = array()) * $fields 有三种类型:1、数组;2、http query;3、json * 1、array('name'=>'yangzetao') 2、http_build_query(array('name'=>'yangzetao')) 3、json_encode(array('name'=>'yangzetao')) * 前两种是普通的post,可以用$_POST方式获取 - * 第三种是post stream( json rpc,其实就是webservice ),虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取 - * - * @param mixed $url - * @param array $fields - * @param mixed $proxy + * 第三种是post stream( json rpc,其实就是webservice ),虽然是post方式,但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取 + * + * @param mixed $url + * @param array $fields * @static * @access public - * @return void + * @return mixed */ - public static function post($url, $fields = array()) - { - self::init (); + //* @param mixed $proxy + public static function post($url, $fields = array()) { + self::init(); return self::http_request($url, 'post', $fields); } - public static function http_request($url, $type = 'get', $fields) - { + public static function http_request($url, $type = 'get', $fields) { // 如果是 get 方式,直接拼凑一个 url 出来 - if (strtolower($type) == 'get' && !empty($fields)) - { - $url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields); + if (strtolower($type) == 'get' && !empty($fields)) { + $url = $url . (strpos($url, "?") === false ? "?" : "&") . http_build_query($fields); } // 随机绑定 hosts,做负载均衡 - if (self::$hosts) - { - $parse_url = parse_url($url); - $host = $parse_url['host']; - $key = rand(0, count(self::$hosts)-1); - $ip = self::$hosts[$key]; - $url = str_replace($host, $ip, $url); - self::$headers = array_merge( array('Host:'.$host), self::$headers ); + if (self::$hosts) { + $parse_url = parse_url($url); + $host = $parse_url['host']; + $key = rand(0, count(self::$hosts) - 1); + $ip = self::$hosts[$key]; + $url = str_replace($host, $ip, $url); + self::$headers = array_merge(array('Host:' . $host), self::$headers); } - curl_setopt( self::$ch, CURLOPT_URL, $url ); + curl_setopt(self::$ch, CURLOPT_URL, $url); // 如果是 post 方式 - if (strtolower($type) == 'post') - { - curl_setopt( self::$ch, CURLOPT_POST, true ); - curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields ); + if (strtolower($type) == 'post') { + curl_setopt(self::$ch, CURLOPT_POST, true); + curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields); } - if (self::$useragent) - { - curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent ); + if (self::$useragent) { + curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent); } - if (self::$cookie) - { - curl_setopt( self::$ch, CURLOPT_COOKIE, self::$cookie ); + if (self::$cookie) { + curl_setopt(self::$ch, CURLOPT_COOKIE, self::$cookie); } - if (self::$cookie_jar) - { - curl_setopt( self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar ); + if (self::$cookie_jar) { + curl_setopt(self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar); } - if (self::$cookie_file) - { - curl_setopt( self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file ); + if (self::$cookie_file) { + curl_setopt(self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file); } - if (self::$referer) - { - curl_setopt( self::$ch, CURLOPT_REFERER, self::$referer ); + if (self::$referer) { + curl_setopt(self::$ch, CURLOPT_REFERER, self::$referer); } - if (self::$ip) - { - self::$headers = array_merge( array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), self::$headers ); + if (self::$ip) { + self::$headers = array_merge(array('CLIENT-IP:' . self::$ip, 'X-FORWARDED-FOR:' . self::$ip), self::$headers); } - if (self::$headers) - { - curl_setopt( self::$ch, CURLOPT_HTTPHEADER, self::$headers ); + if (self::$headers) { + curl_setopt(self::$ch, CURLOPT_HTTPHEADER, self::$headers); } - if (self::$gzip) - { - curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' ); + if (self::$gzip) { + curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip'); } - if (self::$proxy) - { - curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxy ); + if (self::$proxy) { + curl_setopt(self::$ch, CURLOPT_PROXY, self::$proxy); } - if (self::$http_raw) - { - curl_setopt( self::$ch, CURLOPT_HEADER, true ); + if (self::$http_raw) { + curl_setopt(self::$ch, CURLOPT_HEADER, true); } - $data = curl_exec ( self::$ch ); + $data = curl_exec(self::$ch); self::$info = curl_getinfo(self::$ch); - if ($data === false) - { + if ($data === false) { //echo date("Y-m-d H:i:s"), ' Curl error: ' . curl_error( self::$ch ), "\n"; } // 关闭句柄 - curl_close( self::$ch ); + curl_close(self::$ch); //$data = substr($data, 10); //$data = gzinflate($data); return $data; } - public static function get_info() - { + public static function get_info() { return self::$info; } - public static function get_http_code() - { + public static function get_http_code() { return self::$info['http_code']; } } -function classic_curl($urls, $delay) -{ +function classic_curl($urls, $delay) { $queue = curl_multi_init(); - $map = array(); + $map = array(); - foreach ($urls as $url) - { + foreach ($urls as $url) { // create cURL resources $ch = curl_init(); @@ -335,10 +300,9 @@ function classic_curl($urls, $delay) } while ($mrc == CURLM_CALL_MULTI_PERFORM); while ($active > 0 && $mrc == CURLM_OK) { - while (curl_multi_exec($queue, $active) === CURLM_CALL_MULTI_PERFORM); + while (curl_multi_exec($queue, $active) === CURLM_CALL_MULTI_PERFORM) ; // 这里 curl_multi_select 一直返回 -1,所以这里就死循环了,CPU就100%了 - if (curl_multi_select($queue, 0.5) != -1) - { + if (curl_multi_select($queue, 0.5) != -1) { do { $mrc = curl_multi_exec($queue, $active); } while ($mrc == CURLM_CALL_MULTI_PERFORM); @@ -346,7 +310,7 @@ function classic_curl($urls, $delay) } $responses = array(); - foreach ($map as $url=>$ch) { + foreach ($map as $url => $ch) { //$responses[$url] = callback(curl_multi_getcontent($ch), $delay); $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url); curl_multi_remove_handle($queue, $ch); @@ -357,10 +321,9 @@ function classic_curl($urls, $delay) return $responses; } -function rolling_curl($urls, $delay) -{ +function rolling_curl($urls, $delay) { $queue = curl_multi_init(); - $map = array(); + $map = array(); foreach ($urls as $url) { $ch = curl_init(); @@ -373,27 +336,29 @@ function rolling_curl($urls, $delay) $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1'; curl_setopt($ch, CURLOPT_COOKIE, $cookie); $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36'; - curl_setopt( $ch, CURLOPT_USERAGENT, $useragent ); + curl_setopt($ch, CURLOPT_USERAGENT, $useragent); curl_setopt($ch, CURLOPT_ENCODING, 'gzip'); curl_multi_add_handle($queue, $ch); - $map[(string) $ch] = $url; + $map[(string)$ch] = $url; } $responses = array(); do { while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ; - if ($code != CURLM_OK) { break; } + if ($code != CURLM_OK) { + break; + } // a request was just completed -- find out which one while ($done = curl_multi_info_read($queue)) { // get the info and content returned on the request - $info = curl_getinfo($done['handle']); - $error = curl_error($done['handle']); - $results = callback(curl_multi_getcontent($done['handle']), $delay, $map[(string) $done['handle']]); - $responses[$map[(string) $done['handle']]] = compact('info', 'error', 'results'); + $info = curl_getinfo($done['handle']); + $error = curl_error($done['handle']); + $results = callback(curl_multi_getcontent($done['handle']), $delay, $map[(string)$done['handle']]); + $responses[$map[(string)$done['handle']]] = compact('info', 'error', 'results'); // remove the curl handle that just completed curl_multi_remove_handle($queue, $done['handle']); @@ -414,9 +379,8 @@ function rolling_curl($urls, $delay) function callback($data, $delay, $url) { //echo $data; //echo date("Y-m-d H:i:s", time()) . " --- " . $url . "\n"; - if (!empty($data)) - { - file_put_contents("./html2/".md5($url).".html", $data); + if (!empty($data)) { + file_put_contents("./html2/" . md5($url) . ".html", $data); } // usleep模拟现实中比较负责的数据处理逻辑(如提取, 分词, 写入文件或数据库等) //usleep(1); diff --git a/library/cls_query.php b/library/cls_query.php index 65dd9fe..df035c0 100644 --- a/library/cls_query.php +++ b/library/cls_query.php @@ -1,83 +1,72 @@ + * @author seatle * @created time :2015-08-08 15:52 */ - private static function get_nodes($query) - { + private static function get_nodes($query) { // 把一到多个空格 替换成 一个空格 // 把 > 和 ~ 符号两边的空格去掉,因为没有用这两个符号,所以这里可以不这么做 // ul>li.className $query = trim( - preg_replace('@\s+@', ' ', - preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query) - ) - ); + preg_replace('@\s+@', ' ', + preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query) + ) + ); $nodes = array(); - if (! $query) - { - return $nodes; + if (!$query) { + return $nodes; } $query_arr = explode(" ", $query); - foreach ($query_arr as $k=>$v) - { - $path = $k == 0 ? $v : $path.' '.$v; - $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array()); + foreach ($query_arr as $k => $v) { + $path = $k == 0 ? $v : $path . ' ' . $v; + $node = array("path" => (string)$path, "name" => "", "id" => "", "class" => "", "other" => array()); // 如果存在内容选择器 - if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3])) - { + if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3])) { // 把选择器过滤掉 [rel='topic'] - $v = $matches[1]; + $v = $matches[1]; $node['other'] = array( - 'key'=>$matches[2], - 'val'=>$matches[3], + 'key' => $matches[2], + 'val' => $matches[3], ); } // 如果存在 id - $id_arr = explode("#", $v); + $id_arr = explode("#", $v); $class_arr = explode(".", $v); - if (count($id_arr) === 2) - { + if (count($id_arr) === 2) { $node['name'] = $id_arr[0]; - $node['id'] = $id_arr[1]; - } - // 如果存在 class - elseif (count($class_arr) === 2) - { - $node['name'] = $class_arr[0]; + $node['id'] = $id_arr[1]; + } // 如果存在 class + elseif (count($class_arr) === 2) { + $node['name'] = $class_arr[0]; $node['class'] = $class_arr[1]; - } - // 如果没有样式 - else - { + } // 如果没有样式 + else { $node['name'] = $v; } $nodes[] = $node; @@ -87,44 +76,36 @@ private static function get_nodes($query) return $nodes; } - public static function get_datas($nodes, $attr = "html") - { - if (empty(self::$content)) - { + public static function get_datas($nodes, $attr = "html") { + if (empty(self::$content)) { return false; } $node_datas = array(); - $count = count($nodes); + $count = count($nodes); // 循环所有节点 - foreach ($nodes as $i=>$node) - { - $is_last = $count == $i+1 ? true : false; + foreach ($nodes as $i => $node) { + $is_last = $count == $i + 1 ? true : false; // 第一次 - if ($i == 0) - { + if ($i == 0) { $datas = array(); $datas = self::get_node_datas($node, self::$content, $attr, $is_last); // 如果第一次都取不到数据,直接跳出循环 - if(!$datas) - { + if (!$datas) { break; } $node_datas[$nodes[$i]['path']] = $datas; - } - else - { + } else { $datas = array(); // 循环上一个节点的数组 - foreach ($node_datas[$nodes[$i-1]['path']] as $v) - { - $datas = array_merge( $datas, self::get_node_datas($node, trim($v), $attr, $is_last) ); + foreach ($node_datas[$nodes[$i - 1]['path']] as $v) { + $datas = array_merge($datas, self::get_node_datas($node, trim($v), $attr, $is_last)); } $node_datas[$nodes[$i]['path']] = $datas; // 删除上一个节点,防止内存溢出,或者缓存到本地,再次使用?! - unset($node_datas[$nodes[$i-1]['path']]); + unset($node_datas[$nodes[$i - 1]['path']]); } - } + } //print_r($datas);exit; // 从数组中弹出最后一个元素 $node_datas = array_pop($node_datas); @@ -136,65 +117,55 @@ public static function get_datas($nodes, $attr = "html") /** * 从节点中获取内容 * $regex = '@]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i'; - * + * * @param mixed $node * @param mixed $content * @return void - * @author seatle + * @author seatle * @created time :2015-08-08 15:52 */ - private static function get_node_datas($node, $content, $attr = "html", $is_last = false) - { + private static function get_node_datas($node, $content, $attr = "html", $is_last = false) { $node_datas = $datas = array(); - if (!empty($node['id'])) - { - if ($node['name']) - $regex = '@<'.$node['name'].'[^>]+id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)@is'; - else - $regex = '@id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)<@is'; - } - elseif (!empty($node['class'])) - { - if ($node['name']) - $regex = '@<'.$node['name'].'[^>]+class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)@is'; - else - $regex = '@class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)<@is'; - } - else - { + if (!empty($node['id'])) { + if ($node['name']) { + $regex = '@<' . $node['name'] . '[^>]+id\\s*=\\s*["|\']+?' . $node['id'] . '\\s*[^>]+?>(.*?)@is'; + } else { + $regex = '@id\\s*=\\s*["|\']+?' . $node['id'] . '\\s*[^>]+?>(.*?)<@is'; + } + } elseif (!empty($node['class'])) { + if ($node['name']) { + $regex = '@<' . $node['name'] . '[^>]+class\\s*=\\s*["|\']+?' . $node['class'] . '\\s*[^>]+?>(.*?)@is'; + } else { + $regex = '@class\\s*=\\s*["|\']+?' . $node['class'] . '\\s*[^>]+?>(.*?)<@is'; + } + } else { // 这里为是么是*,0次到多次,因为有可能是
  • - $regex = '@<'.$node['name'].'[^>]*?>(.*?)@is'; + $regex = '@<' . $node['name'] . '[^>]*?>(.*?)@is'; } self::log("regex --- " . $regex);; preg_match_all($regex, $content, $matches); - $all_datas = empty($matches[0]) ? array() : $matches[0]; + $all_datas = empty($matches[0]) ? array() : $matches[0]; $html_datas = empty($matches[1]) ? array() : $matches[1]; // 过滤掉选择器对不上的 - foreach ($all_datas as $i=>$data) - { + foreach ($all_datas as $i => $data) { // 如果有设置其他选择器,验证一下选择器 - if (!empty($node['other'])) - { - $regex = '@'.$node['other']['key'].'=[\'|"]'.$node['other']['val'].'[\'|"]@is'; + if (!empty($node['other'])) { + $regex = '@' . $node['other']['key'] . '=[\'|"]' . $node['other']['val'] . '[\'|"]@is'; self::log("regex other --- " . $regex); // 过滤器对不上的,跳过 - if (!preg_match($regex, $data, $matches)) - { + if (!preg_match($regex, $data, $matches)) { continue; } } // 获取节点的html内容 - if ($attr != "html" && $is_last) - { - $regex = '@'.$attr.'=[\'|"](.*?)[\'|"]@is'; - preg_match($regex, $data, $matches); + if ($attr != "html" && $is_last) { + $regex = '@' . $attr . '=[\'|"](.*?)[\'|"]@is'; + preg_match($regex, $data, $matches); $node_datas[] = empty($matches[1]) ? '' : trim($matches[1]); - } - // 获取节点属性名的值 - else - { + } // 获取节点属性名的值 + else { $node_datas[] = trim($html_datas[$i]); } } @@ -209,11 +180,9 @@ private static function get_node_datas($node, $content, $attr = "html", $is_last * @param string $msg * @return void */ - private static function log($msg) - { - $msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n"; - if (self::$debug) - { + private static function log($msg) { + $msg = "[" . date("Y-m-d H:i:s") . "] " . $msg . "\n"; + if (self::$debug) { echo $msg; } }