mirror of
https://gitee.com/ledc/IYUUAutoReseed
synced 2025-07-08 08:31:53 +00:00
IYUUAutoReseed初始化版本库v0.2.0
This commit is contained in:
425
vendor/owner888/phpspider/library/cls_curl.php
vendored
Normal file
425
vendor/owner888/phpspider/library/cls_curl.php
vendored
Normal file
@ -0,0 +1,425 @@
|
||||
<?php
|
||||
/**
|
||||
 * cURL HTTP request helper class (single-handle GET/POST wrapper)
|
||||
*
|
||||
* Licensed under The MIT License
|
||||
* For full copyright and license information, please see the MIT-LICENSE.txt
|
||||
* Redistributions of files must retain the above copyright notice.
|
||||
*
|
||||
* @author seatle<seatle@foxmail.com>
|
||||
* @copyright seatle<seatle@foxmail.com>
|
||||
* @link http://www.epooll.com/
|
||||
* @license http://www.opensource.org/licenses/mit-license.php MIT License
|
||||
*/
|
||||
|
||||
/**
 * Static convenience wrapper around a single cURL handle.
 *
 * Configure the request through the set_*() methods, then call get() or
 * post(). All configuration is kept in static properties and therefore
 * applies to every following request until it is changed again.
 */
class cls_curl
{
    /** @var int connect timeout in seconds; the total transfer timeout is this value + 5 */
    protected static $timeout = 10;
    /** @var resource|object|null current cURL handle, null when no handle is open */
    protected static $ch = null;
    protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
    /** @var bool when true the response headers are included in the returned body */
    protected static $http_raw = false;
    protected static $cookie = null;
    protected static $cookie_jar = null;
    protected static $cookie_file = null;
    protected static $referer = null;
    /** @var string|null value sent as CLIENT-IP / X-FORWARDED-FOR headers */
    protected static $ip = null;
    protected static $proxy = null;
    /** @var array extra request headers, each entry a full "Name: value" line */
    protected static $headers = array();
    /** @var array list of IPs; one is picked at random to replace the URL host */
    protected static $hosts = array();
    protected static $gzip = false;
    /** @var array curl_getinfo() result of the last request */
    protected static $info = array();
    /** @var string curl_error() text of the last request, '' when it succeeded */
    protected static $error = '';

    /**
     * Set the connect timeout.
     *
     * @param int $timeout seconds
     * @return void
     */
    public static function set_timeout($timeout)
    {
        self::$timeout = $timeout;
    }

    /**
     * Set the proxy passed to CURLOPT_PROXY.
     *
     * @param mixed $proxy
     * @return void
     */
    public static function set_proxy($proxy)
    {
        self::$proxy = $proxy;
    }

    /**
     * Set the Referer header value.
     *
     * @param string $referer
     * @return void
     */
    public static function set_referer($referer)
    {
        self::$referer = $referer;
    }

    /**
     * Set the User-Agent header value.
     *
     * @param string $useragent
     * @return void
     */
    public static function set_useragent($useragent)
    {
        self::$useragent = $useragent;
    }

    /**
     * Set the raw Cookie header value.
     *
     * @param string $cookie
     * @return void
     */
    public static function set_cookie($cookie)
    {
        self::$cookie = $cookie;
    }

    /**
     * Set the file cURL writes cookies to (CURLOPT_COOKIEJAR).
     *
     * @param string $cookie_jar
     * @return void
     */
    public static function set_cookie_jar($cookie_jar)
    {
        self::$cookie_jar = $cookie_jar;
    }

    /**
     * Set the file cURL reads cookies from (CURLOPT_COOKIEFILE).
     *
     * @param string $cookie_file
     * @return void
     */
    public static function set_cookie_file($cookie_file)
    {
        self::$cookie_file = $cookie_file;
    }

    /**
     * Choose whether the response headers are returned together with the body.
     *
     * @param mixed $http_raw truthy to include headers
     * @return void
     */
    public static function set_http_raw($http_raw)
    {
        self::$http_raw = $http_raw;
    }

    /**
     * Set the IP sent in the CLIENT-IP and X-FORWARDED-FOR headers.
     *
     * @param string $ip
     * @return void
     */
    public static function set_ip($ip)
    {
        self::$ip = $ip;
    }

    /**
     * Set the extra request headers.
     *
     * @param array $headers list of "Name: value" lines
     * @return void
     */
    public static function set_headers($headers)
    {
        self::$headers = $headers;
    }

    /**
     * Set the IP list used to replace the URL host (simple load balancing).
     *
     * @param array $hosts zero-indexed list of IPs
     * @return void
     */
    public static function set_hosts($hosts)
    {
        self::$hosts = $hosts;
    }

    /**
     * Enable or disable gzip response decoding.
     *
     * @param bool $gzip
     * @return void
     */
    public static function set_gzip($gzip)
    {
        self::$gzip = $gzip;
    }

    /**
     * Create the cURL handle if none is currently open.
     *
     * @return resource|object the cURL handle
     */
    public static function init()
    {
        // PHP < 8 exposes handles as resources, PHP >= 8 as CurlHandle objects.
        $alive = is_resource(self::$ch) || self::$ch instanceof CurlHandle;
        if (!$alive)
        {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout);
            curl_setopt(self::$ch, CURLOPT_HEADER, false);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
            // Avoid signal-based timeouts, which are unsafe when several
            // transfers share a process.
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
        }
        return self::$ch;
    }

    /**
     * Perform a GET request.
     *
     * @param string $url
     * @param array  $fields query parameters appended to $url
     * @return string|false response body, false on failure
     */
    public static function get($url, $fields = array())
    {
        self::init();
        return self::http_request($url, 'get', $fields);
    }

    /**
     * Perform a POST request.
     *
     * $fields may be an array, an http_build_query() string or a JSON string;
     * it is handed to CURLOPT_POSTFIELDS unchanged.
     *
     * @param string       $url
     * @param array|string $fields
     * @return string|false response body, false on failure
     */
    public static function post($url, $fields = array())
    {
        self::init();
        return self::http_request($url, 'post', $fields);
    }

    /**
     * Execute a request with the currently configured options.
     *
     * The handle is released after every call; details of the transfer stay
     * available through get_info(), get_http_code() and get_error().
     *
     * @param string       $url
     * @param string       $type   'get' or 'post' (case-insensitive)
     * @param array|string $fields query parameters (get) or request body (post)
     * @return string|false response body, false on failure
     */
    public static function http_request($url, $type = 'get', $fields = array())
    {
        // Make direct calls work even when init() was not called first.
        self::init();
        $method = strtolower($type);

        // GET: fold the fields into the query string.
        if ($method == 'get' && !empty($fields))
        {
            $url = $url . (strpos($url, "?") === false ? "?" : "&") . http_build_query($fields);
        }

        // Work on a per-request copy so the Host / IP headers added below do
        // not pile up in the shared static list across requests.
        $headers = self::$headers;

        // Bind a random IP from the hosts list and keep the real name in Host.
        if (self::$hosts)
        {
            $host = parse_url($url, PHP_URL_HOST);
            if ($host)
            {
                $ip = self::$hosts[rand(0, count(self::$hosts) - 1)];
                // Replace only the first occurrence (the authority part), not
                // matching text inside the path or query string.
                $pos = strpos($url, $host);
                if ($pos !== false)
                {
                    $url = substr_replace($url, $ip, $pos, strlen($host));
                }
                $headers = array_merge(array('Host:'.$host), $headers);
            }
        }
        curl_setopt(self::$ch, CURLOPT_URL, $url);

        if ($method == 'post')
        {
            curl_setopt(self::$ch, CURLOPT_POST, true);
            curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields);
        }
        if (self::$useragent)
        {
            curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
        }
        if (self::$cookie)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIE, self::$cookie);
        }
        if (self::$cookie_jar)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar);
        }
        if (self::$cookie_file)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file);
        }
        if (self::$referer)
        {
            curl_setopt(self::$ch, CURLOPT_REFERER, self::$referer);
        }
        if (self::$ip)
        {
            $headers = array_merge(array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), $headers);
        }
        if ($headers)
        {
            curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $headers);
        }
        if (self::$gzip)
        {
            curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip');
        }
        if (self::$proxy)
        {
            curl_setopt(self::$ch, CURLOPT_PROXY, self::$proxy);
        }
        if (self::$http_raw)
        {
            curl_setopt(self::$ch, CURLOPT_HEADER, true);
        }

        $data = curl_exec(self::$ch);
        self::$info = curl_getinfo(self::$ch);
        // Keep the failure reason instead of dropping it silently.
        self::$error = ($data === false) ? curl_error(self::$ch) : '';

        // Release the handle; the next request starts from a fresh one.
        if (is_resource(self::$ch))
        {
            curl_close(self::$ch);
        }
        self::$ch = null;

        return $data;
    }

    /**
     * @return array curl_getinfo() data of the last request, empty before the first one
     */
    public static function get_info()
    {
        return self::$info;
    }

    /**
     * @return int HTTP status of the last request, 0 when none is available
     */
    public static function get_http_code()
    {
        return isset(self::$info['http_code']) ? self::$info['http_code'] : 0;
    }

    /**
     * @return string cURL error text of the last request, '' when it succeeded
     */
    public static function get_error()
    {
        return self::$error;
    }
}
|
||||
|
||||
/**
 * Fetch all URLs in parallel with curl_multi and hand each body to callback()
 * once every transfer has finished.
 *
 * @param array $urls  URLs to fetch; duplicates are fetched once
 * @param mixed $delay forwarded unchanged to callback()
 * @return array map of url => callback() result
 */
function classic_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();

    foreach ($urls as $url)
    {
        // Results are keyed by URL, so a second handle for the same URL would
        // only overwrite the first one and leave it unreleased.
        if (isset($map[$url]))
        {
            continue;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);

        curl_multi_add_handle($queue, $ch);
        $map[$url] = $ch;
    }

    $active = null;

    // Kick off the transfers.
    do {
        $mrc = curl_multi_exec($queue, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    while ($active > 0 && $mrc == CURLM_OK) {
        // curl_multi_select() can return -1 immediately; pause briefly in
        // that case so the loop does not spin at 100% CPU.
        if (curl_multi_select($queue, 0.5) == -1)
        {
            usleep(1000);
        }
        do {
            $mrc = curl_multi_exec($queue, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    $responses = array();
    foreach ($map as $url => $ch) {
        $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
        curl_multi_remove_handle($queue, $ch);
        curl_close($ch);
    }

    curl_multi_close($queue);
    return $responses;
}
|
||||
|
||||
/**
 * Fetch all URLs in parallel with curl_multi, passing each response to
 * callback() as soon as its transfer completes.
 *
 * @param array $urls  URLs to fetch
 * @param mixed $delay forwarded unchanged to callback()
 * @return array map of url => array('info' => ..., 'error' => ..., 'results' => ...)
 */
function rolling_curl($urls, $delay)
{
    $queue = curl_multi_init();
    $map = array();      // handle key => url
    $handles = array();  // handle key => handle still attached to $queue

    foreach ($urls as $url) {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);
        // NOTE(review): hard-coded session cookie that is sent to every URL
        // passed in — looks like leftover test data; confirm and move it out
        // of the source.
        $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');

        curl_multi_add_handle($queue, $ch);

        // Handles are objects on PHP >= 8 and cannot be cast to string there.
        $key = is_object($ch) ? spl_object_hash($ch) : (string) $ch;
        $map[$key] = $url;
        $handles[$key] = $ch;
    }

    $responses = array();
    $active = null;
    do {
        while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ;

        if ($code != CURLM_OK) { break; }

        // Drain every transfer that has just completed.
        while ($done = curl_multi_info_read($queue)) {
            $handle = $done['handle'];
            $key = is_object($handle) ? spl_object_hash($handle) : (string) $handle;
            $url = $map[$key];

            $info = curl_getinfo($handle);
            $error = curl_error($handle);
            $results = callback(curl_multi_getcontent($handle), $delay, $url);
            $responses[$url] = compact('info', 'error', 'results');

            curl_multi_remove_handle($queue, $handle);
            curl_close($handle);
            unset($handles[$key]);
        }

        // Wait for activity on any remaining transfer.
        if ($active > 0) {
            curl_multi_select($queue, 0.5);
        }

    } while ($active);

    // Release handles left over when the loop stopped on an error.
    foreach ($handles as $handle) {
        curl_multi_remove_handle($queue, $handle);
        curl_close($handle);
    }

    curl_multi_close($queue);
    return $responses;
}
|
||||
|
||||
/**
 * Store a fetched page as ./html2/<md5 of url>.html.
 *
 * Empty responses are ignored. The target directory is created on demand so
 * the first write does not fail when it is missing.
 *
 * @param string $data  response body
 * @param mixed  $delay unused
 * @param string $url   source URL, used to derive the file name
 * @return void
 */
function callback($data, $delay, $url)
{
    if (empty($data))
    {
        return;
    }

    $dir = "./html2";
    // The second is_dir() covers another process creating it concurrently.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir))
    {
        return;
    }
    file_put_contents($dir."/".md5($url).".html", $data);
}
|
||||
|
248
vendor/owner888/phpspider/library/cls_query.php
vendored
Normal file
248
vendor/owner888/phpspider/library/cls_query.php
vendored
Normal file
@ -0,0 +1,248 @@
|
||||
<?php
|
||||
/**
 * Minimal regex-based selector engine for HTML strings.
 *
 * Supports space-separated descendant steps of the form "tag", "tag#id",
 * "tag.class", "#id", ".class", each optionally followed by one
 * [attr='value'] filter, e.g. "div#main a.link[href='/x']".
 *
 * Usage:
 *   cls_query::init($html);
 *   $hrefs = cls_query::query("ul#nav li a", "href");
 */
class cls_query
{
    /** @var string HTML document queried by query() / get_datas() */
    private static $content;
    /** @var bool when true the generated regexes are echoed */
    public static $debug = false;

    /**
     * Set the HTML document subsequent queries run against.
     *
     * @param string $content
     * @return void
     */
    public static function init($content)
    {
        self::$content = $content;
    }

    /**
     * Run a selector against the current document.
     *
     * @param string $query selector
     * @param string $attr  "html" for inner HTML, otherwise an attribute name
     *                      read from the elements matched by the last step
     * @return array|false|null list of strings; false when no document is
     *                          set; null when nothing matched the first step
     */
    public static function query($query, $attr = "html")
    {
        $nodes = self::get_nodes($query);
        $datas = self::get_datas($nodes, $attr);
        return $datas;
    }

    protected static function is_char($char) {
        return extension_loaded('mbstring') ? mb_eregi('\w', $char) : preg_match('@\w@', $char);
    }

    /**
     * Parse a selector into a list of step descriptors.
     *
     * @param string $query
     * @return array list of array('path', 'name', 'id', 'class', 'other')
     */
    private static function get_nodes($query)
    {
        // Drop whitespace around > + ~ and collapse remaining whitespace runs.
        $query = trim(
            preg_replace('@\s+@', ' ',
                preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)
            )
        );

        $nodes = array();
        if (! $query)
        {
            return $nodes;
        }

        $path = '';
        foreach (explode(" ", $query) as $k => $v)
        {
            // Cumulative selector up to this step; used as a unique key.
            $path = $k == 0 ? $v : $path.' '.$v;
            $node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array());

            // Attribute filter such as [rel='topic']: remember it and strip it.
            if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3]))
            {
                $v = $matches[1];
                $node['other'] = array(
                    'key'=>$matches[2],
                    'val'=>$matches[3],
                );
            }

            $id_arr = explode("#", $v);
            $class_arr = explode(".", $v);
            if (count($id_arr) === 2)
            {
                // tag#id
                $node['name'] = $id_arr[0];
                $node['id'] = $id_arr[1];
            }
            elseif (count($class_arr) === 2)
            {
                // tag.class
                $node['name'] = $class_arr[0];
                $node['class'] = $class_arr[1];
            }
            else
            {
                // bare tag
                $node['name'] = $v;
            }
            $nodes[] = $node;
        }
        return $nodes;
    }

    /**
     * Resolve parsed selector steps against the current document.
     *
     * Each step is matched inside the fragments produced by the previous one.
     *
     * @param array  $nodes step descriptors from get_nodes()
     * @param string $attr  "html" or an attribute name
     * @return array|false|null see query()
     */
    public static function get_datas($nodes, $attr = "html")
    {
        if (empty(self::$content))
        {
            return false;
        }

        $node_datas = array();
        $count = count($nodes);
        foreach ($nodes as $i => $node)
        {
            $is_last = $count == $i + 1;
            if ($i == 0)
            {
                $datas = self::get_node_datas($node, self::$content, $attr, $is_last);
                // Nothing matched the first step: no point in going deeper.
                if (!$datas)
                {
                    break;
                }
                $node_datas[$nodes[$i]['path']] = $datas;
            }
            else
            {
                $datas = array();
                // Search inside every fragment found by the previous step.
                foreach ($node_datas[$nodes[$i-1]['path']] as $v)
                {
                    $datas = array_merge($datas, self::get_node_datas($node, trim($v), $attr, $is_last));
                }
                $node_datas[$nodes[$i]['path']] = $datas;
                // Drop the previous level to keep memory usage flat.
                unset($node_datas[$nodes[$i-1]['path']]);
            }
        }
        // Only the deepest level is left; return it.
        $node_datas = array_pop($node_datas);
        return $node_datas;
    }

    /**
     * Extract the matches of one selector step from an HTML fragment.
     *
     * All selector parts are regex-quoted, so characters such as "?", "."
     * or "+" in ids, classes and attribute values match literally.
     *
     * @param array  $node    step descriptor
     * @param string $content HTML fragment to search
     * @param string $attr    "html" or an attribute name
     * @param bool   $is_last whether this is the final step; attribute
     *                        extraction only happens there
     * @return array list of inner-HTML strings or attribute values
     */
    private static function get_node_datas($node, $content, $attr = "html", $is_last = false)
    {
        $node_datas = array();
        $name = preg_quote($node['name'], '@');

        // Pick the attribute (id or class) that anchors this step, if any.
        $anchor = '';
        $anchor_val = '';
        if (!empty($node['id']))
        {
            $anchor = 'id';
            $anchor_val = preg_quote($node['id'], '@');
        }
        elseif (!empty($node['class']))
        {
            $anchor = 'class';
            $anchor_val = preg_quote($node['class'], '@');
        }

        if ($anchor !== '')
        {
            if ($node['name'])
                $regex = '@<'.$name.'[^>]+'.$anchor.'\\s*=\\s*["|\']+?'.$anchor_val.'\\s*[^>]+?>(.*?)</'.$name.'>@is';
            else
                $regex = '@'.$anchor.'\\s*=\\s*["|\']+?'.$anchor_val.'\\s*[^>]+?>(.*?)<@is';
        }
        else
        {
            // [^>]*? (zero or more) because the tag may carry no attributes, e.g. <li>.
            $regex = '@<'.$name.'[^>]*?>(.*?)</'.$name.'>@is';
        }
        self::log("regex --- " . $regex);
        preg_match_all($regex, $content, $matches);
        $all_datas = empty($matches[0]) ? array() : $matches[0];
        $html_datas = empty($matches[1]) ? array() : $matches[1];

        foreach ($all_datas as $i => $data)
        {
            // Enforce the [key='val'] filter when the step carries one.
            if (!empty($node['other']))
            {
                $regex = '@'.preg_quote($node['other']['key'], '@').'=[\'|"]'.preg_quote($node['other']['val'], '@').'[\'|"]@is';
                self::log("regex other --- " . $regex);
                if (!preg_match($regex, $data, $matches))
                {
                    continue;
                }
            }
            if ($attr != "html" && $is_last)
            {
                // Final step and an attribute was requested: return its value.
                $regex = '@'.preg_quote($attr, '@').'=[\'|"](.*?)[\'|"]@is';
                preg_match($regex, $data, $matches);
                $node_datas[] = empty($matches[1]) ? '' : trim($matches[1]);
            }
            else
            {
                // Otherwise return the element's inner HTML.
                $node_datas[] = trim($html_datas[$i]);
            }
        }
        return $node_datas;
    }

    /**
     * Echo a timestamped debug line when $debug is enabled.
     *
     * @param string $msg
     * @return void
     */
    private static function log($msg)
    {
        if (self::$debug)
        {
            echo "[".date("Y-m-d H:i:s")."] " . $msg . "\n";
        }
    }

}
|
||||
|
||||
//$xpath = "ul.top-nav-dropdown li";
|
||||
//$xpath = "i.zg-icon";
|
||||
//print_r($nodes);
|
||||
//exit;
|
||||
// [^>]+ 不是>的字符重复一次到多次, ? 表示不贪婪
|
||||
// \s 表示空白字符
|
||||
// * 表示0次或者多次
|
||||
// + 表示1次或者多次
|
||||
//
|
||||
// 后向引用,表示表达式中,从左往右数,第一个左括号对应的括号内的内容。
|
||||
// \\0 表示整个表达式
|
||||
// \\1表示第1个表达式
|
||||
// \\2表示第2个表达式
|
||||
// $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
|
||||
//preg_match_all($regex, $content, $matches);
|
||||
//print_r($matches);
|
||||
//exit;
|
||||
|
||||
// 用法
|
||||
//$content = file_get_contents("./test.html");
|
||||
//$query = "ul#top-nav-profile-dropdown li a";
|
||||
//$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']";
|
||||
//cls_query::init($content);
|
||||
//$list = cls_query::query($query, "href");
|
||||
//print_r($list);
|
||||
|
1263
vendor/owner888/phpspider/library/cls_redis.php
vendored
Normal file
1263
vendor/owner888/phpspider/library/cls_redis.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
121
vendor/owner888/phpspider/library/cls_redis_client.php
vendored
Normal file
121
vendor/owner888/phpspider/library/cls_redis_client.php
vendored
Normal file
@ -0,0 +1,121 @@
|
||||
<?php
|
||||
/**
|
||||
* redis 客户端
|
||||
* redis的协议可参考这个文章http://redis.cn/topics/protocol.html
|
||||
*
|
||||
* @version 2.7.0
|
||||
* @copyright 1997-2018 The PHP Group
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2018-01-03
|
||||
*/
|
||||
/**
 * Minimal blocking Redis client speaking the RESP protocol over a TCP socket.
 *
 * Every Redis command is available as a method through __call(), e.g.
 * $redis->set('name', 'abc'); $redis->get('name');
 */
class cls_redis_client
{
    /** @var resource|false connected socket stream */
    private $redis_socket = false;

    /**
     * Connect to the server.
     *
     * @param string $host
     * @param int    $port
     * @param int    $timeout connect timeout in seconds
     * @throws Exception when the connection cannot be established
     */
    public function __construct($host='127.0.0.1', $port=6379, $timeout = 3)
    {
        $this->redis_socket = stream_socket_client("tcp://".$host.":".$port, $errno, $errstr, $timeout);
        if ( !$this->redis_socket )
        {
            throw new Exception("{$errno} - {$errstr}");
        }
    }

    public function __destruct()
    {
        if (is_resource($this->redis_socket))
        {
            fclose($this->redis_socket);
        }
    }

    /**
     * Send the command $name with $args and return the decoded reply.
     *
     * @param string $name command name
     * @param array  $args command arguments; each is sent as a string
     * @return mixed true for "+OK", string for other status and bulk replies,
     *               int for integer replies, array for multi-bulk replies,
     *               null for nil replies
     * @throws Exception on I/O failure or an error reply from the server
     */
    public function __call($name, $args)
    {
        $crlf = "\r\n";
        array_unshift($args, $name);
        $command = '*' . count($args) . $crlf;
        foreach ($args as $arg)
        {
            // RESP lengths are byte counts, so measure the string form.
            $arg = (string) $arg;
            $command .= '$' . strlen($arg) . $crlf . $arg . $crlf;
        }

        // fwrite() on a socket may accept only part of the buffer.
        $total = strlen($command);
        $written = 0;
        while ($written < $total)
        {
            $n = fwrite($this->redis_socket, substr($command, $written));
            if ($n === FALSE || $n <= 0)
            {
                throw new Exception('Failed to write entire command to stream');
            }
            $written += $n;
        }
        return $this->read_response();
    }

    /**
     * Read exactly $length bytes from the socket.
     *
     * @param int $length
     * @return string
     * @throws Exception when the stream fails or ends early
     */
    private function read_bytes($length)
    {
        $buffer = '';
        $remaining = $length;
        while ($remaining > 0)
        {
            $chunk = fread($this->redis_socket, $remaining > 1024 ? 1024 : $remaining);
            // '' means EOF or timeout; without this check the loop never ends.
            if ($chunk === FALSE || $chunk === '')
            {
                throw new Exception('Failed to read response from stream');
            }
            $buffer .= $chunk;
            $remaining -= strlen($chunk);
        }
        return $buffer;
    }

    /**
     * Read and decode one RESP reply.
     *
     * @return mixed see __call()
     * @throws Exception on I/O failure, error reply or unknown reply type
     */
    private function read_response()
    {
        $line = fgets($this->redis_socket);
        if ($line === FALSE)
        {
            throw new Exception('Failed to read response from stream');
        }
        $reply = trim($line);
        $payload = substr($reply, 1);

        switch (substr($reply, 0, 1))
        {
            /* Error reply */
            case '-':
                throw new Exception(trim($payload));
            /* Status reply */
            case '+':
                return $payload === 'OK' ? TRUE : $payload;
            /* Bulk reply */
            case '$':
                $size = intval($payload);
                if ($size < 0)
                {
                    // "$-1" is the nil reply.
                    return NULL;
                }
                // A zero-length value is an empty string, not nil.
                $response = $size > 0 ? $this->read_bytes($size) : '';
                $this->read_bytes(2); /* discard crlf */
                return $response;
            /* Multi-bulk reply */
            case '*':
                $count = intval($payload);
                if ($count < 0)
                {
                    return NULL;
                }
                $response = array();
                for ($i = 0; $i < $count; $i++)
                {
                    $response[] = $this->read_response();
                }
                return $response;
            /* Integer reply */
            case ':':
                return intval($payload);
            default:
                // Plain Exception like every other failure here; RedisException
                // only exists when the phpredis extension is loaded.
                throw new Exception("Unknown response: {$reply}");
        }
    }
}
|
||||
|
||||
|
||||
//$redis = new cls_redis_client();
|
||||
//var_dump($redis->auth("foobared"));
|
||||
//var_dump($redis->set("name",'abc'));
|
||||
//var_dump($redis->get("name"));
|
||||
|
179
vendor/owner888/phpspider/library/cls_redis_server.php
vendored
Normal file
179
vendor/owner888/phpspider/library/cls_redis_server.php
vendored
Normal file
@ -0,0 +1,179 @@
|
||||
<?php
|
||||
ini_set("memory_limit", "128M");
|
||||
/**
|
||||
* redis 服务端
|
||||
* 多进程阻塞式
|
||||
* redis-benchmark -h 127.0.0.1 -p 11211 -t set -n 80000 -q
|
||||
*
|
||||
* @version 2.7.0
|
||||
* @copyright 1997-2018 The PHP Group
|
||||
* @author seatle <seatle@foxmail.com>
|
||||
* @created time :2018-01-03
|
||||
*/
|
||||
/**
 * Toy Redis-protocol server: a listening socket shared by forked, blocking
 * worker processes.
 *
 * Each decoded request is passed to the $onMessage callback as
 * ($connection, $request). Properties written from that callback inside a
 * worker (such as $redis_kv_data) live in that worker's own memory after the
 * fork and are not shared between workers.
 */
class cls_redis_server
{
    /** @var resource|false listening socket */
    private $socket = false;
    /** @var int number of worker processes kept alive by run() */
    private $process_num = 3;
    /** @var array pid => pid of the running workers */
    public $pids = array();
    /** @var array key/value store available to the message callback */
    public $redis_kv_data = array();
    /** @var callable|null invoked as ($conn, $request) for every request */
    public $onMessage = null;

    /**
     * Bind the listening socket; terminates the script when binding fails.
     *
     * @param string $host
     * @param int    $port
     */
    public function __construct($host="0.0.0.0", $port=6379)
    {
        $this->socket = stream_socket_server("tcp://".$host.":".$port,$errno, $errstr);
        if (!$this->socket) die($errstr."--".$errno);
        echo "listen $host $port \r\n";
    }

    /**
     * Read exactly $length bytes from a connection.
     *
     * @param resource $conn
     * @param int      $length
     * @return string
     * @throws Exception when the peer closes before all bytes arrived
     */
    private function read_exact($conn, $length)
    {
        $data = '';
        while ($length > 0)
        {
            $block = fread($conn, $length);
            if ($block === false || $block === '')
            {
                throw new Exception('RECEIVING');
            }
            $data .= $block;
            // Byte count, not character count: RESP lengths are in bytes.
            $length -= strlen($block);
        }
        return $data;
    }

    /**
     * Read and decode one RESP value from a connection.
     *
     * @param resource $conn
     * @return array|string|null array for "*" requests, string for bulk
     *                           strings and any other line, null on EOF or
     *                           nil bulk
     * @throws Exception when a bulk payload is cut short
     */
    private function parse_resp($conn)
    {
        // One protocol line, terminated by \r\n.
        $line = fgets($conn);
        if($line === '' || $line === false)
        {
            return null;
        }
        // First byte is the type marker, the rest is the payload.
        $type = $line[0];
        $line = rtrim(substr($line, 1), "\r\n");
        switch ( $type )
        {
            case "*":
                $count = (int) $line;
                $data = array();
                for ($i = 1; $i <= $count; $i++)
                {
                    $data[] = $this->parse_resp($conn);
                }
                return $data;
            case "$":
                $length = (int) $line;
                if ($length < 0)
                {
                    return null;
                }
                // The payload is followed by \r\n, which is read and dropped.
                $data = $this->read_exact($conn, $length + 2);
                return $length > 0 ? substr($data, 0, $length) : '';
        }
        return $line;
    }

    /**
     * Worker body: accept connections forever and dispatch their requests.
     *
     * @return void never returns
     */
    private function serve_forever()
    {
        while ( true )
        {
            echo "PID ".posix_getpid()." waiting...\n";
            // Block until a client connects.
            $conn = stream_socket_accept($this->socket, -1);
            if ( !$conn )
            {
                continue;
            }
            // Example request: "*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
            while( true )
            {
                try
                {
                    $arr = $this->parse_resp($conn);
                }
                catch (Exception $e)
                {
                    // Truncated request: drop this client, keep the worker.
                    $arr = null;
                }
                if ( is_array($arr) || $arr )
                {
                    if ($this->onMessage)
                    {
                        call_user_func($this->onMessage, $conn, $arr);
                    }
                }
                else
                {
                    fclose($conn);
                    break;
                }
            }
        }
    }

    /**
     * Fork one worker; the parent records its pid, the child serves forever.
     *
     * @return void
     */
    private function start_worker_process()
    {
        $pid = pcntl_fork();
        if ($pid === -1)
        {
            echo "fork error : ".pcntl_strerror(pcntl_get_last_error())." \r\n";
            exit(1);
        }
        if ($pid === 0)
        {
            $this->serve_forever();
        }
        $this->pids[$pid] = $pid;
    }

    /**
     * Start the workers and restart any that exit. Never returns.
     *
     * @return void
     */
    public function run()
    {
        for($i = 1; $i <= $this->process_num; $i++)
        {
            $this->start_worker_process();
        }

        while( true )
        {
            foreach ($this->pids as $pid)
            {
                if (!$pid)
                {
                    continue;
                }
                $res = pcntl_waitpid($pid, $status, WNOHANG);
                // -1: no such child any more, > 0: the child has exited.
                if ( $res == -1 || $res > 0 )
                {
                    unset($this->pids[$pid]);
                    $this->start_worker_process();
                }
            }
            sleep(1);
        }
    }

}
|
||||
|
||||
// Demo bootstrap: serve SET / GET from the worker-local key/value array and
// answer every other request with +OK.
$server = new cls_redis_server();
$server->onMessage = function($conn, $info) use($server)
{
    // Anything that is not a non-empty command array is simply acknowledged.
    if ( !is_array($info) || !isset($info[0]) )
    {
        fwrite($conn, "+OK\r\n");
        return;
    }

    $command = strtoupper($info[0]);
    if ( $command == "SET" )
    {
        if ( !isset($info[1]) || !isset($info[2]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'set' command\r\n");
            return;
        }
        $server->redis_kv_data[$info[1]] = $info[2];
        fwrite($conn, "+OK\r\n");
    }
    else if ( $command == "GET" )
    {
        if ( !isset($info[1]) )
        {
            fwrite($conn, "-ERR wrong number of arguments for 'get' command\r\n");
            return;
        }
        $key = $info[1];
        if ( isset($server->redis_kv_data[$key]) )
        {
            $val = $server->redis_kv_data[$key];
            fwrite($conn, "$".strlen($val)."\r\n".$val."\r\n");
        }
        else
        {
            // Missing key: reply with the nil bulk string, not an empty one.
            fwrite($conn, "$-1\r\n");
        }
    }
    else
    {
        fwrite($conn, "+OK\r\n");
    }
};
$server->run();
|
5727
vendor/owner888/phpspider/library/phpquery.php
vendored
Normal file
5727
vendor/owner888/phpspider/library/phpquery.php
vendored
Normal file
File diff suppressed because it is too large
Load Diff
466
vendor/owner888/phpspider/library/rolling_curl.php
vendored
Normal file
466
vendor/owner888/phpspider/library/rolling_curl.php
vendored
Normal file
@ -0,0 +1,466 @@
|
||||
<?php
|
||||
/**
|
||||
* Curl操作类
|
||||
*
|
||||
* Licensed under The MIT License
|
||||
* For full copyright and license information, please see the MIT-LICENSE.txt
|
||||
* Redistributions of files must retain the above copyright notice.
|
||||
*
|
||||
* @author seatle<seatle@foxmail.com>
|
||||
* @copyright seatle<seatle@foxmail.com>
|
||||
* @link http://www.epooll.com/
|
||||
* @license http://www.opensource.org/licenses/mit-license.php MIT License
|
||||
*/
|
||||
|
||||
class rolling_curl
|
||||
{
|
||||
/**
|
||||
* @var float
|
||||
*
|
||||
* 同时运行任务数
|
||||
* 例如:有8个请求,则会被分成两批,第一批5个请求,第二批3个请求
|
||||
* 注意:采集知乎的时候,5个是比较稳定的,7个以上就开始会超时了,多进程就没有这样的问题,因为多进程很少几率会发生并发
|
||||
*/
|
||||
public $window_size = 5;
|
||||
|
||||
/**
|
||||
* @var float
|
||||
*
|
||||
* Timeout is the timeout used for curl_multi_select.
|
||||
*/
|
||||
private $timeout = 10;
|
||||
|
||||
/**
|
||||
* @var string|array
|
||||
*
|
||||
* 应用在每个请求的回调函数
|
||||
*/
|
||||
public $callback;
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*
|
||||
* 设置默认的请求参数
|
||||
*/
|
||||
protected $options = array(
|
||||
CURLOPT_SSL_VERIFYPEER => 0,
|
||||
CURLOPT_RETURNTRANSFER => 1,
|
||||
// 注意:TIMEOUT = CONNECTTIMEOUT + 数据获取时间,所以 TIMEOUT 一定要大于 CONNECTTIMEOUT,否则 CONNECTTIMEOUT 设置了就没意义
|
||||
// "Connection timed out after 30001 milliseconds"
|
||||
CURLOPT_CONNECTTIMEOUT => 30,
|
||||
CURLOPT_TIMEOUT => 60,
|
||||
CURLOPT_RETURNTRANSFER => 1,
|
||||
CURLOPT_HEADER => 0,
|
||||
// 在多线程处理场景下使用超时选项时,会忽略signals对应的处理函数,但是无耐的是还有小概率的crash情况发生
|
||||
CURLOPT_NOSIGNAL => 1,
|
||||
CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
|
||||
);
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
private $headers = array();
|
||||
|
||||
/**
|
||||
* @var Request[]
|
||||
*
|
||||
* 请求队列
|
||||
*/
|
||||
private $requests = array();
|
||||
|
||||
/**
|
||||
* @var RequestMap[]
|
||||
*
|
||||
* Maps handles to request indexes
|
||||
*/
|
||||
private $requestMap = array();
|
||||
|
||||
public function __construct()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* set timeout
|
||||
*
|
||||
* @param init $timeout
|
||||
* @return
|
||||
*/
|
||||
public function set_timeout($timeout)
|
||||
{
|
||||
$this->options[CURLOPT_TIMEOUT] = $timeout;
|
||||
}
|
||||
|
||||
/**
|
||||
* set proxy
|
||||
*
|
||||
*/
|
||||
public function set_proxy($proxy)
|
||||
{
|
||||
$this->options[CURLOPT_PROXY] = $proxy;
|
||||
}
|
||||
|
||||
/**
|
||||
* set referer
|
||||
*
|
||||
*/
|
||||
public function set_referer($referer)
|
||||
{
|
||||
$this->options[CURLOPT_REFERER] = $referer;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置 user_agent
|
||||
*
|
||||
* @param string $useragent
|
||||
* @return void
|
||||
*/
|
||||
public function set_useragent($useragent)
|
||||
{
|
||||
$this->options[CURLOPT_USERAGENT] = $useragent;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置COOKIE
|
||||
*
|
||||
* @param string $cookie
|
||||
* @return void
|
||||
*/
|
||||
public function set_cookie($cookie)
|
||||
{
|
||||
$this->options[CURLOPT_COOKIE] = $cookie;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置COOKIE JAR
|
||||
*
|
||||
* @param string $cookie_jar
|
||||
* @return void
|
||||
*/
|
||||
public function set_cookiejar($cookiejar)
|
||||
{
|
||||
$this->options[CURLOPT_COOKIEJAR] = $cookiejar;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置COOKIE FILE
|
||||
*
|
||||
* @param string $cookie_file
|
||||
* @return void
|
||||
*/
|
||||
public function set_cookiefile($cookiefile)
|
||||
{
|
||||
$this->options[CURLOPT_COOKIEFILE] = $cookiefile;
|
||||
}
|
||||
|
||||
/**
 * Choose whether the response headers are returned together with the body.
 *
 * @param mixed $http_raw Value stored as CURLOPT_HEADER in the instance-wide
 *                        options; defaults to false.
 * @return void
 */
public function set_http_raw($http_raw = false)
{
    $this->options[CURLOPT_HEADER] = $http_raw;
}
|
||||
|
||||
/**
 * Set the client IP advertised through the CLIENT-IP and X-FORWARDED-FOR
 * request headers.
 *
 * A later call replaces the value from an earlier call. The previous
 * implementation used `$this->headers + $headers`, which keeps the existing
 * entry on a key clash and therefore ignored every call after the first.
 *
 * @param string $ip IP address to send in both headers.
 * @return void
 */
public function set_ip($ip)
{
    $ip_headers = array(
        'CLIENT-IP'       => $ip,
        'X-FORWARDED-FOR' => $ip,
    );

    // array_replace() overwrites matching keys and appends new ones,
    // so the most recent IP always wins.
    $this->headers = array_replace($this->headers, $ip_headers);
}
|
||||
|
||||
/**
 * Add or replace default request headers.
 *
 * Headers are keyed by name (get_options() later joins each pair as
 * "name:value"). A header that is already set is overwritten by the new
 * value; the previous `$this->headers + $headers` silently kept the old one.
 *
 * @param array $headers Map of header name => header value.
 * @return void
 */
public function set_headers($headers)
{
    // Newest value wins on a key clash; untouched headers are preserved.
    $this->headers = array_replace($this->headers, $headers);
}
|
||||
|
||||
/**
 * Set the default Host request header.
 *
 * A later call replaces the value from an earlier call. The previous
 * `$this->headers + $headers` kept the first Host that was ever set.
 *
 * @param string $hosts Value to send as the Host header.
 * @return void
 */
public function set_hosts($hosts)
{
    // Overwrite any existing Host entry while leaving other headers untouched.
    $this->headers = array_replace($this->headers, array('Host' => $hosts));
}
|
||||
|
||||
/**
 * Request gzip-encoded responses.
 *
 * When $gzip is truthy, CURLOPT_ENCODING is set to 'gzip' in the
 * instance-wide options. A falsy value leaves the options unchanged.
 *
 * @param mixed $gzip Truthy to enable gzip encoding.
 * @return void
 */
public function set_gzip($gzip)
{
    if (!$gzip)
    {
        return;
    }

    $this->options[CURLOPT_ENCODING] = 'gzip';
}
|
||||
|
||||
/**
 * Append a request to the queue; nothing is sent until execute() is called.
 *
 * @param string $url     Target URL.
 * @param string $method  HTTP method name; get_options() treats 'get' and
 *                        'post' specially (case-insensitively).
 * @param mixed  $fields  Query fields for GET, or the body for POST.
 * @param array  $headers Per-request headers, keyed by header name.
 * @param array  $options Per-request cURL options, keyed by CURLOPT_* constant.
 * @return bool Always true.
 */
public function request($url, $method = "GET", $fields = array(), $headers = array(), $options = array())
{
    $entry = array(
        'url'     => $url,
        'method'  => $method,
        'fields'  => $fields,
        'headers' => $headers,
        'options' => $options,
    );
    $this->requests[] = $entry;

    return true;
}
|
||||
|
||||
/**
 * Build the complete cURL option array for one queued request.
 *
 * Starts from the instance-wide options and headers and layers the
 * request's own options and headers on top (the request's values win on a
 * key clash). CURLOPT_URL and CURLOPT_HTTPHEADER are written last, so they
 * always reflect the URL and headers computed here.
 *
 * Fix: for GET requests the fields are now really appended to the URL that
 * is handed to cURL. Previously the query string was built into a local
 * variable that was never used, so GET fields were silently dropped.
 *
 * @param array $request Queue entry created by request(): keys 'url',
 *                       'method', 'fields', 'headers' and 'options'.
 * @return array Options suitable for curl_setopt_array().
 */
public function get_options($request)
{
    $options = $this->options;
    $headers = $this->headers;
    $method  = strtolower($request['method']);
    $url     = $request['url'];

    // Redirect following is only switched on when safe_mode is off
    // (ini_get() returns false/'' when the directive is disabled or absent).
    if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode'))
    {
        $options[CURLOPT_FOLLOWLOCATION] = 1;
        $options[CURLOPT_MAXREDIRS] = 5;
    }

    // GET: the fields travel in the query string.
    if ($method == 'get' && !empty($request['fields']))
    {
        // Accept either a field array or an already-encoded query string.
        $query = is_array($request['fields'])
            ? http_build_query($request['fields'])
            : (string) $request['fields'];

        // Do not add a second '?' when the URL already carries a query.
        $separator = (strpos($url, '?') === false) ? '?' : '&';
        $url .= $separator . $query;
    }

    // POST: the fields are handed to cURL unchanged as the request body.
    if ($method == 'post')
    {
        $options[CURLOPT_POST] = 1;
        $options[CURLOPT_POSTFIELDS] = $request['fields'];
    }

    // Per-request options and headers take precedence over the defaults.
    if ($request['options'])
    {
        $options = $request['options'] + $options;
    }

    if ($request['headers'])
    {
        $headers = $request['headers'] + $headers;
    }

    // cURL expects a flat list of "name:value" strings.
    $header_lines = array();
    foreach ($headers as $name => $value)
    {
        $header_lines[] = $name . ":" . $value;
    }

    $options[CURLOPT_URL] = $url;
    $options[CURLOPT_HTTPHEADER] = $header_lines;

    return $options;
}
|
||||
|
||||
/**
 * Queue a GET request (shortcut for request() with method 'get').
 *
 * @param string $url     Target URL.
 * @param array  $fields  Query fields.
 * @param array  $headers Per-request headers, keyed by header name.
 * @param array  $options Per-request cURL options.
 * @return bool Result of request().
 */
public function get($url, $fields = array(), $headers = array(), $options = array())
{
    $queued = $this->request($url, 'get', $fields, $headers, $options);

    return $queued;
}
|
||||
|
||||
/**
 * Queue a POST request (shortcut for request() with method 'post').
 *
 * $fields is passed to cURL unchanged as CURLOPT_POSTFIELDS and may take
 * three forms:
 *   1. an array, e.g. array('name'=>'yangzetao');
 *   2. an HTTP query string, e.g. http_build_query(array('name'=>'yangzetao'));
 *   3. a JSON string, e.g. json_encode(array('name'=>'yangzetao')).
 * The first two are ordinary form posts that the receiver can read from
 * $_POST. The third is a raw body (JSON-RPC style) that the receiver has to
 * read from the request input stream.
 *
 * @param string $url     Target URL.
 * @param mixed  $fields  Request body, see above.
 * @param array  $headers Per-request headers, keyed by header name.
 * @param array  $options Per-request cURL options.
 * @return bool Result of request().
 */
public function post($url, $fields = array(), $headers = array(), $options = array())
{
    $queued = $this->request($url, 'post', $fields, $headers, $options);

    return $queued;
}
|
||||
|
||||
/**
 * Run every queued request.
 *
 * An empty queue returns false. A single queued request is handled by
 * single_curl(); two or more are handled by rolling_curl().
 *
 * @param int $window_size Maximum number of simultaneous connections,
 *                         forwarded to rolling_curl().
 * @return string|bool false when the queue is empty, otherwise whatever
 *                     single_curl() or rolling_curl() returns.
 */
public function execute($window_size = null)
{
    $total = sizeof($this->requests);

    if ($total == 0)
    {
        return false;
    }

    if ($total == 1)
    {
        // Exactly one request: no need for the multi interface.
        return $this->single_curl();
    }

    // Several requests: process them concurrently.
    return $this->rolling_curl($window_size);
}
|
||||
|
||||
/**
 * Execute the first queued request on its own cURL handle.
 *
 * The request is removed from the queue. If a callback is configured and
 * callable, it is invoked with ($output, $info, $request, $error) in the
 * same way as for concurrent requests, and true is returned. Without a
 * callback the raw curl_exec() result is returned instead.
 *
 * @return mixed curl_exec() result when no callback is set, otherwise true.
 */
private function single_curl()
{
    $handle = curl_init();

    // Take the next request off the front of the queue.
    $request = array_shift($this->requests);
    curl_setopt_array($handle, $this->get_options($request));

    $output = curl_exec($handle);
    $info   = curl_getinfo($handle);

    // Only a failed transfer carries an error message.
    $error = ($output === false) ? curl_error($handle) : null;

    if (!$this->callback)
    {
        return $output;
    }

    if (is_callable($this->callback))
    {
        call_user_func($this->callback, $output, $info, $request, $error);
    }

    return true;
}
|
||||
|
||||
/**
 * Execute all queued requests concurrently with a sliding window.
 *
 * At most "window size" transfers run at once. Every time one finishes, the
 * configured callback (if callable) receives ($output, $info, $request,
 * $error) and the next queued request is started in its place.
 *
 * Changes compared with the previous implementation:
 *  - Handles are identified by spl_object_id() when they are objects
 *    (PHP 8 CurlHandle cannot be cast to string) and by their resource id
 *    otherwise.
 *  - The loop keeps going while started requests are still pending, so
 *    replacements added after the last running transfer finished are no
 *    longer abandoned.
 *  - The request map entry is always released, whether or not a callback is set.
 *  - The queue is reset to an empty array instead of being unset, so the
 *    instance can be reused for another batch.
 *  - The configured window size is no longer permanently shrunk to the size
 *    of the current batch.
 *
 * @param int|null $window_size Maximum number of simultaneous connections;
 *                              when given it replaces the configured value.
 * @return bool Always true.
 */
private function rolling_curl($window_size = null)
{
    // An explicitly requested window size replaces the configured one.
    if ($window_size)
    {
        $this->window_size = $window_size;
    }

    // Never open more connections than there are requests (local clamp only).
    $window = min($this->window_size, sizeof($this->requests));

    // Fewer than two parallel transfers is single_curl()'s job.
    if ($window < 2)
    {
        exit("Window size must be greater than 1");
    }

    $master = curl_multi_init();
    $this->requestMap = array();

    // Start the first batch of transfers.
    for ($next = 0; $next < $window; $next++)
    {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$next]));
        curl_multi_add_handle($master, $ch);

        $key = is_object($ch) ? spl_object_id($ch) : (int) $ch;
        $this->requestMap[$key] = $next;
    }

    do
    {
        while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);

        // Give up on a multi-stack error.
        if ($execrun != CURLM_OK)
        {
            break;
        }

        // Collect every transfer that has completed so far.
        while ($done = curl_multi_info_read($master))
        {
            $handle = $done['handle'];

            $info   = curl_getinfo($handle);
            $output = curl_multi_getcontent($handle);
            $error  = curl_error($handle);

            // Resolve the originating request and release its map slot.
            $key     = is_object($handle) ? spl_object_id($handle) : (int) $handle;
            $request = $this->requests[$this->requestMap[$key]];
            unset($this->requestMap[$key]);

            $callback = $this->callback;
            if (is_callable($callback))
            {
                call_user_func($callback, $output, $info, $request, $error);
            }

            // Refill the window with the next queued request, if any.
            if (isset($this->requests[$next]))
            {
                $ch = curl_init();
                curl_setopt_array($ch, $this->get_options($this->requests[$next]));
                curl_multi_add_handle($master, $ch);

                $new_key = is_object($ch) ? spl_object_id($ch) : (int) $ch;
                $this->requestMap[$new_key] = $next;
                $next++;
            }

            // Detach the finished handle from the multi stack.
            curl_multi_remove_handle($master, $handle);
        }

        // Block until there is activity instead of spinning at 100% CPU.
        if ($running)
        {
            if (curl_multi_select($master, $this->timeout) === -1)
            {
                // select() failed; back off briefly to avoid a busy loop.
                usleep(100);
            }
        }

        // Handles added above are not yet counted in $running, so also keep
        // looping while any started request is still awaiting completion.
    } while ($running || count($this->requestMap) > 0);

    curl_multi_close($master);

    // Empty the queue so a reused instance does not re-run these requests.
    $this->requests = array();
    $this->requestMap = array();

    return true;
}
|
||||
|
||||
/**
 * Release the configuration and the request queue when the object is destroyed.
 *
 * @return void
 */
public function __destruct()
{
    unset($this->window_size);
    unset($this->callback);
    unset($this->options);
    unset($this->headers);
    unset($this->requests);
}
|
||||
}
|
Reference in New Issue
Block a user