You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
425 lines
12 KiB
425 lines
12 KiB
<?php
|
|
/**
|
|
* cURL helper: a static HTTP request class plus curl_multi batch-fetch functions
|
|
*
|
|
* Licensed under The MIT License
|
|
* For full copyright and license information, please see the MIT-LICENSE.txt
|
|
* Redistributions of files must retain the above copyright notice.
|
|
*
|
|
* @author seatle<seatle@foxmail.com>
|
|
* @copyright seatle<seatle@foxmail.com>
|
|
* @link http://www.epooll.com/
|
|
* @license http://www.opensource.org/licenses/mit-license.php MIT License
|
|
*/
|
|
|
|
/**
 * Static convenience wrapper around a single cURL handle.
 *
 * Request options (timeout, proxy, cookies, extra headers, ...) are stored in
 * static properties via the set_*() methods and applied by http_request().
 * The handle is created lazily by init() and released after every request,
 * so each request starts from a clean handle.
 */
class cls_curl
{
    /** @var int Connect timeout in seconds; the total transfer timeout is this value + 5. */
    protected static $timeout = 10;

    /** @var resource|object|null Current cURL handle, or null when none is open. */
    protected static $ch = null;

    /** @var string User-Agent header value. */
    protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';

    /** @var bool When true, response headers are included in the returned data. */
    protected static $http_raw = false;

    /** @var string|null Value for CURLOPT_COOKIE. */
    protected static $cookie = null;

    /** @var string|null Value for CURLOPT_COOKIEJAR. */
    protected static $cookie_jar = null;

    /** @var string|null Value for CURLOPT_COOKIEFILE. */
    protected static $cookie_file = null;

    /** @var string|null Value for CURLOPT_REFERER. */
    protected static $referer = null;

    /** @var string|null IP sent in the CLIENT-IP and X-FORWARDED-FOR request headers. */
    protected static $ip = null;

    /** @var string|null Value for CURLOPT_PROXY. */
    protected static $proxy = null;

    /** @var array Extra request header lines ("Name: value"). */
    protected static $headers = array();

    /** @var array Addresses that may replace the URL host; one is picked at random per request. */
    protected static $hosts = array();

    /** @var bool When true, CURLOPT_ENCODING is set to 'gzip'. */
    protected static $gzip = false;

    /** @var array Result of curl_getinfo() for the most recent request. */
    protected static $info = array();

    /**
     * Set the connect timeout. Only affects handles created afterwards by init().
     *
     * @param int $timeout Seconds.
     * @return void
     */
    public static function set_timeout($timeout)
    {
        self::$timeout = $timeout;
    }

    /**
     * Set the proxy.
     *
     * @param mixed $proxy Value handed to CURLOPT_PROXY.
     * @return void
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-09-18 10:17
     */
    public static function set_proxy($proxy)
    {
        self::$proxy = $proxy;
    }

    /**
     * Set the Referer header.
     *
     * @param string $referer
     * @return void
     */
    public static function set_referer($referer)
    {
        self::$referer = $referer;
    }

    /**
     * Set the User-Agent header.
     *
     * @param string $useragent
     * @return void
     */
    public static function set_useragent($useragent)
    {
        self::$useragent = $useragent;
    }

    /**
     * Set the cookie string sent with requests.
     *
     * @param string $cookie
     * @return void
     */
    public static function set_cookie($cookie)
    {
        self::$cookie = $cookie;
    }

    /**
     * Set the cookie jar path (CURLOPT_COOKIEJAR).
     *
     * @param string $cookie_jar
     * @return void
     */
    public static function set_cookie_jar($cookie_jar)
    {
        self::$cookie_jar = $cookie_jar;
    }

    /**
     * Set the cookie file path (CURLOPT_COOKIEFILE).
     *
     * @param string $cookie_file
     * @return void
     */
    public static function set_cookie_file($cookie_file)
    {
        self::$cookie_file = $cookie_file;
    }

    /**
     * Choose whether the response headers are returned together with the body.
     *
     * @param mixed $http_raw Truthy to include headers.
     * @return void
     * @author seatle <seatle@foxmail.com>
     * @created time :2016-09-18 10:17
     */
    public static function set_http_raw($http_raw)
    {
        self::$http_raw = $http_raw;
    }

    /**
     * Set the IP sent in the CLIENT-IP / X-FORWARDED-FOR headers.
     *
     * @param string $ip
     * @return void
     */
    public static function set_ip($ip)
    {
        self::$ip = $ip;
    }

    /**
     * Set extra request headers.
     *
     * @param array|string $headers Header lines; a single string is treated as one line.
     * @return void
     */
    public static function set_headers($headers)
    {
        self::$headers = $headers;
    }

    /**
     * Set the addresses that may stand in for the URL host.
     *
     * @param array|string $hosts Addresses; a single string is treated as a one-element list.
     * @return void
     */
    public static function set_hosts($hosts)
    {
        self::$hosts = $hosts;
    }

    /**
     * Enable or disable gzip content encoding.
     *
     * @param bool $gzip
     * @return void
     */
    public static function set_gzip($gzip)
    {
        self::$gzip = $gzip;
    }

    /**
     * Create the cURL handle if none is open and apply the base options.
     *
     * @return resource|object The cURL handle (a resource before PHP 8, an object since).
     */
    public static function init()
    {
        // cURL handles are resources before PHP 8 and objects from PHP 8 on.
        if (!is_resource(self::$ch) && !is_object(self::$ch))
        {
            self::$ch = curl_init();
            curl_setopt(self::$ch, CURLOPT_RETURNTRANSFER, true);
            curl_setopt(self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout);
            curl_setopt(self::$ch, CURLOPT_HEADER, false);
            curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
            curl_setopt(self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
            // Needed when timeouts are used in multi-threaded setups: signal
            // handlers are skipped (the original author notes rare crashes
            // can still occur).
            curl_setopt(self::$ch, CURLOPT_NOSIGNAL, true);
        }
        return self::$ch;
    }

    /**
     * Perform a GET request.
     *
     * @param string       $url
     * @param array|string $fields Query parameters appended to the URL.
     * @return string|bool Response data, or false on failure.
     */
    public static function get($url, $fields = array())
    {
        self::init();
        return self::http_request($url, 'get', $fields);
    }

    /**
     * Perform a POST request.
     *
     * $fields may be:
     *   1. an array, e.g. array('name'=>'yangzetao')
     *   2. an HTTP query string, e.g. http_build_query(array('name'=>'yangzetao'))
     *   3. a JSON string, e.g. json_encode(array('name'=>'yangzetao'))
     * The first two are ordinary form posts readable through $_POST on the
     * receiving side. The third is a raw body (JSON-RPC style) that the
     * receiver has to read from the input stream.
     *
     * @param string       $url
     * @param array|string $fields
     * @static
     * @access public
     * @return string|bool Response data, or false on failure.
     */
    public static function post($url, $fields = array())
    {
        self::init();
        return self::http_request($url, 'post', $fields);
    }

    /**
     * Apply the configured options to the handle, execute the request and
     * release the handle.
     *
     * @param string       $url
     * @param string       $type   'get' or 'post' (case-insensitive).
     * @param array|string $fields Query parameters (GET) or request body (POST).
     * @return string|bool Response data, or false when curl_exec() fails.
     */
    public static function http_request($url, $type = 'get', $fields = array())
    {
        // Callers that skip get()/post() still need an open handle.
        self::init();

        $method = strtolower($type);

        // For GET the fields become part of the URL.
        if ($method == 'get' && !empty($fields))
        {
            $query = is_string($fields) ? $fields : http_build_query($fields);
            $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
        }

        // Work on a copy so per-request headers never leak into later requests.
        $headers = (array) self::$headers;

        // Bind the request to a random configured address (simple load balancing)
        // and keep the original host name in the Host header.
        $hosts = (array) self::$hosts;
        if ($hosts)
        {
            $host = parse_url($url, PHP_URL_HOST);
            if (is_string($host) && $host !== '')
            {
                $ip = $hosts[array_rand($hosts)];
                $url = self::replace_host($url, $host, $ip);
                $headers = array_merge(array('Host:' . $host), $headers);
            }
        }

        curl_setopt(self::$ch, CURLOPT_URL, $url);

        if ($method == 'post')
        {
            curl_setopt(self::$ch, CURLOPT_POST, true);
            curl_setopt(self::$ch, CURLOPT_POSTFIELDS, $fields);
        }
        if (self::$useragent)
        {
            curl_setopt(self::$ch, CURLOPT_USERAGENT, self::$useragent);
        }
        if (self::$cookie)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIE, self::$cookie);
        }
        if (self::$cookie_jar)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar);
        }
        if (self::$cookie_file)
        {
            curl_setopt(self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file);
        }
        if (self::$referer)
        {
            curl_setopt(self::$ch, CURLOPT_REFERER, self::$referer);
        }
        if (self::$ip)
        {
            $headers = array_merge(
                array('CLIENT-IP:' . self::$ip, 'X-FORWARDED-FOR:' . self::$ip),
                $headers
            );
        }
        if ($headers)
        {
            curl_setopt(self::$ch, CURLOPT_HTTPHEADER, $headers);
        }
        if (self::$gzip)
        {
            curl_setopt(self::$ch, CURLOPT_ENCODING, 'gzip');
        }
        if (self::$proxy)
        {
            curl_setopt(self::$ch, CURLOPT_PROXY, self::$proxy);
        }
        if (self::$http_raw)
        {
            curl_setopt(self::$ch, CURLOPT_HEADER, true);
        }

        // On failure $data is false; it is returned as-is so the caller can check it.
        $data = curl_exec(self::$ch);
        self::$info = curl_getinfo(self::$ch);

        // Release the handle. Only resource handles (PHP < 8) need an explicit
        // close; object handles are freed once the reference is dropped.
        if (is_resource(self::$ch))
        {
            curl_close(self::$ch);
        }
        self::$ch = null;

        return $data;
    }

    /**
     * Replace the host part of a URL, leaving path and query untouched.
     *
     * @param string $url
     * @param string $host Host name as it appears in $url.
     * @param string $ip   Replacement address.
     * @return string
     */
    protected static function replace_host($url, $host, $ip)
    {
        // Start searching after the "//" that introduces the authority part.
        $slashes = strpos($url, '//');
        $offset = ($slashes === false) ? 0 : $slashes + 2;
        $pos = strpos($url, $host, $offset);
        if ($pos === false)
        {
            return $url;
        }
        return substr_replace($url, $ip, $pos, strlen($host));
    }

    /**
     * Transfer information of the most recent request.
     *
     * @return array Empty until a request has been made.
     */
    public static function get_info()
    {
        return self::$info;
    }

    /**
     * HTTP status code of the most recent request.
     *
     * @return int|null Null until a request has been made.
     */
    public static function get_http_code()
    {
        return isset(self::$info['http_code']) ? self::$info['http_code'] : null;
    }
}
|
|
|
|
/**
 * Fetch all URLs concurrently with curl_multi, wait until every transfer has
 * finished, then pass each response body to callback().
 *
 * @param array $urls    URLs to fetch; a URL listed more than once is requested once.
 * @param mixed $delay   Passed through to callback() unchanged.
 * @param int   $timeout Per-transfer timeout in seconds (defaults to the previous fixed value of 1).
 * @return array Map of URL => return value of callback().
 */
function classic_curl($urls, $delay, $timeout = 1)
{
    $queue = curl_multi_init();
    $map = array();

    foreach ($urls as $url)
    {
        // Results are keyed by URL, so a second handle for the same URL would
        // only overwrite the first one in $map and never be cleaned up.
        if (isset($map[$url]))
        {
            continue;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);

        // Queue the handle on the multi handle.
        curl_multi_add_handle($queue, $ch);
        $map[$url] = $ch;
    }

    // Drive the transfers until none is active or curl reports an error.
    $active = null;
    while (true)
    {
        $mrc = curl_multi_exec($queue, $active);
        if ($mrc == CURLM_CALL_MULTI_PERFORM)
        {
            continue;
        }
        if ($mrc != CURLM_OK || $active <= 0)
        {
            break;
        }
        // curl_multi_select() can return -1 immediately; without a short
        // sleep the loop would spin and pin the CPU.
        if (curl_multi_select($queue, 0.5) == -1)
        {
            usleep(10000);
        }
    }

    $responses = array();
    foreach ($map as $url => $ch)
    {
        $responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
        curl_multi_remove_handle($queue, $ch);
        curl_close($ch);
    }

    curl_multi_close($queue);
    return $responses;
}
|
|
|
|
/**
 * Fetch all URLs concurrently with curl_multi and pass each response to
 * callback() as soon as its transfer completes.
 *
 * @param array       $urls   URLs to fetch.
 * @param mixed       $delay  Passed through to callback() unchanged.
 * @param string|null $cookie Cookie header value; null keeps the built-in legacy cookie.
 * @return array Map of URL => array('info' => curl_getinfo() result,
 *               'error' => curl_error() text, 'results' => callback() return value).
 */
function rolling_curl($urls, $delay, $cookie = null)
{
    // NOTE(review): this default is a hard-coded browser session cookie kept
    // only for backward compatibility; callers should pass their own $cookie.
    if ($cookie === null)
    {
        $cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
    }
    $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';

    // Map key for a handle. Handles are objects since PHP 8 and cannot be
    // cast to string, so use the object id there and the resource id before.
    $handle_key = function ($handle) {
        return is_object($handle)
            ? 'obj#' . spl_object_id($handle)
            : 'res#' . (int) $handle;
    };

    $queue = curl_multi_init();
    $map = array();

    foreach ($urls as $url)
    {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_NOSIGNAL, true);
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip');

        curl_multi_add_handle($queue, $ch);
        $map[$handle_key($ch)] = $url;
    }

    $responses = array();
    $active = null;
    do
    {
        while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM);

        if ($code != CURLM_OK)
        {
            break;
        }

        // Handle every transfer that has just completed.
        while ($done = curl_multi_info_read($queue))
        {
            $handle = $done['handle'];
            $key = $handle_key($handle);
            $url = $map[$key];

            $responses[$url] = array(
                'info'    => curl_getinfo($handle),
                'error'   => curl_error($handle),
                'results' => callback(curl_multi_getcontent($handle), $delay, $url),
            );

            // Release the finished handle.
            curl_multi_remove_handle($queue, $handle);
            curl_close($handle);
            unset($map[$key]);
        }

        // Wait for activity on the remaining transfers. curl_multi_select()
        // can return -1 immediately, so sleep briefly to avoid a busy loop.
        if ($active > 0 && curl_multi_select($queue, 0.5) == -1)
        {
            usleep(10000);
        }
    } while ($active);

    curl_multi_close($queue);
    return $responses;
}
|
|
|
|
/**
 * Store a fetched page on disk as ./html2/<md5 of URL>.html.
 *
 * Empty responses are skipped. The target directory is created when missing.
 *
 * @param string|bool $data  Response body; nothing is written when it is empty or false.
 * @param mixed       $delay Unused; kept so existing callers keep working.
 * @param string      $url   URL the data came from; its md5 hash is the file name.
 * @return void
 */
function callback($data, $delay, $url)
{
    if (empty($data))
    {
        return;
    }

    $dir = "./html2/";
    // Without the directory file_put_contents() fails and the page is lost.
    // The second is_dir() covers another process creating it in between.
    if (!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir))
    {
        return;
    }

    file_put_contents($dir . md5($url) . ".html", $data);
}
|
|
|
|
|