2020-01-14 19:17:29 +08:00

426 lines
12 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
/**
* Worker多进程操作类
*
* Licensed under The MIT License
* For full copyright and license information, please see the MIT-LICENSE.txt
* Redistributions of files must retain the above copyright notice.
*
* @author seatle<seatle@foxmail.com>
* @copyright seatle<seatle@foxmail.com>
* @link http://www.epooll.com/
* @license http://www.opensource.org/licenses/mit-license.php MIT License
*/
class cls_curl
{
protected static $timeout = 10;
protected static $ch = null;
protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
protected static $http_raw = false;
protected static $cookie = null;
protected static $cookie_jar = null;
protected static $cookie_file = null;
protected static $referer = null;
protected static $ip = null;
protected static $proxy = null;
protected static $headers = array();
protected static $hosts = array();
protected static $gzip = false;
protected static $info = array();
/**
* set timeout
*
* @param init $timeout
* @return
*/
public static function set_timeout($timeout)
{
self::$timeout = $timeout;
}
/**
* 设置代理
*
* @param mixed $proxy
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-18 10:17
*/
public static function set_proxy($proxy)
{
self::$proxy = $proxy;
}
/**
* set referer
*
*/
public static function set_referer($referer)
{
self::$referer = $referer;
}
/**
* 设置 user_agent
*
* @param string $useragent
* @return void
*/
public static function set_useragent($useragent)
{
self::$useragent = $useragent;
}
/**
* 设置COOKIE
*
* @param string $cookie
* @return void
*/
public static function set_cookie($cookie)
{
self::$cookie = $cookie;
}
/**
* 设置COOKIE JAR
*
* @param string $cookie_jar
* @return void
*/
public static function set_cookie_jar($cookie_jar)
{
self::$cookie_jar = $cookie_jar;
}
/**
* 设置COOKIE FILE
*
* @param string $cookie_file
* @return void
*/
public static function set_cookie_file($cookie_file)
{
self::$cookie_file = $cookie_file;
}
/**
* 获取内容的时候是不是连header也一起获取
*
* @param mixed $http_raw
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-18 10:17
*/
public static function set_http_raw($http_raw)
{
self::$http_raw = $http_raw;
}
/**
* 设置IP
*
* @param string $ip
* @return void
*/
public static function set_ip($ip)
{
self::$ip = $ip;
}
/**
* 设置Headers
*
* @param string $headers
* @return void
*/
public static function set_headers($headers)
{
self::$headers = $headers;
}
/**
* 设置Hosts
*
* @param string $hosts
* @return void
*/
public static function set_hosts($hosts)
{
self::$hosts = $hosts;
}
/**
* 设置Gzip
*
* @param string $hosts
* @return void
*/
public static function set_gzip($gzip)
{
self::$gzip = $gzip;
}
/**
* 初始化 CURL
*
*/
public static function init()
{
//if (empty ( self::$ch ))
if (!is_resource ( self::$ch ))
{
self::$ch = curl_init ();
curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout );
curl_setopt( self::$ch, CURLOPT_HEADER, false );
curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
// 在多线程处理场景下使用超时选项时会忽略signals对应的处理函数但是无耐的是还有小概率的crash情况发生
curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true);
}
return self::$ch;
}
/**
* get
*
*
*/
public static function get($url, $fields = array())
{
self::init ();
return self::http_request($url, 'get', $fields);
}
/**
* $fields 有三种类型:1、数组2、http query3、json
* 1、array('name'=>'yangzetao') 2、http_build_query(array('name'=>'yangzetao')) 3、json_encode(array('name'=>'yangzetao'))
* 前两种是普通的post可以用$_POST方式获取
* 第三种是post stream( json rpc其实就是webservice )虽然是post方式但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取
*
* @param mixed $url
* @param array $fields
* @param mixed $proxy
* @static
* @access public
* @return void
*/
public static function post($url, $fields = array())
{
self::init ();
return self::http_request($url, 'post', $fields);
}
public static function http_request($url, $type = 'get', $fields)
{
// 如果是 get 方式,直接拼凑一个 url 出来
if (strtolower($type) == 'get' && !empty($fields))
{
$url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields);
}
// 随机绑定 hosts做负载均衡
if (self::$hosts)
{
$parse_url = parse_url($url);
$host = $parse_url['host'];
$key = rand(0, count(self::$hosts)-1);
$ip = self::$hosts[$key];
$url = str_replace($host, $ip, $url);
self::$headers = array_merge( array('Host:'.$host), self::$headers );
}
curl_setopt( self::$ch, CURLOPT_URL, $url );
// 如果是 post 方式
if (strtolower($type) == 'post')
{
curl_setopt( self::$ch, CURLOPT_POST, true );
curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
}
if (self::$useragent)
{
curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
}
if (self::$cookie)
{
curl_setopt( self::$ch, CURLOPT_COOKIE, self::$cookie );
}
if (self::$cookie_jar)
{
curl_setopt( self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar );
}
if (self::$cookie_file)
{
curl_setopt( self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file );
}
if (self::$referer)
{
curl_setopt( self::$ch, CURLOPT_REFERER, self::$referer );
}
if (self::$ip)
{
self::$headers = array_merge( array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), self::$headers );
}
if (self::$headers)
{
curl_setopt( self::$ch, CURLOPT_HTTPHEADER, self::$headers );
}
if (self::$gzip)
{
curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
}
if (self::$proxy)
{
curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxy );
}
if (self::$http_raw)
{
curl_setopt( self::$ch, CURLOPT_HEADER, true );
}
$data = curl_exec ( self::$ch );
self::$info = curl_getinfo(self::$ch);
if ($data === false)
{
//echo date("Y-m-d H:i:s"), ' Curl error: ' . curl_error( self::$ch ), "\n";
}
// 关闭句柄
curl_close( self::$ch );
//$data = substr($data, 10);
//$data = gzinflate($data);
return $data;
}
public static function get_info()
{
return self::$info;
}
public static function get_http_code()
{
return self::$info['http_code'];
}
}
function classic_curl($urls, $delay)
{
$queue = curl_multi_init();
$map = array();
foreach ($urls as $url)
{
// create cURL resources
$ch = curl_init();
// 设置 URL 和 其他参数
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_NOSIGNAL, true);
// 把当前 curl resources 加入到 curl_multi_init 队列
curl_multi_add_handle($queue, $ch);
$map[$url] = $ch;
}
$active = null;
// execute the handles
do {
$mrc = curl_multi_exec($queue, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
while ($active > 0 && $mrc == CURLM_OK) {
while (curl_multi_exec($queue, $active) === CURLM_CALL_MULTI_PERFORM);
// 这里 curl_multi_select 一直返回 -1所以这里就死循环了CPU就100%了
if (curl_multi_select($queue, 0.5) != -1)
{
do {
$mrc = curl_multi_exec($queue, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
}
}
$responses = array();
foreach ($map as $url=>$ch) {
//$responses[$url] = callback(curl_multi_getcontent($ch), $delay);
$responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
curl_multi_remove_handle($queue, $ch);
curl_close($ch);
}
curl_multi_close($queue);
return $responses;
}
function rolling_curl($urls, $delay)
{
$queue = curl_multi_init();
$map = array();
foreach ($urls as $url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_NOSIGNAL, true);
$cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
curl_setopt($ch, CURLOPT_COOKIE, $cookie);
$useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
curl_setopt( $ch, CURLOPT_USERAGENT, $useragent );
curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
curl_multi_add_handle($queue, $ch);
$map[(string) $ch] = $url;
}
$responses = array();
do {
while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ;
if ($code != CURLM_OK) { break; }
// a request was just completed -- find out which one
while ($done = curl_multi_info_read($queue)) {
// get the info and content returned on the request
$info = curl_getinfo($done['handle']);
$error = curl_error($done['handle']);
$results = callback(curl_multi_getcontent($done['handle']), $delay, $map[(string) $done['handle']]);
$responses[$map[(string) $done['handle']]] = compact('info', 'error', 'results');
// remove the curl handle that just completed
curl_multi_remove_handle($queue, $done['handle']);
curl_close($done['handle']);
}
// Block for data in / output; error handling is done by curl_multi_exec
if ($active > 0) {
curl_multi_select($queue, 0.5);
}
} while ($active);
curl_multi_close($queue);
return $responses;
}
function callback($data, $delay, $url) {
//echo $data;
//echo date("Y-m-d H:i:s", time()) . " --- " . $url . "\n";
if (!empty($data))
{
file_put_contents("./html2/".md5($url).".html", $data);
}
// usleep模拟现实中比较负责的数据处理逻辑(如提取, 分词, 写入文件或数据库等)
//usleep(1);
//return compact('data', 'matches');
}