IYUUAutoReseed初始化版本库v0.2.0

This commit is contained in:
iyuu.cn
2020-01-14 19:17:09 +08:00
commit 822e4a2270
78 changed files with 26926 additions and 0 deletions

View File

@ -0,0 +1,425 @@
<?php
/**
* Worker多进程操作类
*
* Licensed under The MIT License
* For full copyright and license information, please see the MIT-LICENSE.txt
* Redistributions of files must retain the above copyright notice.
*
* @author seatle<seatle@foxmail.com>
* @copyright seatle<seatle@foxmail.com>
* @link http://www.epooll.com/
* @license http://www.opensource.org/licenses/mit-license.php MIT License
*/
class cls_curl
{
protected static $timeout = 10;
protected static $ch = null;
protected static $useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
protected static $http_raw = false;
protected static $cookie = null;
protected static $cookie_jar = null;
protected static $cookie_file = null;
protected static $referer = null;
protected static $ip = null;
protected static $proxy = null;
protected static $headers = array();
protected static $hosts = array();
protected static $gzip = false;
protected static $info = array();
/**
* set timeout
*
* @param init $timeout
* @return
*/
public static function set_timeout($timeout)
{
self::$timeout = $timeout;
}
/**
* 设置代理
*
* @param mixed $proxy
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-18 10:17
*/
public static function set_proxy($proxy)
{
self::$proxy = $proxy;
}
/**
* set referer
*
*/
public static function set_referer($referer)
{
self::$referer = $referer;
}
/**
* 设置 user_agent
*
* @param string $useragent
* @return void
*/
public static function set_useragent($useragent)
{
self::$useragent = $useragent;
}
/**
* 设置COOKIE
*
* @param string $cookie
* @return void
*/
public static function set_cookie($cookie)
{
self::$cookie = $cookie;
}
/**
* 设置COOKIE JAR
*
* @param string $cookie_jar
* @return void
*/
public static function set_cookie_jar($cookie_jar)
{
self::$cookie_jar = $cookie_jar;
}
/**
* 设置COOKIE FILE
*
* @param string $cookie_file
* @return void
*/
public static function set_cookie_file($cookie_file)
{
self::$cookie_file = $cookie_file;
}
/**
* 获取内容的时候是不是连header也一起获取
*
* @param mixed $http_raw
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-18 10:17
*/
public static function set_http_raw($http_raw)
{
self::$http_raw = $http_raw;
}
/**
* 设置IP
*
* @param string $ip
* @return void
*/
public static function set_ip($ip)
{
self::$ip = $ip;
}
/**
* 设置Headers
*
* @param string $headers
* @return void
*/
public static function set_headers($headers)
{
self::$headers = $headers;
}
/**
* 设置Hosts
*
* @param string $hosts
* @return void
*/
public static function set_hosts($hosts)
{
self::$hosts = $hosts;
}
/**
* 设置Gzip
*
* @param string $hosts
* @return void
*/
public static function set_gzip($gzip)
{
self::$gzip = $gzip;
}
/**
* 初始化 CURL
*
*/
public static function init()
{
//if (empty ( self::$ch ))
if (!is_resource ( self::$ch ))
{
self::$ch = curl_init ();
curl_setopt( self::$ch, CURLOPT_RETURNTRANSFER, true );
curl_setopt( self::$ch, CURLOPT_CONNECTTIMEOUT, self::$timeout );
curl_setopt( self::$ch, CURLOPT_HEADER, false );
curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
curl_setopt( self::$ch, CURLOPT_TIMEOUT, self::$timeout + 5);
// 在多线程处理场景下使用超时选项时会忽略signals对应的处理函数但是无耐的是还有小概率的crash情况发生
curl_setopt( self::$ch, CURLOPT_NOSIGNAL, true);
}
return self::$ch;
}
/**
* get
*
*
*/
public static function get($url, $fields = array())
{
self::init ();
return self::http_request($url, 'get', $fields);
}
/**
* $fields 有三种类型:1、数组2、http query3、json
* 1、array('name'=>'yangzetao') 2、http_build_query(array('name'=>'yangzetao')) 3、json_encode(array('name'=>'yangzetao'))
* 前两种是普通的post可以用$_POST方式获取
* 第三种是post stream( json rpc其实就是webservice )虽然是post方式但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取
*
* @param mixed $url
* @param array $fields
* @param mixed $proxy
* @static
* @access public
* @return void
*/
public static function post($url, $fields = array())
{
self::init ();
return self::http_request($url, 'post', $fields);
}
public static function http_request($url, $type = 'get', $fields)
{
// 如果是 get 方式,直接拼凑一个 url 出来
if (strtolower($type) == 'get' && !empty($fields))
{
$url = $url . (strpos($url,"?")===false ? "?" : "&") . http_build_query($fields);
}
// 随机绑定 hosts做负载均衡
if (self::$hosts)
{
$parse_url = parse_url($url);
$host = $parse_url['host'];
$key = rand(0, count(self::$hosts)-1);
$ip = self::$hosts[$key];
$url = str_replace($host, $ip, $url);
self::$headers = array_merge( array('Host:'.$host), self::$headers );
}
curl_setopt( self::$ch, CURLOPT_URL, $url );
// 如果是 post 方式
if (strtolower($type) == 'post')
{
curl_setopt( self::$ch, CURLOPT_POST, true );
curl_setopt( self::$ch, CURLOPT_POSTFIELDS, $fields );
}
if (self::$useragent)
{
curl_setopt( self::$ch, CURLOPT_USERAGENT, self::$useragent );
}
if (self::$cookie)
{
curl_setopt( self::$ch, CURLOPT_COOKIE, self::$cookie );
}
if (self::$cookie_jar)
{
curl_setopt( self::$ch, CURLOPT_COOKIEJAR, self::$cookie_jar );
}
if (self::$cookie_file)
{
curl_setopt( self::$ch, CURLOPT_COOKIEFILE, self::$cookie_file );
}
if (self::$referer)
{
curl_setopt( self::$ch, CURLOPT_REFERER, self::$referer );
}
if (self::$ip)
{
self::$headers = array_merge( array('CLIENT-IP:'.self::$ip, 'X-FORWARDED-FOR:'.self::$ip), self::$headers );
}
if (self::$headers)
{
curl_setopt( self::$ch, CURLOPT_HTTPHEADER, self::$headers );
}
if (self::$gzip)
{
curl_setopt( self::$ch, CURLOPT_ENCODING, 'gzip' );
}
if (self::$proxy)
{
curl_setopt( self::$ch, CURLOPT_PROXY, self::$proxy );
}
if (self::$http_raw)
{
curl_setopt( self::$ch, CURLOPT_HEADER, true );
}
$data = curl_exec ( self::$ch );
self::$info = curl_getinfo(self::$ch);
if ($data === false)
{
//echo date("Y-m-d H:i:s"), ' Curl error: ' . curl_error( self::$ch ), "\n";
}
// 关闭句柄
curl_close( self::$ch );
//$data = substr($data, 10);
//$data = gzinflate($data);
return $data;
}
public static function get_info()
{
return self::$info;
}
public static function get_http_code()
{
return self::$info['http_code'];
}
}
function classic_curl($urls, $delay)
{
$queue = curl_multi_init();
$map = array();
foreach ($urls as $url)
{
// create cURL resources
$ch = curl_init();
// 设置 URL 和 其他参数
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_NOSIGNAL, true);
// 把当前 curl resources 加入到 curl_multi_init 队列
curl_multi_add_handle($queue, $ch);
$map[$url] = $ch;
}
$active = null;
// execute the handles
do {
$mrc = curl_multi_exec($queue, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
while ($active > 0 && $mrc == CURLM_OK) {
while (curl_multi_exec($queue, $active) === CURLM_CALL_MULTI_PERFORM);
// 这里 curl_multi_select 一直返回 -1所以这里就死循环了CPU就100%了
if (curl_multi_select($queue, 0.5) != -1)
{
do {
$mrc = curl_multi_exec($queue, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);
}
}
$responses = array();
foreach ($map as $url=>$ch) {
//$responses[$url] = callback(curl_multi_getcontent($ch), $delay);
$responses[$url] = callback(curl_multi_getcontent($ch), $delay, $url);
curl_multi_remove_handle($queue, $ch);
curl_close($ch);
}
curl_multi_close($queue);
return $responses;
}
function rolling_curl($urls, $delay)
{
$queue = curl_multi_init();
$map = array();
foreach ($urls as $url) {
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_NOSIGNAL, true);
$cookie = '_za=36643642-e546-4d60-a771-8af8dcfbd001; q_c1=a57a2b9f10964f909b8d8969febf3ab2|1437705596000|1437705596000; _xsrf=f0304fba4e44e1d008ec308d59bab029; cap_id="YWY1YmRmODlmZGVmNDc3MWJlZGFkZDg3M2E0M2Q5YjM=|1437705596|963518c454bb6f10d96775021c098c84e1e46f5a"; z_c0="QUFCQVgtRWZBQUFYQUFBQVlRSlZUVjR6NEZVUTgtRkdjTVc5UDMwZXRJZFdWZ2JaOWctNVhnPT0=|1438164574|aed6ef3707f246a7b64da4f1e8c089395d77ff2b"; __utma=51854390.1105113342.1437990174.1438160686.1438164116.10; __utmc=51854390; __utmz=51854390.1438134939.8.5.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/yangzetao; __utmv=51854390.100-1|2=registration_date=20131030=1^3=entry_date=20131030=1';
curl_setopt($ch, CURLOPT_COOKIE, $cookie);
$useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36';
curl_setopt( $ch, CURLOPT_USERAGENT, $useragent );
curl_setopt($ch, CURLOPT_ENCODING, 'gzip');
curl_multi_add_handle($queue, $ch);
$map[(string) $ch] = $url;
}
$responses = array();
do {
while (($code = curl_multi_exec($queue, $active)) == CURLM_CALL_MULTI_PERFORM) ;
if ($code != CURLM_OK) { break; }
// a request was just completed -- find out which one
while ($done = curl_multi_info_read($queue)) {
// get the info and content returned on the request
$info = curl_getinfo($done['handle']);
$error = curl_error($done['handle']);
$results = callback(curl_multi_getcontent($done['handle']), $delay, $map[(string) $done['handle']]);
$responses[$map[(string) $done['handle']]] = compact('info', 'error', 'results');
// remove the curl handle that just completed
curl_multi_remove_handle($queue, $done['handle']);
curl_close($done['handle']);
}
// Block for data in / output; error handling is done by curl_multi_exec
if ($active > 0) {
curl_multi_select($queue, 0.5);
}
} while ($active);
curl_multi_close($queue);
return $responses;
}
function callback($data, $delay, $url) {
//echo $data;
//echo date("Y-m-d H:i:s", time()) . " --- " . $url . "\n";
if (!empty($data))
{
file_put_contents("./html2/".md5($url).".html", $data);
}
// usleep模拟现实中比较负责的数据处理逻辑(如提取, 分词, 写入文件或数据库等)
//usleep(1);
//return compact('data', 'matches');
}

View File

@ -0,0 +1,248 @@
<?php
class cls_query
{
private static $content;
public static $debug = false;
public static function init($content)
{
self::$content = $content;
}
public static function query($query, $attr = "html")
{
$nodes = self::get_nodes($query);
$datas = self::get_datas($nodes, $attr);
return $datas;
}
protected static function is_char($char) {
return extension_loaded('mbstring') ? mb_eregi('\w', $char) : preg_match('@\w@', $char);
}
/**
* 从xpath中得到节点
*
* @param mixed $xpath
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2015-08-08 15:52
*/
private static function get_nodes($query)
{
// 把一到多个空格 替换成 一个空格
// 把 > 和 ~ 符号两边的空格去掉,因为没有用这两个符号,所以这里可以不这么做
// ul>li.className
$query = trim(
preg_replace('@\s+@', ' ',
preg_replace('@\s*(>|\\+|~)\s*@', '\\1', $query)
)
);
$nodes = array();
if (! $query)
{
return $nodes;
}
$query_arr = explode(" ", $query);
foreach ($query_arr as $k=>$v)
{
$path = $k == 0 ? $v : $path.' '.$v;
$node = array("path"=>(string)$path, "name"=>"", "id"=>"", "class"=>"", "other"=>array());
// 如果存在内容选择器
if (preg_match('@(.*?)\[(.*?)=[\'|"](.*?)[\'|"]\]@', $v, $matches) && !empty($matches[2]) && !empty($matches[3]))
{
// 把选择器过滤掉 [rel='topic']
$v = $matches[1];
$node['other'] = array(
'key'=>$matches[2],
'val'=>$matches[3],
);
}
// 如果存在 id
$id_arr = explode("#", $v);
$class_arr = explode(".", $v);
if (count($id_arr) === 2)
{
$node['name'] = $id_arr[0];
$node['id'] = $id_arr[1];
}
// 如果存在 class
elseif (count($class_arr) === 2)
{
$node['name'] = $class_arr[0];
$node['class'] = $class_arr[1];
}
// 如果没有样式
else
{
$node['name'] = $v;
}
$nodes[] = $node;
}
//print_r($nodes);
//exit;
return $nodes;
}
public static function get_datas($nodes, $attr = "html")
{
if (empty(self::$content))
{
return false;
}
$node_datas = array();
$count = count($nodes);
// 循环所有节点
foreach ($nodes as $i=>$node)
{
$is_last = $count == $i+1 ? true : false;
// 第一次
if ($i == 0)
{
$datas = array();
$datas = self::get_node_datas($node, self::$content, $attr, $is_last);
// 如果第一次都取不到数据,直接跳出循环
if(!$datas)
{
break;
}
$node_datas[$nodes[$i]['path']] = $datas;
}
else
{
$datas = array();
// 循环上一个节点的数组
foreach ($node_datas[$nodes[$i-1]['path']] as $v)
{
$datas = array_merge( $datas, self::get_node_datas($node, trim($v), $attr, $is_last) );
}
$node_datas[$nodes[$i]['path']] = $datas;
// 删除上一个节点,防止内存溢出,或者缓存到本地,再次使用?!
unset($node_datas[$nodes[$i-1]['path']]);
}
}
//print_r($datas);exit;
// 从数组中弹出最后一个元素
$node_datas = array_pop($node_datas);
//print_r($node_datas);
//exit;
return $node_datas;
}
/**
* 从节点中获取内容
* $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
*
* @param mixed $node
* @param mixed $content
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2015-08-08 15:52
*/
private static function get_node_datas($node, $content, $attr = "html", $is_last = false)
{
$node_datas = $datas = array();
if (!empty($node['id']))
{
if ($node['name'])
$regex = '@<'.$node['name'].'[^>]+id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)</'.$node['name'].'>@is';
else
$regex = '@id\\s*=\\s*["|\']+?'.$node['id'].'\\s*[^>]+?>(.*?)<@is';
}
elseif (!empty($node['class']))
{
if ($node['name'])
$regex = '@<'.$node['name'].'[^>]+class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)</'.$node['name'].'>@is';
else
$regex = '@class\\s*=\\s*["|\']+?'.$node['class'].'\\s*[^>]+?>(.*?)<@is';
}
else
{
// 这里为是么是*0次到多次因为有可能是 <li>
$regex = '@<'.$node['name'].'[^>]*?>(.*?)</'.$node['name'].'>@is';
}
self::log("regex --- " . $regex);;
preg_match_all($regex, $content, $matches);
$all_datas = empty($matches[0]) ? array() : $matches[0];
$html_datas = empty($matches[1]) ? array() : $matches[1];
// 过滤掉选择器对不上的
foreach ($all_datas as $i=>$data)
{
// 如果有设置其他选择器,验证一下选择器
if (!empty($node['other']))
{
$regex = '@'.$node['other']['key'].'=[\'|"]'.$node['other']['val'].'[\'|"]@is';
self::log("regex other --- " . $regex);
// 过滤器对不上的,跳过
if (!preg_match($regex, $data, $matches))
{
continue;
}
}
// 获取节点的html内容
if ($attr != "html" && $is_last)
{
$regex = '@'.$attr.'=[\'|"](.*?)[\'|"]@is';
preg_match($regex, $data, $matches);
$node_datas[] = empty($matches[1]) ? '' : trim($matches[1]);
}
// 获取节点属性名的值
else
{
$node_datas[] = trim($html_datas[$i]);
}
}
//echo " 11111 ========================================= \n";
//print_r($node_datas);
//echo " 22222 ========================================= \n\n\n";
return $node_datas;
}
/**
* 记录日志
* @param string $msg
* @return void
*/
private static function log($msg)
{
$msg = "[".date("Y-m-d H:i:s")."] " . $msg . "\n";
if (self::$debug)
{
echo $msg;
}
}
}
//$xpath = "ul.top-nav-dropdown li";
//$xpath = "i.zg-icon";
//print_r($nodes);
//exit;
// [^>]+ 不是>的字符重复一次到多次, ? 表示不贪婪
// \s 表示空白字符
// * 表示0次或者多次
// + 表示1次或者多次
//
// 后向引用,表示表达式中,从左往右数,第一个左括号对应的括号内的内容。
// \\0 表示整个表达式
// \\1表示第1个表达式
// \\2表示第2个表达式
// $regex = '@<meta[^>]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i';
//preg_match_all($regex, $content, $matches);
//print_r($matches);
//exit;
// 用法
//$content = file_get_contents("./test.html");
//$query = "ul#top-nav-profile-dropdown li a";
//$query = "div#zh-profile-following-topic a.link[href='/topic/19550937']";
//cls_query::init($content);
//$list = cls_query::query($query, "href");
//print_r($list);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,121 @@
<?php
/**
* redis 客户端
* redis的协议可参考这个文章http://redis.cn/topics/protocol.html
*
* @version 2.7.0
* @copyright 1997-2018 The PHP Group
* @author seatle <seatle@foxmail.com>
* @created time :2018-01-03
*/
class cls_redis_client
{
private $redis_socket = false;
//private $command = '';
public function __construct($host='127.0.0.1', $port=6379, $timeout = 3)
{
$this->redis_socket = stream_socket_client("tcp://".$host.":".$port, $errno, $errstr, $timeout);
if ( !$this->redis_socket )
{
throw new Exception("{$errno} - {$errstr}");
}
}
public function __destruct()
{
fclose($this->redis_socket);
}
public function __call($name, $args)
{
$crlf = "\r\n";
array_unshift($args, $name);
$command = '*' . count($args) . $crlf;
foreach ($args as $arg)
{
$command .= '$' . strlen($arg) . $crlf . $arg . $crlf;
}
//echo $command."\n";
$fwrite = fwrite($this->redis_socket, $command);
if ($fwrite === FALSE || $fwrite <= 0)
{
throw new Exception('Failed to write entire command to stream');
}
return $this->read_response();
}
private function read_response()
{
$reply = trim(fgets($this->redis_socket, 1024));
switch (substr($reply, 0, 1))
{
case '-':
throw new Exception(trim(substr($reply, 1)));
break;
case '+':
$response = substr(trim($reply), 1);
if ($response === 'OK')
{
$response = TRUE;
}
break;
case '$':
$response = NULL;
if ($reply == '$-1')
{
break;
}
$read = 0;
$size = intval(substr($reply, 1));
if ($size > 0)
{
do
{
$block_size = ($size - $read) > 1024 ? 1024 : ($size - $read);
$r = fread($this->redis_socket, $block_size);
if ($r === FALSE)
{
throw new Exception('Failed to read response from stream');
}
else
{
$read += strlen($r);
$response .= $r;
}
}
while ($read < $size);
}
fread($this->redis_socket, 2); /* discard crlf */
break;
/* Multi-bulk reply */
case '*':
$count = intval(substr($reply, 1));
if ($count == '-1')
{
return NULL;
}
$response = array();
for ($i = 0; $i < $count; $i++)
{
$response[] = $this->read_response();
}
break;
/* Integer reply */
case ':':
$response = intval(substr(trim($reply), 1));
break;
default:
throw new RedisException("Unknown response: {$reply}");
break;
}
return $response;
}
}
//$redis = new cls_redis_client();
//var_dump($redis->auth("foobared"));
//var_dump($redis->set("name",'abc'));
//var_dump($redis->get("name"));

View File

@ -0,0 +1,179 @@
<?php
ini_set("memory_limit", "128M");
/**
* redis 服务端
* 多进程阻塞式
* redis-benchmark -h 127.0.0.1 -p 11211 -t set -n 80000 -q
*
* @version 2.7.0
* @copyright 1997-2018 The PHP Group
* @author seatle <seatle@foxmail.com>
* @created time :2018-01-03
*/
class cls_redis_server
{
private $socket = false;
private $process_num = 3;
public $redis_kv_data = array();
public $onMessage = null;
public function __construct($host="0.0.0.0", $port=6379)
{
$this->socket = stream_socket_server("tcp://".$host.":".$port,$errno, $errstr);
if (!$this->socket) die($errstr."--".$errno);
echo "listen $host $port \r\n";
}
private function parse_resp(&$conn)
{
// 读取一行,遇到 \r\n 为一行
$line = fgets($conn);
if($line === '' || $line === false)
{
return null;
}
// 获取第一个字符作为类型
$type = $line[0];
// 去掉第一个字符,去掉结尾的 \r\n
$line = mb_substr($line, 1, -2);
switch ( $type )
{
case "*":
// 得到长度
$count = (int) $line;
$data = array();
for ($i = 1; $i <= $count; $i++)
{
$data[] = $this->parse_resp($conn);
}
return $data;
case "$":
if ($line == '-1')
{
return null;
}
// 截取的长度要加上 \r\n 两个字符
$length = $line + 2;
$data = '';
while ($length > 0)
{
$block = fread($conn, $length);
if ($length !== strlen($block))
{
throw new Exception('RECEIVING');
}
$data .= $block;
$length -= mb_strlen($block);
}
return mb_substr($data, 0, -2);
}
return $line;
}
private function start_worker_process()
{
$pid = pcntl_fork();
switch ($pid)
{
case -1:
echo "fork error : {$i} \r\n";
exit;
case 0:
while ( true )
{
echo "PID ".posix_getpid()." waiting...\n";
// 堵塞等待
$conn = stream_socket_accept($this->socket, -1);
if ( !$conn )
{
continue;
}
//"*3\r\n$3\r\nSET\r\n$5\r\nmykey\r\n$7\r\nmyvalue\r\n"
while( true )
{
$arr = $this->parse_resp($conn);
if ( is_array($arr) )
{
if ($this->onMessage)
{
call_user_func($this->onMessage, $conn, $arr);
}
}
else if ( $arr )
{
if ($this->onMessage)
{
call_user_func($this->onMessage, $conn, $arr);
}
}
else
{
fclose($conn);
break;
}
}
}
default:
$this->pids[$pid] = $pid;
break;
}
}
public function run()
{
for($i = 1; $i <= $this->process_num; $i++)
{
$this->start_worker_process();
}
while( true )
{
foreach ($this->pids as $i => $pid)
{
if($pid)
{
$res = pcntl_waitpid($pid, $status,WNOHANG);
if ( $res == -1 || $res > 0 )
{
$this->start_worker_process();
unset($this->pids[$pid]);
}
}
}
sleep(1);
}
}
}
$server = new cls_redis_server();
$server->onMessage = function($conn, $info) use($server)
{
if ( is_array($info) )
{
$command = strtoupper($info[0]);
if ( $command == "SET" )
{
$key = $info[1];
$val = $info[2];
$server->redis_kv_data[$key] = $val;
fwrite($conn, "+OK\r\n");
}
else if ( $command == "GET" )
{
$key = $info[1];
$val = isset($server->redis_kv_data[$key]) ? $server->redis_kv_data[$key] : '';
fwrite($conn, "$".strlen($val)."\r\n".$val."\r\n");
}
else
{
fwrite($conn,"+OK\r\n");
}
}
else
{
fwrite($conn,"+OK\r\n");
}
};
$server->run();

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,466 @@
<?php
/**
* Curl操作类
*
* Licensed under The MIT License
* For full copyright and license information, please see the MIT-LICENSE.txt
* Redistributions of files must retain the above copyright notice.
*
* @author seatle<seatle@foxmail.com>
* @copyright seatle<seatle@foxmail.com>
* @link http://www.epooll.com/
* @license http://www.opensource.org/licenses/mit-license.php MIT License
*/
class rolling_curl
{
/**
* @var float
*
* 同时运行任务数
* 例如有8个请求则会被分成两批第一批5个请求第二批3个请求
* 注意采集知乎的时候5个是比较稳定的7个以上就开始会超时了多进程就没有这样的问题因为多进程很少几率会发生并发
*/
public $window_size = 5;
/**
* @var float
*
* Timeout is the timeout used for curl_multi_select.
*/
private $timeout = 10;
/**
* @var string|array
*
* 应用在每个请求的回调函数
*/
public $callback;
/**
* @var array
*
* 设置默认的请求参数
*/
protected $options = array(
CURLOPT_SSL_VERIFYPEER => 0,
CURLOPT_RETURNTRANSFER => 1,
// 注意TIMEOUT = CONNECTTIMEOUT + 数据获取时间,所以 TIMEOUT 一定要大于 CONNECTTIMEOUT否则 CONNECTTIMEOUT 设置了就没意义
// "Connection timed out after 30001 milliseconds"
CURLOPT_CONNECTTIMEOUT => 30,
CURLOPT_TIMEOUT => 60,
CURLOPT_RETURNTRANSFER => 1,
CURLOPT_HEADER => 0,
// 在多线程处理场景下使用超时选项时会忽略signals对应的处理函数但是无耐的是还有小概率的crash情况发生
CURLOPT_NOSIGNAL => 1,
CURLOPT_USERAGENT => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.89 Safari/537.36",
);
/**
* @var array
*/
private $headers = array();
/**
* @var Request[]
*
* 请求队列
*/
private $requests = array();
/**
* @var RequestMap[]
*
* Maps handles to request indexes
*/
private $requestMap = array();
public function __construct()
{
}
/**
* set timeout
*
* @param init $timeout
* @return
*/
public function set_timeout($timeout)
{
$this->options[CURLOPT_TIMEOUT] = $timeout;
}
/**
* set proxy
*
*/
public function set_proxy($proxy)
{
$this->options[CURLOPT_PROXY] = $proxy;
}
/**
* set referer
*
*/
public function set_referer($referer)
{
$this->options[CURLOPT_REFERER] = $referer;
}
/**
* 设置 user_agent
*
* @param string $useragent
* @return void
*/
public function set_useragent($useragent)
{
$this->options[CURLOPT_USERAGENT] = $useragent;
}
/**
* 设置COOKIE
*
* @param string $cookie
* @return void
*/
public function set_cookie($cookie)
{
$this->options[CURLOPT_COOKIE] = $cookie;
}
/**
* 设置COOKIE JAR
*
* @param string $cookie_jar
* @return void
*/
public function set_cookiejar($cookiejar)
{
$this->options[CURLOPT_COOKIEJAR] = $cookiejar;
}
/**
* 设置COOKIE FILE
*
* @param string $cookie_file
* @return void
*/
public function set_cookiefile($cookiefile)
{
$this->options[CURLOPT_COOKIEFILE] = $cookiefile;
}
/**
* 获取内容的时候是不是连header也一起获取
*
* @param mixed $http_raw
* @return void
* @author seatle <seatle@foxmail.com>
* @created time :2016-09-18 10:17
*/
public function set_http_raw($http_raw = false)
{
$this->options[CURLOPT_HEADER] = $http_raw;
}
/**
* 设置IP
*
* @param string $ip
* @return void
*/
public function set_ip($ip)
{
$headers = array(
'CLIENT-IP'=>$ip,
'X-FORWARDED-FOR'=>$ip,
);
$this->headers = $this->headers + $headers;
}
/**
* 设置Headers
*
* @param string $headers
* @return void
*/
public function set_headers($headers)
{
$this->headers = $this->headers + $headers;
}
/**
* 设置Hosts
*
* @param string $hosts
* @return void
*/
public function set_hosts($hosts)
{
$headers = array(
'Host'=>$hosts,
);
$this->headers = $this->headers + $headers;
}
/**
* 设置Gzip
*
* @param string $hosts
* @return void
*/
public function set_gzip($gzip)
{
if ($gzip)
{
$this->options[CURLOPT_ENCODING] = 'gzip';
}
}
public function request($url, $method = "GET", $fields = array(), $headers = array(), $options = array())
{
$this->requests[] = array('url'=>$url,'method'=>$method,'fields'=>$fields,'headers'=>$headers,'options'=>$options);
return true;
}
public function get_options($request)
{
$options = $this->options;
$headers = $this->headers;
if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode'))
{
$options[CURLOPT_FOLLOWLOCATION] = 1;
$options[CURLOPT_MAXREDIRS] = 5;
}
// 如果是 get 方式,直接拼凑一个 url 出来
if (strtolower($request['method']) == 'get' && !empty($request['fields']))
{
$url = $request['url'] . "?" . http_build_query($request['fields']);
}
// 如果是 post 方式
if (strtolower($request['method']) == 'post')
{
$options[CURLOPT_POST] = 1;
$options[CURLOPT_POSTFIELDS] = $request['fields'];
}
// append custom options for this specific request
if ($request['options'])
{
$options = $request['options'] + $options;
}
if ($request['headers'])
{
$headers = $request['headers'] + $headers;
}
// 随机绑定 hosts做负载均衡
//if (self::$hosts)
//{
//$parse_url = parse_url($url);
//$host = $parse_url['host'];
//$key = rand(0, count(self::$hosts)-1);
//$ip = self::$hosts[$key];
//$url = str_replace($host, $ip, $url);
//self::$headers = array_merge( array('Host:'.$host), self::$headers );
//}
// header 要这样拼凑
$headers_tmp = array();
foreach ($headers as $k=>$v)
{
$headers_tmp[] = $k.":".$v;
}
$headers = $headers_tmp;
$options[CURLOPT_URL] = $request['url'];
$options[CURLOPT_HTTPHEADER] = $headers;
return $options;
}
/**
* GET 请求
*
* @param string $url
* @param array $headers
* @param array $options
* @return bool
*/
public function get($url, $fields = array(), $headers = array(), $options = array())
{
return $this->request($url, 'get', $fields, $headers, $options);
}
/**
* $fields 有三种类型:1、数组2、http query3、json
* 1、array('name'=>'yangzetao') 2、http_build_query(array('name'=>'yangzetao')) 3、json_encode(array('name'=>'yangzetao'))
* 前两种是普通的post可以用$_POST方式获取
* 第三种是post stream( json rpc其实就是webservice )虽然是post方式但是只能用流方式 http://input 后者 $HTTP_RAW_POST_DATA 获取
*
* @param string $url
* @param array $fields
* @param array $headers
* @param array $options
* @return void
*/
public function post($url, $fields = array(), $headers = array(), $options = array())
{
return $this->request($url, 'post', $fields, $headers, $options);
}
/**
* Execute processing
*
* @param int $window_size Max number of simultaneous connections
* @return string|bool
*/
public function execute($window_size = null)
{
$count = sizeof($this->requests);
if ($count == 0)
{
return false;
}
// 只有一个请求
elseif ($count == 1)
{
return $this->single_curl();
}
else
{
// 开始 rolling curlwindow_size 是最大同时连接数
return $this->rolling_curl($window_size);
}
}
private function single_curl()
{
$ch = curl_init();
// 从请求队列里面弹出一个来
$request = array_shift($this->requests);
$options = $this->get_options($request);
curl_setopt_array($ch, $options);
$output = curl_exec($ch);
$info = curl_getinfo($ch);
$error = null;
if ($output === false)
{
$error = curl_error( $ch );
}
//$output = substr($output, 10);
//$output = gzinflate($output);
// 其实一个请求的时候没是么必要回调,直接返回数据就好了,不过这里算是多一个功能吧,和多请求保持一样的操作
if ($this->callback)
{
if (is_callable($this->callback))
{
call_user_func($this->callback, $output, $info, $request, $error);
}
}
else
{
return $output;
}
return true;
}
private function rolling_curl($window_size = null)
{
// 如何设置了最大任务数
if ($window_size)
$this->window_size = $window_size;
// 如果请求数 小于 任务数,设置任务数为请求数
if (sizeof($this->requests) < $this->window_size)
$this->window_size = sizeof($this->requests);
// 如果任务数小于2个不应该用这个方法的用上面的single_curl方法就好了
if ($this->window_size < 2)
exit("Window size must be greater than 1");
// 初始化任务队列
$master = curl_multi_init();
// 开始第一批请求
for ($i = 0; $i < $this->window_size; $i++)
{
$ch = curl_init();
$options = $this->get_options($this->requests[$i]);
curl_setopt_array($ch, $options);
curl_multi_add_handle($master, $ch);
// 添加到请求数组
$key = (string) $ch;
$this->requestMap[$key] = $i;
}
do {
while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;
// 如果
if ($execrun != CURLM_OK) { break; }
// 一旦有一个请求完成找出来因为curl底层是select所以最大受限于1024
while ($done = curl_multi_info_read($master))
{
// 从请求中获取信息、内容、错误
$info = curl_getinfo($done['handle']);
$output = curl_multi_getcontent($done['handle']);
$error = curl_error($done['handle']);
// 如果绑定了回调函数
$callback = $this->callback;
if (is_callable($callback))
{
$key = (string) $done['handle'];
$request = $this->requests[$this->requestMap[$key]];
unset($this->requestMap[$key]);
call_user_func($callback, $output, $info, $request, $error);
}
// 一个请求完了就加一个进来一直保证5个任务同时进行
if ($i < sizeof($this->requests) && isset($this->requests[$i]) && $i < count($this->requests))
{
$ch = curl_init();
$options = $this->get_options($this->requests[$i]);
curl_setopt_array($ch, $options);
curl_multi_add_handle($master, $ch);
// 添加到请求数组
$key = (string) $ch;
$this->requestMap[$key] = $i;
$i++;
}
// 把请求已经完成了得 curl handle 删除
curl_multi_remove_handle($master, $done['handle']);
}
// 当没有数据的时候进行堵塞,把 CPU 使用权交出来,避免上面 do 死循环空跑数据导致 CPU 100%
if ($running)
{
curl_multi_select($master, $this->timeout);
}
} while ($running);
// 关闭任务
curl_multi_close($master);
// 把请求清空,否则没有重新 new rolling_curl(); 直接再次导入一批url的时候就会把前面已经执行过的url又执行一轮
unset($this->requests);
return true;
}
/**
* @return void
*/
public function __destruct()
{
unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests);
}
}